diff --git a/config b/config index 247adc5..4947ee8 100644 --- a/config +++ b/config @@ -127,9 +127,7 @@ CONFIG_BPF_JIT=y CONFIG_BPF_JIT_ALWAYS_ON=y CONFIG_BPF_JIT_DEFAULT_ON=y CONFIG_BPF_UNPRIV_DEFAULT_OFF=y -CONFIG_USERMODE_DRIVER=y -CONFIG_BPF_PRELOAD=y -CONFIG_BPF_PRELOAD_UMD=m +# CONFIG_BPF_PRELOAD is not set CONFIG_BPF_LSM=y # end of BPF subsystem @@ -187,7 +185,7 @@ CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y # CONFIG_RCU_NOCB_CPU_CB_BOOST is not set # CONFIG_TASKS_TRACE_RCU_READ_MB is not set CONFIG_RCU_LAZY=y -# CONFIG_RCU_DOUBLE_CHECK_CB_TIME is not set +CONFIG_RCU_DOUBLE_CHECK_CB_TIME=y # end of RCU Subsystem CONFIG_IKCONFIG=y @@ -1116,7 +1114,7 @@ CONFIG_ZPOOL=y CONFIG_SWAP=y CONFIG_ZSWAP=y CONFIG_ZSWAP_DEFAULT_ON=y -CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON=y +# CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON is not set # CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set # CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set # CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 is not set @@ -2328,7 +2326,7 @@ CONFIG_PREVENT_FIRMWARE_BUILD=y # Firmware loader # CONFIG_FW_LOADER=y -# CONFIG_FW_LOADER_DEBUG is not set +CONFIG_FW_LOADER_DEBUG=y CONFIG_FW_LOADER_PAGED_BUF=y CONFIG_FW_LOADER_SYSFS=y CONFIG_EXTRA_FIRMWARE="" @@ -6199,6 +6197,7 @@ CONFIG_DVB_BUDGET_CORE=m CONFIG_DVB_BUDGET=m CONFIG_DVB_BUDGET_CI=m CONFIG_DVB_BUDGET_AV=m +CONFIG_IPU_BRIDGE=m CONFIG_VIDEO_IPU3_CIO2=m CONFIG_CIO2_BRIDGE=y CONFIG_RADIO_ADAPTERS=m @@ -6383,10 +6382,7 @@ CONFIG_MEDIA_ATTACH=y # IR I2C driver auto-selected by 'Autoselect ancillary drivers' # CONFIG_VIDEO_IR_I2C=m - -# -# Camera sensor devices -# +CONFIG_VIDEO_CAMERA_SENSOR=y CONFIG_VIDEO_APTINA_PLL=m CONFIG_VIDEO_CCS_PLL=m CONFIG_VIDEO_AR0521=m @@ -6449,7 +6445,6 @@ CONFIG_VIDEO_S5K5BAF=m CONFIG_VIDEO_S5K6A3=m CONFIG_VIDEO_CCS=m CONFIG_VIDEO_ET8EK8=m -# end of Camera sensor devices # # Lens drivers @@ -7036,6 +7031,8 @@ CONFIG_SND_DMAENGINE_PCM=m CONFIG_SND_HWDEP=m CONFIG_SND_SEQ_DEVICE=m CONFIG_SND_RAWMIDI=m +CONFIG_SND_UMP=m +# CONFIG_SND_UMP_LEGACY_RAWMIDI is not set CONFIG_SND_COMPRESS_OFFLOAD=m CONFIG_SND_JACK=y CONFIG_SND_JACK_INPUT_DEV=y @@ -7070,6 +7067,7 @@ CONFIG_SND_SEQ_MIDI=m CONFIG_SND_SEQ_MIDI_EMUL=m CONFIG_SND_SEQ_VIRMIDI=m CONFIG_SND_SEQ_UMP=y +CONFIG_SND_SEQ_UMP_CLIENT=m CONFIG_SND_MPU401_UART=m CONFIG_SND_OPL3_LIB=m CONFIG_SND_OPL3_LIB_SEQ=m @@ -7210,7 +7208,7 @@ CONFIG_SND_INTEL_SOUNDWIRE_ACPI=m CONFIG_SND_SPI=y CONFIG_SND_USB=y CONFIG_SND_USB_AUDIO=m -# CONFIG_SND_USB_AUDIO_MIDI_V2 is not set +CONFIG_SND_USB_AUDIO_MIDI_V2=y CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y CONFIG_SND_USB_UA101=m CONFIG_SND_USB_USX2Y=m @@ -10515,7 +10513,7 @@ CONFIG_PNFS_FILE_LAYOUT=m CONFIG_PNFS_BLOCK=m CONFIG_PNFS_FLEXFILE_LAYOUT=m CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" -CONFIG_NFS_V4_1_MIGRATION=y +# CONFIG_NFS_V4_1_MIGRATION is not set CONFIG_NFS_V4_SECURITY_LABEL=y CONFIG_NFS_FSCACHE=y # CONFIG_NFS_USE_LEGACY_DNS is not set @@ -11465,7 +11463,7 @@ CONFIG_FTRACE=y CONFIG_BOOTTIME_TRACING=y CONFIG_FUNCTION_TRACER=y CONFIG_FUNCTION_GRAPH_TRACER=y -# CONFIG_FUNCTION_GRAPH_RETVAL is not set +CONFIG_FUNCTION_GRAPH_RETVAL=y CONFIG_DYNAMIC_FTRACE=y CONFIG_DYNAMIC_FTRACE_WITH_REGS=y CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index 9565648..17502f6 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,6 +1,6 @@ -From de38719bf3e0937c83054c911c5cf102eae632dd Mon Sep 17 00:00:00 2001 +From 064c49c5d144094d580b510ca4ca110469b1203d Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: 
Mon, 28 Aug 2023 14:01:05 +0200 +Date: Wed, 13 Sep 2023 14:31:11 +0200 Subject: [PATCH 1/7] amd-hdr Signed-off-by: Peter Jung @@ -28,7 +28,7 @@ Signed-off-by: Peter Jung 20 files changed, 1446 insertions(+), 128 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h -index 32fe05c810c6..84bf501b02f4 100644 +index 32fe05c810c6f..84bf501b02f4c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h @@ -343,6 +343,77 @@ struct amdgpu_mode_info { @@ -110,10 +110,10 @@ index 32fe05c810c6..84bf501b02f4 100644 #define AMDGPU_MAX_BL_LEVEL 0xFF diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -index e5554a36e8c8..43ef0e5f97ae 100644 +index e0d556cf919f7..f8bc642d658cd 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -@@ -3943,6 +3943,11 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) +@@ -4015,6 +4015,11 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) return r; } @@ -125,7 +125,7 @@ index e5554a36e8c8..43ef0e5f97ae 100644 r = amdgpu_dm_audio_init(adev); if (r) { dc_release_state(state->context); -@@ -4992,7 +4997,9 @@ static int fill_dc_plane_attributes(struct amdgpu_device *adev, +@@ -5064,7 +5069,9 @@ static int fill_dc_plane_attributes(struct amdgpu_device *adev, * Always set input transfer function, since plane state is refreshed * every time. */ @@ -136,7 +136,7 @@ index e5554a36e8c8..43ef0e5f97ae 100644 if (ret) return ret; -@@ -8007,6 +8014,10 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, +@@ -8079,6 +8086,10 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, bundle->surface_updates[planes_count].gamma = dc_plane->gamma_correction; bundle->surface_updates[planes_count].in_transfer_func = dc_plane->in_transfer_func; bundle->surface_updates[planes_count].gamut_remap_matrix = &dc_plane->gamut_remap_matrix; @@ -147,7 +147,7 @@ index e5554a36e8c8..43ef0e5f97ae 100644 } amdgpu_dm_plane_fill_dc_scaling_info(dm->adev, new_plane_state, -@@ -8215,6 +8226,10 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, +@@ -8289,6 +8300,10 @@ static void amdgpu_dm_commit_planes(struct drm_atomic_state *state, &acrtc_state->stream->csc_color_matrix; bundle->stream_update.out_transfer_func = acrtc_state->stream->out_transfer_func; @@ -158,7 +158,7 @@ index e5554a36e8c8..43ef0e5f97ae 100644 } acrtc_state->stream->abm_level = acrtc_state->abm_level; -@@ -9405,6 +9420,7 @@ static int dm_update_crtc_state(struct amdgpu_display_manager *dm, +@@ -9479,6 +9494,7 @@ static int dm_update_crtc_state(struct amdgpu_display_manager *dm, * when a modeset is needed, to ensure it gets reprogrammed. 
*/ if (dm_new_crtc_state->base.color_mgmt_changed || @@ -166,7 +166,7 @@ index e5554a36e8c8..43ef0e5f97ae 100644 drm_atomic_crtc_needs_modeset(new_crtc_state)) { ret = amdgpu_dm_update_crtc_color_mgmt(dm_new_crtc_state); if (ret) -@@ -9472,6 +9488,10 @@ static bool should_reset_plane(struct drm_atomic_state *state, +@@ -9546,6 +9562,10 @@ static bool should_reset_plane(struct drm_atomic_state *state, */ for_each_oldnew_plane_in_state(state, other, old_other_state, new_other_state, i) { struct amdgpu_framebuffer *old_afb, *new_afb; @@ -177,7 +177,7 @@ index e5554a36e8c8..43ef0e5f97ae 100644 if (other->type == DRM_PLANE_TYPE_CURSOR) continue; -@@ -9508,6 +9528,18 @@ static bool should_reset_plane(struct drm_atomic_state *state, +@@ -9582,6 +9602,18 @@ static bool should_reset_plane(struct drm_atomic_state *state, old_other_state->color_encoding != new_other_state->color_encoding) return true; @@ -197,7 +197,7 @@ index e5554a36e8c8..43ef0e5f97ae 100644 if (!old_other_state->fb || !new_other_state->fb) continue; diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h -index 9fb5bb3a75a7..f92bbd7ed867 100644 +index 9fb5bb3a75a77..f92bbd7ed867b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -51,6 +51,8 @@ @@ -340,7 +340,7 @@ index 9fb5bb3a75a7..f92bbd7ed867 100644 void amdgpu_dm_update_connector_after_detect( diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c -index a4cb23d059bd..0a51af44efd5 100644 +index a4cb23d059bd6..0a51af44efd5f 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c @@ -72,6 +72,7 @@ @@ -1286,7 +1286,7 @@ index a4cb23d059bd..0a51af44efd5 100644 + dc_plane_state, color_caps); +} diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c -index 30d4c6fd95f5..e7b38cce010c 100644 +index 440fc0869a34b..d746f0aa0f11c 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c @@ -253,6 +253,7 @@ static struct drm_crtc_state *dm_crtc_duplicate_state(struct drm_crtc *crtc) @@ -1379,7 +1379,7 @@ index 30d4c6fd95f5..e7b38cce010c 100644 }; static void dm_crtc_helper_disable(struct drm_crtc *crtc) -@@ -482,6 +551,9 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, +@@ -470,6 +539,9 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES); @@ -1390,10 +1390,10 @@ index 30d4c6fd95f5..e7b38cce010c 100644 fail: diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c -index 322668973747..60e5ffb1863d 100644 +index 6c84ca2ae373a..ea03c22437293 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c -@@ -1317,8 +1317,14 @@ static void dm_drm_plane_reset(struct drm_plane *plane) +@@ -1324,8 +1324,14 @@ static void dm_drm_plane_reset(struct drm_plane *plane) amdgpu_state = kzalloc(sizeof(*amdgpu_state), GFP_KERNEL); WARN_ON(amdgpu_state == NULL); @@ -1410,7 +1410,7 @@ index 322668973747..60e5ffb1863d 100644 } static struct drm_plane_state * -@@ -1338,6 +1344,22 @@ dm_drm_plane_duplicate_state(struct drm_plane *plane) +@@ -1345,6 +1351,22 @@ 
dm_drm_plane_duplicate_state(struct drm_plane *plane) dc_plane_state_retain(dm_plane_state->dc_state); } @@ -1433,7 +1433,7 @@ index 322668973747..60e5ffb1863d 100644 return &dm_plane_state->base; } -@@ -1405,12 +1427,203 @@ static void dm_drm_plane_destroy_state(struct drm_plane *plane, +@@ -1412,12 +1434,203 @@ static void dm_drm_plane_destroy_state(struct drm_plane *plane, { struct dm_plane_state *dm_plane_state = to_dm_plane_state(state); @@ -1637,7 +1637,7 @@ index 322668973747..60e5ffb1863d 100644 static const struct drm_plane_funcs dm_plane_funcs = { .update_plane = drm_atomic_helper_update_plane, .disable_plane = drm_atomic_helper_disable_plane, -@@ -1419,6 +1632,10 @@ static const struct drm_plane_funcs dm_plane_funcs = { +@@ -1426,6 +1639,10 @@ static const struct drm_plane_funcs dm_plane_funcs = { .atomic_duplicate_state = dm_drm_plane_duplicate_state, .atomic_destroy_state = dm_drm_plane_destroy_state, .format_mod_supported = dm_plane_format_mod_supported, @@ -1648,7 +1648,7 @@ index 322668973747..60e5ffb1863d 100644 }; int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, -@@ -1489,6 +1706,9 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, +@@ -1496,6 +1713,9 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, drm_plane_helper_add(plane, &dm_plane_helper_funcs); @@ -1659,7 +1659,7 @@ index 322668973747..60e5ffb1863d 100644 if (plane->funcs->reset) plane->funcs->reset(plane); diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c -index 3538973bd0c6..04b2e04b68f3 100644 +index 3538973bd0c6c..04b2e04b68f33 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_cm_common.c @@ -349,20 +349,37 @@ bool cm_helper_translate_curve_to_hw_format(struct dc_context *ctx, @@ -1777,7 +1777,7 @@ index 3538973bd0c6..04b2e04b68f3 100644 } } diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c -index bf8864bc8a99..72558eb877dc 100644 +index 4cd4ae07d73dc..4fb4e9ec03f1e 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.c @@ -186,6 +186,43 @@ bool dcn30_set_input_transfer_func(struct dc *dc, @@ -1825,7 +1825,7 @@ index bf8864bc8a99..72558eb877dc 100644 struct pipe_ctx *pipe_ctx, const struct dc_stream_state *stream) diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h -index a24a8e33a3d2..cb34ca932a5f 100644 +index a24a8e33a3d28..cb34ca932a5ff 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_hwseq.h @@ -58,6 +58,9 @@ bool dcn30_set_blend_lut(struct pipe_ctx *pipe_ctx, @@ -1839,7 +1839,7 @@ index a24a8e33a3d2..cb34ca932a5f 100644 struct pipe_ctx *pipe_ctx, const struct dc_stream_state *stream); diff --git a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c -index 257df8660b4c..81fd50ee97c3 100644 +index 61205cdbe2d5a..fdbe3d42cd7b6 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c +++ b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_init.c @@ -33,7 +33,7 @@ @@ -1852,7 +1852,7 @@ index 257df8660b4c..81fd50ee97c3 100644 .power_down_on_boot = dcn10_power_down_on_boot, .apply_ctx_to_hw = dce110_apply_ctx_to_hw, diff --git a/drivers/gpu/drm/amd/display/include/fixed31_32.h 
b/drivers/gpu/drm/amd/display/include/fixed31_32.h -index d4cf7ead1d87..84da1dd34efd 100644 +index d4cf7ead1d877..84da1dd34efd1 100644 --- a/drivers/gpu/drm/amd/display/include/fixed31_32.h +++ b/drivers/gpu/drm/amd/display/include/fixed31_32.h @@ -69,6 +69,18 @@ static const struct fixed31_32 dc_fixpt_epsilon = { 1LL }; @@ -1875,7 +1875,7 @@ index d4cf7ead1d87..84da1dd34efd 100644 * @brief * Initialization routines diff --git a/drivers/gpu/drm/arm/malidp_crtc.c b/drivers/gpu/drm/arm/malidp_crtc.c -index dc01c43f6193..d72c22dcf685 100644 +index dc01c43f61930..d72c22dcf6855 100644 --- a/drivers/gpu/drm/arm/malidp_crtc.c +++ b/drivers/gpu/drm/arm/malidp_crtc.c @@ -221,7 +221,7 @@ static int malidp_crtc_atomic_check_ctm(struct drm_crtc *crtc, @@ -1888,7 +1888,7 @@ index dc01c43f6193..d72c22dcf685 100644 ctm = (struct drm_color_ctm *)state->ctm->data; for (i = 0; i < ARRAY_SIZE(ctm->matrix); ++i) { diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c -index c277b198fa3f..c3df45f90145 100644 +index c277b198fa3fa..c3df45f901456 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -733,6 +733,7 @@ static void drm_atomic_plane_print_state(struct drm_printer *p, @@ -1900,7 +1900,7 @@ index c277b198fa3f..c3df45f90145 100644 if (plane->funcs->atomic_print_state) plane->funcs->atomic_print_state(p, state); diff --git a/drivers/gpu/drm/drm_atomic_state_helper.c b/drivers/gpu/drm/drm_atomic_state_helper.c -index 784e63d70a42..25bb0859fda7 100644 +index 784e63d70a421..25bb0859fda74 100644 --- a/drivers/gpu/drm/drm_atomic_state_helper.c +++ b/drivers/gpu/drm/drm_atomic_state_helper.c @@ -338,6 +338,7 @@ void __drm_atomic_helper_plane_duplicate_state(struct drm_plane *plane, @@ -1912,7 +1912,7 @@ index 784e63d70a42..25bb0859fda7 100644 EXPORT_SYMBOL(__drm_atomic_helper_plane_duplicate_state); diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c -index d867e7f9f2cd..a6a9ee5086dd 100644 +index d867e7f9f2cd5..a6a9ee5086ddb 100644 --- a/drivers/gpu/drm/drm_atomic_uapi.c +++ b/drivers/gpu/drm/drm_atomic_uapi.c @@ -362,39 +362,6 @@ static s32 __user *get_out_fence_for_connector(struct drm_atomic_state *state, @@ -2001,7 +2001,7 @@ index d867e7f9f2cd..a6a9ee5086dd 100644 val, sizeof(struct hdr_output_metadata), -1, diff --git a/drivers/gpu/drm/drm_property.c b/drivers/gpu/drm/drm_property.c -index dfec479830e4..f72ef6493340 100644 +index dfec479830e49..f72ef6493340a 100644 --- a/drivers/gpu/drm/drm_property.c +++ b/drivers/gpu/drm/drm_property.c @@ -751,6 +751,55 @@ bool drm_property_replace_blob(struct drm_property_blob **blob, @@ -2061,7 +2061,7 @@ index dfec479830e4..f72ef6493340 100644 void *data, struct drm_file *file_priv) { diff --git a/include/drm/drm_mode_object.h b/include/drm/drm_mode_object.h -index 912f1e415685..08d7a7f0188f 100644 +index 912f1e4156853..08d7a7f0188fe 100644 --- a/include/drm/drm_mode_object.h +++ b/include/drm/drm_mode_object.h @@ -60,7 +60,7 @@ struct drm_mode_object { @@ -2074,7 +2074,7 @@ index 912f1e415685..08d7a7f0188f 100644 * struct drm_object_properties - property tracking for &drm_mode_object */ diff --git a/include/drm/drm_plane.h b/include/drm/drm_plane.h -index 51291983ea44..52c3287da0da 100644 +index 51291983ea445..52c3287da0daa 100644 --- a/include/drm/drm_plane.h +++ b/include/drm/drm_plane.h @@ -237,6 +237,13 @@ struct drm_plane_state { @@ -2092,7 +2092,7 @@ index 51291983ea44..52c3287da0da 100644 static inline struct drm_rect diff --git a/include/drm/drm_property.h 
b/include/drm/drm_property.h -index 65bc9710a470..082f29156b3e 100644 +index 65bc9710a4702..082f29156b3e3 100644 --- a/include/drm/drm_property.h +++ b/include/drm/drm_property.h @@ -279,6 +279,12 @@ struct drm_property_blob *drm_property_create_blob(struct drm_device *dev, @@ -2109,7 +2109,7 @@ index 65bc9710a470..082f29156b3e 100644 struct drm_property_blob **replace, size_t length, diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h -index 43691058d28f..23fc19400998 100644 +index 43691058d28fb..23fc194009980 100644 --- a/include/uapi/drm/drm_mode.h +++ b/include/uapi/drm/drm_mode.h @@ -843,6 +843,14 @@ struct drm_color_ctm { @@ -2130,28 +2130,28 @@ index 43691058d28f..23fc19400998 100644 -- 2.42.0 -From f43591177032844d0dec73debda8218267d6d2ef Mon Sep 17 00:00:00 2001 +From ec46dbc46816854bf4a300cc5772da05f743df52 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 28 Aug 2023 14:01:19 +0200 +Date: Tue, 19 Sep 2023 14:30:49 +0200 Subject: [PATCH 2/7] amd-pref-core Signed-off-by: Peter Jung --- .../admin-guide/kernel-parameters.txt | 5 + - Documentation/admin-guide/pm/amd-pstate.rst | 53 ++++++ + Documentation/admin-guide/pm/amd-pstate.rst | 58 +++++- arch/x86/Kconfig | 5 +- drivers/acpi/cppc_acpi.c | 13 ++ drivers/acpi/processor_driver.c | 6 + - drivers/cpufreq/amd-pstate-ut.c | 50 +++--- - drivers/cpufreq/amd-pstate.c | 152 ++++++++++++++++-- + drivers/cpufreq/amd-pstate-ut.c | 4 +- + drivers/cpufreq/amd-pstate.c | 197 ++++++++++++++++-- drivers/cpufreq/cpufreq.c | 13 ++ include/acpi/cppc_acpi.h | 5 + - include/linux/amd-pstate.h | 1 + - include/linux/cpufreq.h | 4 + - 11 files changed, 259 insertions(+), 48 deletions(-) + include/linux/amd-pstate.h | 6 + + include/linux/cpufreq.h | 5 + + 11 files changed, 293 insertions(+), 24 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 722b6eca2e93..ac95d4c9666e 100644 +index 23ebe34ff901e..f23ec4dc6c4b9 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -363,6 +363,11 @@ @@ -2161,20 +2161,31 @@ index 722b6eca2e93..ac95d4c9666e 100644 + amd_prefcore= + [X86] + disable -+ Disable AMD Pstate Preferred Core. ++ Disable amd-pstate preferred core. + amijoy.map= [HW,JOY] Amiga joystick support Map of devices attached to JOY0DAT and JOY1DAT Format: , diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst -index 1cf40f69278c..2369b58a3521 100644 +index 1cf40f69278cd..b729bc6dabd80 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst -@@ -353,6 +353,47 @@ is activated. In this mode, driver requests minimum and maximum performance +@@ -300,8 +300,8 @@ platforms. The AMD P-States mechanism is the more performance and energy + efficiency frequency management method on AMD processors. + + +-AMD Pstate Driver Operation Modes +-================================= ++``amd-pstate`` Driver Operation Modes ++====================================== + + ``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode, + non-autonomous (passive) mode and guided autonomous (guided) mode. +@@ -353,6 +353,48 @@ is activated. In this mode, driver requests minimum and maximum performance level and the platform autonomously selects a performance level in this range and appropriate to the current workload. 
-+AMD Pstate Preferred Core ++``amd-pstate`` Preferred Core +================================= + +The core frequency is subjected to the process variation in semiconductors. @@ -2185,60 +2196,61 @@ index 1cf40f69278c..2369b58a3521 100644 +scenario, OS needs to know the core ordering informed by the platform through +highest performance capability register of the CPPC interface. + -+``AMD Pstate Preferred Core`` enable the scheduler to favor scheduling on cores -+can be get a higher frequency with lower voltage under preferred core. -+And it has the ability to dynamically change the preferred core based on the -+workload and platform conditions and accounting for thermals and aging. ++``amd-pstate`` preferred core enables the scheduler to prefer scheduling on ++cores that can achieve a higher frequency with lower voltage. The preferred ++core rankings can dynamically change based on the workload, platform conditions, ++thermals and ageing. + -+The priority metric will be initialized by the AMD Pstate driver. The AMD Pstate -+driver will also determine whether or not ``AMD Pstate Preferred Core`` is ++The priority metric will be initialized by the ``amd-pstate`` driver. The ``amd-pstate`` ++driver will also determine whether or not ``amd-pstate`` preferred core is +supported by the platform. + -+AMD Pstate driver will provide an initial core ordering when the system boots. ++``amd-pstate`` driver will provide an initial core ordering when the system boots. +The platform uses the CPPC interfaces to communicate the core ranking to the +operating system and scheduler to make sure that OS is choosing the cores -+with highest performance firstly for scheduling the process. When AMD Pstate ++with highest performance firstly for scheduling the process. When ``amd-pstate`` +driver receives a message with the highest performance change, it will +update the core ranking and set the cpu's priority. + -+AMD Preferred Core Switch ++``amd-pstate`` Preferred Core Switch +================================= +Kernel Parameters +----------------- + -+``AMD Pstate Preferred Core`` has two states: enable and disable. ++``amd-pstate`` peferred core`` has two states: enable and disable. +Enable/disable states can be chosen by different kernel parameters. -+Default enable ``AMD Pstate Preferred Core``. ++Default enable ``amd-pstate`` preferred core. + +``amd_prefcore=disable`` + -+``AMD Pstate Preferred Core`` will be enabled if the underlying platform -+supports it. It can be disabled by kernerl parameter: ``amd_prefcore=disable``. ++For systems that support ``amd-pstate`` preferred core, the core rankings will ++always be advertised by the platform. But OS can choose to ignore that via the ++kernel parameter ``amd_prefcore=disable``. + User Space Interface in ``sysfs`` - General =========================================== -@@ -385,6 +426,18 @@ control its functionality at the system level. They are located in the +@@ -385,6 +427,18 @@ control its functionality at the system level. They are located in the to the operation mode represented by that string - or to be unregistered in the "disable" case. +``prefcore`` -+ Preferred Core state of the driver: "enabled" or "disabled". ++ Preferred core state of the driver: "enabled" or "disabled". + + "enabled" -+ Enable the AMD Preferred Core. ++ Enable the ``amd-pstate`` preferred core. + + "disabled" -+ Disable the AMD Preferred Core ++ Disable the ``amd-pstate`` preferred core + + -+ This attribute is read-only to check the state of Preferred Core. 
++ This attribute is read-only to check the state of preferred core. + ``cpupower`` tool support for ``amd-pstate`` =============================================== diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index e36261b4ea14..16df141bd8a2 100644 +index e36261b4ea14f..14d0741139801 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1052,8 +1052,9 @@ config SCHED_MC @@ -2249,12 +2261,12 @@ index e36261b4ea14..16df141bd8a2 100644 - select X86_INTEL_PSTATE + depends on SCHED_MC + select X86_INTEL_PSTATE if CPU_SUP_INTEL -+ select X86_AMD_PSTATE if CPU_SUP_AMD ++ select X86_AMD_PSTATE if CPU_SUP_AMD && ACPI select CPU_FREQ default y help diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c -index 7ff269a78c20..ad388a0e8484 100644 +index 7ff269a78c208..ad388a0e84842 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -1154,6 +1154,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) @@ -2278,7 +2290,7 @@ index 7ff269a78c20..ad388a0e8484 100644 * cppc_get_epp_perf - Get the epp register value. * @cpunum: CPU from which to get epp preference value. diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c -index 4bd16b3f0781..29b2fb68a35d 100644 +index 4bd16b3f07814..29b2fb68a35db 100644 --- a/drivers/acpi/processor_driver.c +++ b/drivers/acpi/processor_driver.c @@ -27,6 +27,7 @@ @@ -2302,40 +2314,10 @@ index 4bd16b3f0781..29b2fb68a35d 100644 acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event); break; diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c -index 7f3fe2048981..f04ae67dda37 100644 +index 502d494499ae8..f04ae67dda372 100644 --- a/drivers/cpufreq/amd-pstate-ut.c +++ b/drivers/cpufreq/amd-pstate-ut.c -@@ -64,27 +64,9 @@ static struct amd_pstate_ut_struct amd_pstate_ut_cases[] = { - static bool get_shared_mem(void) - { - bool result = false; -- char path[] = "/sys/module/amd_pstate/parameters/shared_mem"; -- char buf[5] = {0}; -- struct file *filp = NULL; -- loff_t pos = 0; -- ssize_t ret; -- -- if (!boot_cpu_has(X86_FEATURE_CPPC)) { -- filp = filp_open(path, O_RDONLY, 0); -- if (IS_ERR(filp)) -- pr_err("%s unable to open %s file!\n", __func__, path); -- else { -- ret = kernel_read(filp, &buf, sizeof(buf), &pos); -- if (ret < 0) -- pr_err("%s read %s file fail ret=%ld!\n", -- __func__, path, (long)ret); -- filp_close(filp, NULL); -- } - -- if ('Y' == *buf) -- result = true; -- } -+ if (!boot_cpu_has(X86_FEATURE_CPPC)) -+ result = true; - - return result; - } -@@ -145,8 +127,6 @@ static void amd_pstate_ut_check_perf(u32 index) +@@ -127,8 +127,6 @@ static void amd_pstate_ut_check_perf(u32 index) struct cpufreq_policy *policy = NULL; struct amd_cpudata *cpudata = NULL; @@ -2344,98 +2326,24 @@ index 7f3fe2048981..f04ae67dda37 100644 for_each_possible_cpu(cpu) { policy = cpufreq_cpu_get(cpu); if (!policy) -@@ -158,9 +138,10 @@ static void amd_pstate_ut_check_perf(u32 index) - if (ret) { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; - pr_err("%s cppc_get_perf_caps ret=%d error!\n", __func__, ret); -- return; -+ goto skip_test; +@@ -143,6 +141,7 @@ static void amd_pstate_ut_check_perf(u32 index) + goto skip_test; } + highest_perf = cppc_perf.highest_perf; nominal_perf = cppc_perf.nominal_perf; lowest_nonlinear_perf = cppc_perf.lowest_nonlinear_perf; lowest_perf = cppc_perf.lowest_perf; -@@ -169,9 +150,10 @@ static void amd_pstate_ut_check_perf(u32 index) - if (ret) { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; - pr_err("%s read CPPC_CAP1 ret=%d 
error!\n", __func__, ret); -- return; -+ goto skip_test; +@@ -154,6 +153,7 @@ static void amd_pstate_ut_check_perf(u32 index) + goto skip_test; } + highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); nominal_perf = AMD_CPPC_NOMINAL_PERF(cap1); lowest_nonlinear_perf = AMD_CPPC_LOWNONLIN_PERF(cap1); lowest_perf = AMD_CPPC_LOWEST_PERF(cap1); -@@ -187,7 +169,7 @@ static void amd_pstate_ut_check_perf(u32 index) - nominal_perf, cpudata->nominal_perf, - lowest_nonlinear_perf, cpudata->lowest_nonlinear_perf, - lowest_perf, cpudata->lowest_perf); -- return; -+ goto skip_test; - } - - if (!((highest_perf >= nominal_perf) && -@@ -198,11 +180,15 @@ static void amd_pstate_ut_check_perf(u32 index) - pr_err("%s cpu%d highest=%d >= nominal=%d > lowest_nonlinear=%d > lowest=%d > 0, the formula is incorrect!\n", - __func__, cpu, highest_perf, nominal_perf, - lowest_nonlinear_perf, lowest_perf); -- return; -+ goto skip_test; - } -+ cpufreq_cpu_put(policy); - } - - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; -+ return; -+skip_test: -+ cpufreq_cpu_put(policy); - } - - /* -@@ -230,14 +216,14 @@ static void amd_pstate_ut_check_freq(u32 index) - pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n", - __func__, cpu, cpudata->max_freq, cpudata->nominal_freq, - cpudata->lowest_nonlinear_freq, cpudata->min_freq); -- return; -+ goto skip_test; - } - - if (cpudata->min_freq != policy->min) { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; - pr_err("%s cpu%d cpudata_min_freq=%d policy_min=%d, they should be equal!\n", - __func__, cpu, cpudata->min_freq, policy->min); -- return; -+ goto skip_test; - } - - if (cpudata->boost_supported) { -@@ -249,16 +235,20 @@ static void amd_pstate_ut_check_freq(u32 index) - pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n", - __func__, cpu, policy->max, cpudata->max_freq, - cpudata->nominal_freq); -- return; -+ goto skip_test; - } - } else { - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; - pr_err("%s cpu%d must support boost!\n", __func__, cpu); -- return; -+ goto skip_test; - } -+ cpufreq_cpu_put(policy); - } - - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; -+ return; -+skip_test: -+ cpufreq_cpu_put(policy); - } - - static int __init amd_pstate_ut_init(void) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 9a1e194d5cf8..8a8e4ecb1b5c 100644 +index 9a1e194d5cf88..97b1d4674b4f0 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -37,6 +37,7 @@ @@ -2455,17 +2363,20 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 /* * TODO: We need more time to fine tune processors with shared memory solution -@@ -65,6 +68,9 @@ static struct cpufreq_driver amd_pstate_epp_driver; +@@ -65,6 +68,12 @@ static struct cpufreq_driver amd_pstate_epp_driver; static int cppc_state = AMD_PSTATE_UNDEFINED; static bool cppc_enabled; ++/*HW preferred Core featue is supported*/ ++static bool hw_prefcore = true; ++ +/*Preferred Core featue is supported*/ +static bool prefcore = true; + /* * AMD Energy Preference Performance (EPP) * The EPP is used in the CCLK DPM controller to drive -@@ -290,27 +296,26 @@ static inline int amd_pstate_enable(bool enable) +@@ -290,27 +299,26 @@ static inline int amd_pstate_enable(bool enable) static int pstate_init_perf(struct amd_cpudata *cpudata) { u64 cap1; @@ -2490,7 +2401,7 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 - highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); - - 
WRITE_ONCE(cpudata->highest_perf, highest_perf); -+ if (prefcore) ++ if (hw_prefcore) + WRITE_ONCE(cpudata->highest_perf, AMD_PSTATE_PREFCORE_THRESHOLD); + else + WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); @@ -2498,11 +2409,11 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); -+ WRITE_ONCE(cpudata->prefcore_highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); ++ WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1)); return 0; } -@@ -318,22 +323,21 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) +@@ -318,22 +326,21 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) static int cppc_init_perf(struct amd_cpudata *cpudata) { struct cppc_perf_caps cppc_perf; @@ -2517,7 +2428,7 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 - highest_perf = cppc_perf.highest_perf; - - WRITE_ONCE(cpudata->highest_perf, highest_perf); -+ if (prefcore) ++ if (hw_prefcore) + WRITE_ONCE(cpudata->highest_perf, AMD_PSTATE_PREFCORE_THRESHOLD); + else + WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); @@ -2526,16 +2437,25 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 WRITE_ONCE(cpudata->lowest_nonlinear_perf, cppc_perf.lowest_nonlinear_perf); WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); -+ WRITE_ONCE(cpudata->prefcore_highest_perf, cppc_perf.highest_perf); ++ WRITE_ONCE(cpudata->prefcore_ranking, cppc_perf.highest_perf); if (cppc_state == AMD_PSTATE_ACTIVE) return 0; -@@ -676,6 +680,100 @@ static void amd_perf_ctl_reset(unsigned int cpu) +@@ -540,7 +547,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, + if (target_perf < capacity) + des_perf = DIV_ROUND_UP(cap_perf * target_perf, capacity); + +- min_perf = READ_ONCE(cpudata->highest_perf); ++ min_perf = READ_ONCE(cpudata->lowest_perf); + if (_min_perf < capacity) + min_perf = DIV_ROUND_UP(cap_perf * _min_perf, capacity); + +@@ -676,6 +683,116 @@ static void amd_perf_ctl_reset(unsigned int cpu) wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0); } +/* -+ * Set AMD Pstate Preferred Core enable can't be done directly from cpufreq callbacks ++ * Set amd-pstate preferred core enable can't be done directly from cpufreq callbacks + * due to locking, so queue the work for later. + */ +static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) @@ -2544,69 +2464,38 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 +} +static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); + -+/** ++/* + * Get the highest performance register value. + * @cpu: CPU from which to get highest performance. + * @highest_perf: Return address. + * + * Return: 0 for success, -EIO otherwise. 
+ */ -+static int amd_pstate_get_highest_perf(int cpu, u64 *highest_perf) ++static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf) +{ -+ int ret; ++ int ret; ++ u64 cppc_highest_perf; + -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ u64 cap1; ++ if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ u64 cap1; + -+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); -+ if (ret) -+ return ret; -+ WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); -+ } else { -+ ret = cppc_get_highest_perf(cpu, highest_perf); -+ } -+ -+ return (ret); -+} -+ -+static void amd_pstate_init_prefcore(void) -+{ -+ int cpu, ret; -+ u64 highest_perf; -+ -+ if (!prefcore) -+ return; -+ -+ for_each_online_cpu(cpu) { -+ ret = amd_pstate_get_highest_perf(cpu, &highest_perf); ++ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); + if (ret) -+ break; -+ -+ sched_set_itmt_core_prio(highest_perf, cpu); -+ -+ /* check if CPPC preferred core feature is enabled*/ -+ if (highest_perf == AMD_PSTATE_MAX_CPPC_PERF) { -+ prefcore = false; -+ return; -+ } ++ return ret; ++ WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); ++ } else { ++ ret = cppc_get_highest_perf(cpu, &cppc_highest_perf); ++ *highest_perf = (u32)(cppc_highest_perf & 0xFFFF); + } + -+ /* -+ * This code can be run during CPU online under the -+ * CPU hotplug locks, so sched_set_amd_prefcore_support() -+ * cannot be called from here. Queue up a work item -+ * to invoke it. -+ */ -+ schedule_work(&sched_prefcore_work); ++ return (ret); +} + -+static void amd_pstate_update_highest_perf(unsigned int cpu) ++static void amd_pstate_init_prefcore(unsigned int cpu) +{ -+ struct cpufreq_policy *policy; -+ struct amd_cpudata *cpudata; -+ u32 prev_high = 0, cur_high = 0; -+ u64 highest_perf; + int ret; ++ u32 highest_perf; ++ static u32 max_highest_perf = 0, min_highest_perf = U32_MAX; + + if (!prefcore) + return; @@ -2615,13 +2504,60 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 + if (ret) + return; + ++ /* ++ * The priorities can be set regardless of whether or not ++ * sched_set_itmt_support(true) has been called and it is valid to ++ * update them at any time after it has been called. ++ */ ++ sched_set_itmt_core_prio(highest_perf, cpu); ++ ++ /* check if CPPC preferred core feature is enabled*/ ++ if (highest_perf == AMD_PSTATE_MAX_CPPC_PERF) { ++ pr_debug("AMD CPPC preferred core is unsupported!\n"); ++ hw_prefcore = false; ++ prefcore = false; ++ return; ++ } ++ ++ if (max_highest_perf <= min_highest_perf) { ++ if (highest_perf > max_highest_perf) ++ max_highest_perf = highest_perf; ++ ++ if (highest_perf < min_highest_perf) ++ min_highest_perf = highest_perf; ++ ++ if (max_highest_perf > min_highest_perf) { ++ /* ++ * This code can be run during CPU online under the ++ * CPU hotplug locks, so sched_set_itmt_support() ++ * cannot be called from here. Queue up a work item ++ * to invoke it. 
++ */ ++ schedule_work(&sched_prefcore_work); ++ } ++ } ++} ++ ++static void amd_pstate_update_highest_perf(unsigned int cpu) ++{ ++ struct cpufreq_policy *policy; ++ struct amd_cpudata *cpudata; ++ u32 prev_high = 0, cur_high = 0; ++ int ret; ++ ++ if (!prefcore) ++ return; ++ ++ ret = amd_pstate_get_highest_perf(cpu, &cur_high); ++ if (ret) ++ return; ++ + policy = cpufreq_cpu_get(cpu); + cpudata = policy->driver_data; -+ cur_high = highest_perf; -+ prev_high = READ_ONCE(cpudata->prefcore_highest_perf); ++ prev_high = READ_ONCE(cpudata->prefcore_ranking); + + if (prev_high != cur_high) { -+ WRITE_ONCE(cpudata->prefcore_highest_perf, cur_high); ++ WRITE_ONCE(cpudata->prefcore_ranking, cur_high); + sched_set_itmt_core_prio(cur_high, cpu); + } + @@ -2631,12 +2567,53 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 static int amd_pstate_cpu_init(struct cpufreq_policy *policy) { int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; -@@ -1037,6 +1135,12 @@ static ssize_t status_store(struct device *a, struct device_attribute *b, +@@ -697,6 +814,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + + cpudata->cpu = policy->cpu; + ++ amd_pstate_init_prefcore(policy->cpu); ++ + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; +@@ -763,6 +882,22 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + return ret; + } + ++static int amd_pstate_cpu_online(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ ++ pr_debug("CPU %d going online\n", cpudata->cpu); ++ ++ amd_pstate_init_prefcore(cpudata->cpu); ++ ++ return 0; ++} ++ ++static int amd_pstate_cpu_offline(struct cpufreq_policy *policy) ++{ ++ return 0; ++} ++ + static int amd_pstate_cpu_exit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +@@ -840,7 +975,7 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, + u32 perf; + struct amd_cpudata *cpudata = policy->driver_data; + +- perf = READ_ONCE(cpudata->highest_perf); ++ perf = READ_ONCE(cpudata->prefcore_ranking); + + return sysfs_emit(buf, "%u\n", perf); + } +@@ -1037,6 +1172,12 @@ static ssize_t status_store(struct device *a, struct device_attribute *b, return ret < 0 ? ret : count; } +static ssize_t prefcore_show(struct device *dev, -+ struct device_attribute *attr, char *buf) ++ struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%s\n", prefcore ? 
"enabled" : "disabled"); +} @@ -2644,7 +2621,7 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 cpufreq_freq_attr_ro(amd_pstate_max_freq); cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); -@@ -1044,6 +1148,7 @@ cpufreq_freq_attr_ro(amd_pstate_highest_perf); +@@ -1044,6 +1185,7 @@ cpufreq_freq_attr_ro(amd_pstate_highest_perf); cpufreq_freq_attr_rw(energy_performance_preference); cpufreq_freq_attr_ro(energy_performance_available_preferences); static DEVICE_ATTR_RW(status); @@ -2652,7 +2629,7 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 static struct freq_attr *amd_pstate_attr[] = { &amd_pstate_max_freq, -@@ -1063,6 +1168,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { +@@ -1063,6 +1205,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { static struct attribute *pstate_global_attributes[] = { &dev_attr_status.attr, @@ -2660,7 +2637,30 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 NULL }; -@@ -1392,6 +1498,7 @@ static struct cpufreq_driver amd_pstate_driver = { +@@ -1114,6 +1257,8 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + cpudata->cpu = policy->cpu; + cpudata->epp_policy = 0; + ++ amd_pstate_init_prefcore(policy->cpu); ++ + ret = amd_pstate_init_perf(cpudata); + if (ret) + goto free_cpudata1; +@@ -1285,6 +1430,8 @@ static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) + + pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); + ++ amd_pstate_init_prefcore(cpudata->cpu); ++ + if (cppc_state == AMD_PSTATE_ACTIVE) { + amd_pstate_epp_reenable(cpudata); + cpudata->suspended = false; +@@ -1389,9 +1536,12 @@ static struct cpufreq_driver amd_pstate_driver = { + .fast_switch = amd_pstate_fast_switch, + .init = amd_pstate_cpu_init, + .exit = amd_pstate_cpu_exit, ++ .offline = amd_pstate_cpu_offline, ++ .online = amd_pstate_cpu_online, .suspend = amd_pstate_cpu_suspend, .resume = amd_pstate_cpu_resume, .set_boost = amd_pstate_set_boost, @@ -2668,7 +2668,7 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 .name = "amd-pstate", .attr = amd_pstate_attr, }; -@@ -1406,6 +1513,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { +@@ -1406,6 +1556,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { .online = amd_pstate_epp_cpu_online, .suspend = amd_pstate_epp_suspend, .resume = amd_pstate_epp_resume, @@ -2676,16 +2676,7 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 .name = "amd-pstate-epp", .attr = amd_pstate_epp_attr, }; -@@ -1506,6 +1614,8 @@ static int __init amd_pstate_init(void) - } - } - -+ amd_pstate_init_prefcore(); -+ - return ret; - - global_attr_free: -@@ -1527,7 +1637,17 @@ static int __init amd_pstate_param(char *str) +@@ -1527,7 +1678,17 @@ static int __init amd_pstate_param(char *str) return amd_pstate_set_driver(mode_idx); } @@ -2704,10 +2695,10 @@ index 9a1e194d5cf8..8a8e4ecb1b5c 100644 MODULE_AUTHOR("Huang Rui "); MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c -index 50bbc969ffe5..842357abfae6 100644 +index 5c655d7b96d4f..abefdb30fcaa2 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c -@@ -2675,6 +2675,19 @@ void cpufreq_update_limits(unsigned int cpu) +@@ -2677,6 +2677,19 @@ void cpufreq_update_limits(unsigned int cpu) } EXPORT_SYMBOL_GPL(cpufreq_update_limits); @@ -2728,7 +2719,7 @@ index 50bbc969ffe5..842357abfae6 100644 * BOOST * *********************************************************************/ diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h -index 6126c977ece0..c0b69ffe7bdb 100644 +index 
6126c977ece04..c0b69ffe7bdb4 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -139,6 +139,7 @@ struct cppc_cpudata { @@ -2751,19 +2742,36 @@ index 6126c977ece0..c0b69ffe7bdb 100644 { return -ENOTSUPP; diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h -index 446394f84606..fa86bc953d3e 100644 +index 446394f846064..030a6a97c2b94 100644 --- a/include/linux/amd-pstate.h +++ b/include/linux/amd-pstate.h -@@ -70,6 +70,7 @@ struct amd_cpudata { +@@ -39,11 +39,16 @@ struct amd_aperf_mperf { + * @cppc_req_cached: cached performance request hints + * @highest_perf: the maximum performance an individual processor may reach, + * assuming ideal conditions ++ * For platforms that do not support the preferred core feature, the ++ * highest_pef may be configured with 166 or 255, to avoid max frequency ++ * calculated wrongly. we take the fixed value as the highest_perf. + * @nominal_perf: the maximum sustained performance level of the processor, + * assuming ideal operating conditions + * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power + * savings are achieved + * @lowest_perf: the absolute lowest performance level of the processor ++ * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher ++ * priority. + * @max_freq: the frequency that mapped to highest_perf + * @min_freq: the frequency that mapped to lowest_perf + * @nominal_freq: the frequency that mapped to nominal_perf +@@ -70,6 +75,7 @@ struct amd_cpudata { u32 nominal_perf; u32 lowest_nonlinear_perf; u32 lowest_perf; -+ u32 prefcore_highest_perf; ++ u32 prefcore_ranking; u32 max_freq; u32 min_freq; diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h -index 172ff51c1b2a..766c83a4fae7 100644 +index 172ff51c1b2a4..9ca50c4e19d35 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -231,6 +231,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); @@ -2774,7 +2782,15 @@ index 172ff51c1b2a..766c83a4fae7 100644 bool have_governor_per_policy(void); bool cpufreq_supports_freq_invariance(void); struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); -@@ -376,6 +377,9 @@ struct cpufreq_driver { +@@ -259,6 +260,7 @@ static inline bool cpufreq_supports_freq_invariance(void) + return false; + } + static inline void disable_cpufreq(void) { } ++static inline void cpufreq_update_highest_perf(unsigned int cpu) { } + #endif + + #ifdef CONFIG_CPU_FREQ_STAT +@@ -376,6 +378,9 @@ struct cpufreq_driver { /* Called to update policy limits on firmware notifications. 
*/ void (*update_limits)(unsigned int cpu); @@ -2787,9 +2803,9 @@ index 172ff51c1b2a..766c83a4fae7 100644 -- 2.42.0 -From b35ba9f5a6ca4ac70053f1120b2042daa320ea59 Mon Sep 17 00:00:00 2001 +From 726724b1b77008f7c4532a424379d373fb8c405d Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 13 Aug 2023 22:53:18 +0200 +Date: Wed, 13 Sep 2023 14:31:43 +0200 Subject: [PATCH 3/7] bbr3 Signed-off-by: Peter Jung @@ -2812,7 +2828,7 @@ Signed-off-by: Peter Jung 15 files changed, 1934 insertions(+), 551 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h -index 91a37c99ba66..ae0ee688c3f7 100644 +index 91a37c99ba665..ae0ee688c3f7b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -255,7 +255,9 @@ struct tcp_sock { @@ -2827,7 +2843,7 @@ index 91a37c99ba66..ae0ee688c3f7 100644 u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index c2b15f7e5516..a400a84088d3 100644 +index c2b15f7e55161..a400a84088d38 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -135,8 +135,8 @@ struct inet_connection_sock { @@ -2842,10 +2858,10 @@ index c2b15f7e5516..a400a84088d3 100644 #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/net/tcp.h b/include/net/tcp.h -index 0ca972ebd3dd..8eb194559b70 100644 +index 10fc5c5928f71..f0f4a343df8cf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h -@@ -370,6 +370,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, +@@ -369,6 +369,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk, #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 @@ -2854,7 +2870,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 enum tcp_tw_status { TCP_TW_SUCCESS = 0, -@@ -723,6 +725,15 @@ static inline void tcp_fast_path_check(struct sock *sk) +@@ -722,6 +724,15 @@ static inline void tcp_fast_path_check(struct sock *sk) tcp_fast_path_on(tp); } @@ -2870,7 +2886,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(struct sock *sk) { -@@ -819,6 +830,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) +@@ -818,6 +829,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } @@ -2882,7 +2898,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); -@@ -894,9 +910,14 @@ struct tcp_skb_cb { +@@ -893,9 +909,14 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -2899,7 +2915,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; -@@ -1000,6 +1021,7 @@ enum tcp_ca_event { +@@ -999,6 +1020,7 @@ enum tcp_ca_event { CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ @@ -2907,7 +2923,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ -@@ -1022,7 +1044,11 @@ enum tcp_ca_ack_event_flags { +@@ -1021,7 +1043,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 @@ -2920,7 +2936,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 union tcp_cc_info; -@@ -1042,10 +1068,13 @@ 
struct ack_sample { +@@ -1041,10 +1067,13 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ @@ -2935,7 +2951,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ -@@ -1056,7 +1085,9 @@ struct rate_sample { +@@ -1055,7 +1084,9 @@ struct rate_sample { u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ @@ -2945,7 +2961,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 }; struct tcp_congestion_ops { -@@ -1080,8 +1111,11 @@ struct tcp_congestion_ops { +@@ -1079,8 +1110,11 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); @@ -2959,7 +2975,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) -@@ -1147,6 +1181,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) +@@ -1146,6 +1180,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif @@ -2974,7 +2990,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); -@@ -1166,6 +1208,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) +@@ -1165,6 +1207,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ @@ -2982,7 +2998,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); -@@ -1178,6 +1221,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) +@@ -1177,6 +1220,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } @@ -3004,7 +3020,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. 
-@@ -2177,7 +2235,7 @@ struct tcp_plb_state { +@@ -2176,7 +2234,7 @@ struct tcp_plb_state { u8 consec_cong_rounds:5, /* consecutive congested rounds */ unused:3; u32 pause_until; /* jiffies32 when PLB can resume rerouting */ @@ -3014,7 +3030,7 @@ index 0ca972ebd3dd..8eb194559b70 100644 static inline void tcp_plb_init(const struct sock *sk, struct tcp_plb_state *plb) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h -index 50655de04c9b..82f8bd8f0d16 100644 +index 50655de04c9b6..82f8bd8f0d161 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -229,6 +229,29 @@ struct tcp_bbr_info { @@ -3048,7 +3064,7 @@ index 50655de04c9b..82f8bd8f0d16 100644 union tcp_cc_info { diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h -index 51c13cf9c5ae..de8dcba26bec 100644 +index 51c13cf9c5aee..de8dcba26becc 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -506,9 +506,11 @@ enum { @@ -3065,7 +3081,7 @@ index 51c13cf9c5ae..de8dcba26bec 100644 struct rta_session { __u8 proto; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h -index 879eeb0a084b..77270053a5e3 100644 +index 879eeb0a084b4..77270053a5e39 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail { @@ -3077,7 +3093,7 @@ index 879eeb0a084b..77270053a5e3 100644 /* * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 2dfb12230f08..2e14db3bee70 100644 +index 2dfb12230f089..2e14db3bee704 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -668,15 +668,18 @@ config TCP_CONG_BBR @@ -3109,7 +3125,7 @@ index 2dfb12230f08..2e14db3bee70 100644 choice prompt "Default TCP congestion control" diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 8ed52e1e3c99..0198ac17f3a8 100644 +index 75f24b931a185..25136face23c0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3083,6 +3083,7 @@ int tcp_disconnect(struct sock *sk, int flags) @@ -3130,7 +3146,7 @@ index 8ed52e1e3c99..0198ac17f3a8 100644 info->tcpi_options |= TCPI_OPT_SYN_DATA; diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c -index 146792cd26fe..f4f477a69917 100644 +index 146792cd26fed..f4f477a69917d 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1,18 +1,19 @@ @@ -5775,7 +5791,7 @@ index 146792cd26fe..f4f477a69917 100644 MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); +MODULE_VERSION(__stringify(BBR_VERSION)); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c -index 1b34050a7538..66d40449b3f4 100644 +index 1b34050a7538b..66d40449b3f4f 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -241,6 +241,7 @@ void tcp_init_congestion_control(struct sock *sk) @@ -5787,10 +5803,10 @@ index 1b34050a7538..66d40449b3f4 100644 icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 57c8af1859c1..2195ba488142 100644 +index 48c2b96b08435..64f90414294b9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c -@@ -349,7 +349,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) +@@ -348,7 +348,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: @@ -5799,7 +5815,7 @@ index 57c8af1859c1..2195ba488142 100644 tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { -@@ -360,7 +360,7 @@ static void 
__tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) +@@ -359,7 +359,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tp->ecn_flags |= TCP_ECN_SEEN; break; default: @@ -5808,7 +5824,7 @@ index 57c8af1859c1..2195ba488142 100644 tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; -@@ -1079,7 +1079,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) +@@ -1078,7 +1078,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { @@ -5821,7 +5837,7 @@ index 57c8af1859c1..2195ba488142 100644 } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) -@@ -1460,6 +1465,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, +@@ -1459,6 +1464,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); @@ -5839,7 +5855,7 @@ index 57c8af1859c1..2195ba488142 100644 /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep -@@ -3688,7 +3704,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +@@ -3687,7 +3703,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) /* This routine deals with acks during a TLP episode and ends an episode by * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ @@ -5849,7 +5865,7 @@ index 57c8af1859c1..2195ba488142 100644 { struct tcp_sock *tp = tcp_sk(sk); -@@ -3705,6 +3722,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) +@@ -3704,6 +3721,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) /* ACK advances: there was a loss, so reduce cwnd. Reset * tlp_high_seq in tcp_init_cwnd_reduction() */ @@ -5857,7 +5873,7 @@ index 57c8af1859c1..2195ba488142 100644 tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); -@@ -3715,6 +3733,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) +@@ -3714,6 +3732,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) FLAG_NOT_DUP | FLAG_DATA_SACKED))) { /* Pure dupack: original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; @@ -5869,7 +5885,7 @@ index 57c8af1859c1..2195ba488142 100644 } } -@@ -3819,6 +3842,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3818,6 +3841,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); @@ -5877,7 +5893,7 @@ index 57c8af1859c1..2195ba488142 100644 /* ts_recent update must be made after we are sure that the packet * is in window. 
-@@ -3893,7 +3917,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3892,7 +3916,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); if (tp->tlp_high_seq) @@ -5886,7 +5902,7 @@ index 57c8af1859c1..2195ba488142 100644 if (tcp_ack_is_dubious(sk, flag)) { if (!(flag & (FLAG_SND_UNA_ADVANCED | -@@ -3917,6 +3941,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3916,6 +3940,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); @@ -5894,7 +5910,7 @@ index 57c8af1859c1..2195ba488142 100644 tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); -@@ -3936,7 +3961,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3935,7 +3960,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_ack_probe(sk); if (tp->tlp_high_seq) @@ -5903,7 +5919,7 @@ index 57c8af1859c1..2195ba488142 100644 return 1; old_ack: -@@ -5527,13 +5552,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +@@ -5526,13 +5551,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -5921,7 +5937,7 @@ index 57c8af1859c1..2195ba488142 100644 tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c -index c8f2aa003387..fdf51e436899 100644 +index c8f2aa0033871..fdf51e436899f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -440,6 +440,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -5934,7 +5950,7 @@ index c8f2aa003387..fdf51e436899 100644 const struct tcp_congestion_ops *ca; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 51d8638d4b4c..2fb064057868 100644 +index 9f9ca68c47026..9affccab1a942 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -325,10 +325,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) @@ -6045,7 +6061,7 @@ index 51d8638d4b4c..2fb064057868 100644 goto rearm_timer; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c -index a8f6d9d06f2e..8737f2134648 100644 +index a8f6d9d06f2eb..8737f21346481 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -34,6 +34,24 @@ @@ -6125,10 +6141,10 @@ index a8f6d9d06f2e..8737f2134648 100644 rs->interval_us = max(snd_us, ack_us); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index 206418b6d7c4..619069963ff0 100644 +index a9f6200f12f15..445c4df7406f7 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c -@@ -626,6 +626,7 @@ void tcp_write_timer_handler(struct sock *sk) +@@ -642,6 +642,7 @@ void tcp_write_timer_handler(struct sock *sk) return; } @@ -6139,9 +6155,9 @@ index 206418b6d7c4..619069963ff0 100644 -- 2.42.0 -From 41db757e2b0e00035bdd9692a6b5d143eac1d33e Mon Sep 17 00:00:00 2001 +From 824c9adf2297e355eb9d6067869825bb179f248e Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 28 Aug 2023 14:01:56 +0200 +Date: Mon, 25 Sep 2023 18:09:53 +0200 Subject: [PATCH 4/7] cachy Signed-off-by: Peter Jung @@ -6161,7 +6177,7 @@ Signed-off-by: Peter Jung 
arch/arc/configs/tb10x_defconfig | 1 + arch/arc/configs/vdk_hs38_defconfig | 1 + arch/arc/configs/vdk_hs38_smp_defconfig | 1 + - arch/x86/Kconfig.cpu | 427 ++- + arch/x86/Kconfig.cpu | 427 +- arch/x86/Makefile | 46 +- arch/x86/include/asm/pci.h | 6 + arch/x86/include/asm/vermagic.h | 74 + @@ -6172,16 +6188,16 @@ Signed-off-by: Peter Jung drivers/cpufreq/Kconfig.x86 | 2 - drivers/i2c/busses/Kconfig | 9 + drivers/i2c/busses/Makefile | 1 + - drivers/i2c/busses/i2c-nct6775.c | 647 ++++ + drivers/i2c/busses/i2c-nct6775.c | 648 ++ drivers/i2c/busses/i2c-piix4.c | 4 +- drivers/md/dm-crypt.c | 5 + drivers/pci/controller/Makefile | 6 + - drivers/pci/controller/intel-nvme-remap.c | 462 +++ + drivers/pci/controller/intel-nvme-remap.c | 462 ++ drivers/pci/quirks.c | 101 + drivers/platform/x86/Kconfig | 24 + drivers/platform/x86/Makefile | 4 + - drivers/platform/x86/legion-laptop.c | 2783 +++++++++++++++++ - drivers/platform/x86/steamdeck.c | 523 ++++ + drivers/platform/x86/legion-laptop.c | 5858 +++++++++++++++++ + drivers/platform/x86/steamdeck.c | 523 ++ include/linux/mm.h | 2 +- include/linux/pagemap.h | 2 +- include/linux/user_namespace.h | 4 + @@ -6191,19 +6207,23 @@ Signed-off-by: Peter Jung kernel/sched/fair.c | 20 +- kernel/sysctl.c | 12 + kernel/user_namespace.c | 7 + + lib/scatterlist.c | 23 +- mm/Kconfig | 2 +- + mm/internal.h | 1 + + mm/list_lru.c | 4 + mm/page-writeback.c | 8 + + mm/page_alloc.c | 42 +- mm/swap.c | 5 + mm/vmpressure.c | 4 + - mm/vmscan.c | 8 + - 50 files changed, 5289 insertions(+), 54 deletions(-) + mm/vmscan.c | 28 +- + 54 files changed, 8419 insertions(+), 90 deletions(-) create mode 100644 drivers/i2c/busses/i2c-nct6775.c create mode 100644 drivers/pci/controller/intel-nvme-remap.c create mode 100644 drivers/platform/x86/legion-laptop.c create mode 100644 drivers/platform/x86/steamdeck.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index ac95d4c9666e..b3eecf5b94f4 100644 +index f23ec4dc6c4b9..851e413f89676 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4276,6 +4276,15 @@ @@ -6223,7 +6243,7 @@ index ac95d4c9666e..b3eecf5b94f4 100644 Safety option to keep boot IRQs enabled. This should never be necessary. 
diff --git a/Makefile b/Makefile -index 2fdd8b40b7e0..8a601d85cd3f 100644 +index 7545d2b0e7b71..4300dd7bd1356 100644 --- a/Makefile +++ b/Makefile @@ -831,6 +831,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) @@ -6249,7 +6269,7 @@ index 2fdd8b40b7e0..8a601d85cd3f 100644 KBUILD_CFLAGS += -Werror=date-time diff --git a/arch/arc/configs/axs101_defconfig b/arch/arc/configs/axs101_defconfig -index 81764160451f..2c15d3bf747a 100644 +index 81764160451f7..2c15d3bf747a9 100644 --- a/arch/arc/configs/axs101_defconfig +++ b/arch/arc/configs/axs101_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -6261,7 +6281,7 @@ index 81764160451f..2c15d3bf747a 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_defconfig b/arch/arc/configs/axs103_defconfig -index d5181275490e..7d868e148d9a 100644 +index d5181275490ed..7d868e148d9a4 100644 --- a/arch/arc/configs/axs103_defconfig +++ b/arch/arc/configs/axs103_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -6273,7 +6293,7 @@ index d5181275490e..7d868e148d9a 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/axs103_smp_defconfig b/arch/arc/configs/axs103_smp_defconfig -index 07c89281c2e3..1513324ddb00 100644 +index 07c89281c2e3a..1513324ddb008 100644 --- a/arch/arc/configs/axs103_smp_defconfig +++ b/arch/arc/configs/axs103_smp_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -6285,7 +6305,7 @@ index 07c89281c2e3..1513324ddb00 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/haps_hs_defconfig b/arch/arc/configs/haps_hs_defconfig -index 8c3ed5d6e6c3..2db643853e8f 100644 +index 8c3ed5d6e6c35..2db643853e8f4 100644 --- a/arch/arc/configs/haps_hs_defconfig +++ b/arch/arc/configs/haps_hs_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y @@ -6297,7 +6317,7 @@ index 8c3ed5d6e6c3..2db643853e8f 100644 CONFIG_PERF_EVENTS=y # CONFIG_COMPAT_BRK is not set diff --git a/arch/arc/configs/haps_hs_smp_defconfig b/arch/arc/configs/haps_hs_smp_defconfig -index 61107e8bac33..d764007e5ada 100644 +index 61107e8bac336..d764007e5adad 100644 --- a/arch/arc/configs/haps_hs_smp_defconfig +++ b/arch/arc/configs/haps_hs_smp_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y @@ -6309,7 +6329,7 @@ index 61107e8bac33..d764007e5ada 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/hsdk_defconfig b/arch/arc/configs/hsdk_defconfig -index 4ee2a1507b57..ce6a4431a76d 100644 +index 4ee2a1507b57f..ce6a4431a76dd 100644 --- a/arch/arc/configs/hsdk_defconfig +++ b/arch/arc/configs/hsdk_defconfig @@ -9,6 +9,7 @@ CONFIG_NAMESPACES=y @@ -6321,7 +6341,7 @@ index 4ee2a1507b57..ce6a4431a76d 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/nsim_700_defconfig b/arch/arc/configs/nsim_700_defconfig -index 3e9829775992..5044609540cc 100644 +index 3e98297759925..5044609540cc3 100644 --- a/arch/arc/configs/nsim_700_defconfig +++ b/arch/arc/configs/nsim_700_defconfig @@ -11,6 +11,7 @@ CONFIG_NAMESPACES=y @@ -6333,7 +6353,7 @@ index 3e9829775992..5044609540cc 100644 CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig -index 502c87f351c8..748c809d1c4c 100644 +index 502c87f351c87..748c809d1c4c6 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y @@ -6345,7 +6365,7 @@ index 502c87f351c8..748c809d1c4c 100644 CONFIG_EMBEDDED=y 
CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_defconfig b/arch/arc/configs/nsimosci_hs_defconfig -index f721cc3997d0..205c32b0074c 100644 +index f721cc3997d02..205c32b0074ca 100644 --- a/arch/arc/configs/nsimosci_hs_defconfig +++ b/arch/arc/configs/nsimosci_hs_defconfig @@ -10,6 +10,7 @@ CONFIG_NAMESPACES=y @@ -6357,7 +6377,7 @@ index f721cc3997d0..205c32b0074c 100644 CONFIG_EMBEDDED=y CONFIG_PERF_EVENTS=y diff --git a/arch/arc/configs/nsimosci_hs_smp_defconfig b/arch/arc/configs/nsimosci_hs_smp_defconfig -index 1419fc946a08..2477b7c80977 100644 +index 1419fc946a083..2477b7c809771 100644 --- a/arch/arc/configs/nsimosci_hs_smp_defconfig +++ b/arch/arc/configs/nsimosci_hs_smp_defconfig @@ -8,6 +8,7 @@ CONFIG_IKCONFIG_PROC=y @@ -6369,7 +6389,7 @@ index 1419fc946a08..2477b7c80977 100644 # CONFIG_COMPAT_BRK is not set CONFIG_KPROBES=y diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig -index 941bbadd6bf2..e61132ba4f89 100644 +index 941bbadd6bf2c..e61132ba4f890 100644 --- a/arch/arc/configs/tb10x_defconfig +++ b/arch/arc/configs/tb10x_defconfig @@ -14,6 +14,7 @@ CONFIG_INITRAMFS_SOURCE="../tb10x-rootfs.cpio" @@ -6381,7 +6401,7 @@ index 941bbadd6bf2..e61132ba4f89 100644 # CONFIG_AIO is not set CONFIG_EMBEDDED=y diff --git a/arch/arc/configs/vdk_hs38_defconfig b/arch/arc/configs/vdk_hs38_defconfig -index d3ef189c75f8..922b1b24f518 100644 +index d3ef189c75f8b..922b1b24f5184 100644 --- a/arch/arc/configs/vdk_hs38_defconfig +++ b/arch/arc/configs/vdk_hs38_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y @@ -6393,7 +6413,7 @@ index d3ef189c75f8..922b1b24f518 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/arc/configs/vdk_hs38_smp_defconfig b/arch/arc/configs/vdk_hs38_smp_defconfig -index 944b347025fd..ed64319f7eb2 100644 +index 944b347025fd1..ed64319f7eb29 100644 --- a/arch/arc/configs/vdk_hs38_smp_defconfig +++ b/arch/arc/configs/vdk_hs38_smp_defconfig @@ -4,6 +4,7 @@ CONFIG_HIGH_RES_TIMERS=y @@ -6405,7 +6425,7 @@ index 944b347025fd..ed64319f7eb2 100644 CONFIG_PERF_EVENTS=y # CONFIG_VM_EVENT_COUNTERS is not set diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu -index 00468adf180f..46cc91cb622f 100644 +index 00468adf180f1..46cc91cb622fc 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -157,7 +157,7 @@ config MPENTIUM4 @@ -6938,7 +6958,7 @@ index 00468adf180f..46cc91cb622f 100644 config IA32_FEAT_CTL def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index fdc2e3abd615..63845db8bf8a 100644 +index fdc2e3abd6152..63845db8bf8a5 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -67,7 +67,7 @@ export BITS @@ -7002,7 +7022,7 @@ index fdc2e3abd615..63845db8bf8a 100644 KBUILD_CFLAGS += $(cflags-y) diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h -index b40c462b4af3..c4e66e60d559 100644 +index b40c462b4af36..c4e66e60d559d 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -27,6 +27,7 @@ struct pci_sysdata { @@ -7026,7 +7046,7 @@ index b40c462b4af3..c4e66e60d559 100644 already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h -index 75884d2cdec3..02c1386eb653 100644 +index 75884d2cdec37..02c1386eb653e 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -17,6 +17,54 @@ @@ -7118,7 +7138,7 @@ index 75884d2cdec3..02c1386eb653 100644 #define MODULE_PROC_FAMILY "ELAN " #elif 
defined CONFIG_MCRUSOE diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c -index ddb798603201..7c20387d8202 100644 +index ddb798603201e..7c20387d82029 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void) @@ -7140,7 +7160,7 @@ index ddb798603201..7c20387d8202 100644 } -#endif diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 3cce6de464a7..9176bc4f07da 100644 +index 3cce6de464a7b..9176bc4f07daa 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7627,6 +7627,7 @@ MODULE_ALIAS("bfq-iosched"); @@ -7164,7 +7184,7 @@ index 3cce6de464a7..9176bc4f07da 100644 slab_kill: diff --git a/drivers/Makefile b/drivers/Makefile -index 7241d80a7b29..ac0ca3498f43 100644 +index 7241d80a7b293..ac0ca3498f43e 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -64,15 +64,8 @@ obj-y += char/ @@ -7199,10 +7219,10 @@ index 7241d80a7b29..ac0ca3498f43 100644 obj-$(CONFIG_MTD) += mtd/ obj-$(CONFIG_SPI) += spi/ diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c -index addba109406b..f819ee132ffa 100644 +index 7907b09fc27eb..a31562d18506d 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c -@@ -1522,7 +1522,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) +@@ -1524,7 +1524,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) } #endif @@ -7211,7 +7231,7 @@ index addba109406b..f819ee132ffa 100644 struct ahci_host_priv *hpriv) { int i; -@@ -1535,7 +1535,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, +@@ -1537,7 +1537,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, pci_resource_len(pdev, bar) < SZ_512K || bar != AHCI_PCI_BAR_STANDARD || !(readl(hpriv->mmio + AHCI_VSCAP) & 1)) @@ -7220,7 +7240,7 @@ index addba109406b..f819ee132ffa 100644 cap = readq(hpriv->mmio + AHCI_REMAP_CAP); for (i = 0; i < AHCI_MAX_REMAP; i++) { -@@ -1550,18 +1550,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, +@@ -1552,18 +1552,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, } if (!hpriv->remapped_nvme) @@ -7243,7 +7263,7 @@ index addba109406b..f819ee132ffa 100644 } static int ahci_get_irq_vector(struct ata_host *host, int port) -@@ -1781,7 +1774,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +@@ -1783,7 +1776,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar]; /* detect remapped nvme devices */ @@ -7255,7 +7275,7 @@ index addba109406b..f819ee132ffa 100644 sysfs_add_file_to_group(&pdev->dev.kobj, &dev_attr_remapped_nvme.attr, diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 -index 438c9e75a04d..1bbfeca5f01e 100644 +index 438c9e75a04dc..1bbfeca5f01ec 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -9,7 +9,6 @@ config X86_INTEL_PSTATE @@ -7275,7 +7295,7 @@ index 438c9e75a04d..1bbfeca5f01e 100644 This driver adds a CPUFreq driver which utilizes a fine grain processor performance frequency control range instead of legacy diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig -index 9cfe8fc509d7..efc3b0c0b4ad 100644 +index 9cfe8fc509d7d..efc3b0c0b4adb 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -229,6 +229,15 @@ config I2C_CHT_WC @@ -7295,7 +7315,7 @@ index 9cfe8fc509d7..efc3b0c0b4ad 100644 tristate "Nvidia nForce2, nForce3 and nForce4" depends on PCI diff --git a/drivers/i2c/busses/Makefile 
b/drivers/i2c/busses/Makefile -index af56fe2c75c0..76be74584719 100644 +index af56fe2c75c09..76be74584719e 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o @@ -7308,10 +7328,10 @@ index af56fe2c75c0..76be74584719 100644 obj-$(CONFIG_I2C_NVIDIA_GPU) += i2c-nvidia-gpu.o diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c new file mode 100644 -index 000000000000..0462f0952043 +index 0000000000000..e919d1e10c515 --- /dev/null +++ b/drivers/i2c/busses/i2c-nct6775.c -@@ -0,0 +1,647 @@ +@@ -0,0 +1,648 @@ +/* + * i2c-nct6775 - Driver for the SMBus master functionality of + * Nuvoton NCT677x Super-I/O chips @@ -7533,6 +7553,7 @@ index 000000000000..0462f0952043 + break; + case I2C_SMBUS_BYTE_DATA: + tmp_data.byte = data->byte; ++ fallthrough; + case I2C_SMBUS_BYTE: + outb_p((addr << 1) | read_write, + SMBHSTADD); @@ -7960,7 +7981,7 @@ index 000000000000..0462f0952043 +module_init(i2c_nct6775_init); +module_exit(i2c_nct6775_exit); diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c -index 809fbd014cd6..d54b35b147ee 100644 +index 809fbd014cd68..d54b35b147ee9 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -568,11 +568,11 @@ static int piix4_transaction(struct i2c_adapter *piix4_adapter) @@ -7978,7 +7999,7 @@ index 809fbd014cd6..d54b35b147ee 100644 /* If the SMBus is still busy, we give up */ if (timeout == MAX_TIMEOUT) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c -index 1dc6227d353e..bab1009ccef7 100644 +index 1dc6227d353ec..bab1009ccef79 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -3240,6 +3240,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) @@ -7994,7 +8015,7 @@ index 1dc6227d353e..bab1009ccef7 100644 if (ret < 0) goto bad; diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile -index 37c8663de7fe..897d19f92ede 100644 +index 37c8663de7fe1..897d19f92edeb 100644 --- a/drivers/pci/controller/Makefile +++ b/drivers/pci/controller/Makefile @@ -1,4 +1,10 @@ @@ -8010,7 +8031,7 @@ index 37c8663de7fe..897d19f92ede 100644 obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c new file mode 100644 -index 000000000000..e105e6f5cc91 +index 0000000000000..e105e6f5cc91d --- /dev/null +++ b/drivers/pci/controller/intel-nvme-remap.c @@ -0,0 +1,462 @@ @@ -8477,7 +8498,7 @@ index 000000000000..e105e6f5cc91 +MODULE_AUTHOR("Daniel Drake "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 321156ca273d..5dda26c737e2 100644 +index 321156ca273d5..5dda26c737e2c 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3718,6 +3718,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) @@ -8596,7 +8617,7 @@ index 321156ca273d..5dda26c737e2 100644 }; diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig -index 49c2c4cd8d00..956f4eff85b5 100644 +index 49c2c4cd8d000..956f4eff85b5b 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -643,6 +643,16 @@ config THINKPAD_LMI @@ -8638,17 +8659,17 @@ index 49c2c4cd8d00..956f4eff85b5 100644 config P2SB diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile -index 52dfdf574ac2..d32b6d87219f 100644 +index 52dfdf574ac2d..71f65ef04f9e1 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile -@@ -66,6 
+66,7 @@ obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o +@@ -65,6 +65,7 @@ obj-$(CONFIG_LENOVO_YMC) += lenovo-ymc.o + obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o obj-$(CONFIG_THINKPAD_LMI) += think-lmi.o - obj-$(CONFIG_YOGABOOK) += lenovo-yogabook.o +obj-$(CONFIG_LEGION_LAPTOP) += legion-laptop.o + obj-$(CONFIG_YOGABOOK) += lenovo-yogabook.o # Intel - obj-y += intel/ @@ -135,3 +136,6 @@ obj-$(CONFIG_SIEMENS_SIMATIC_IPC) += simatic-ipc.o # Winmate @@ -8658,10 +8679,10 @@ index 52dfdf574ac2..d32b6d87219f 100644 +obj-$(CONFIG_STEAMDECK) += steamdeck.o diff --git a/drivers/platform/x86/legion-laptop.c b/drivers/platform/x86/legion-laptop.c new file mode 100644 -index 000000000000..d1268d239cc5 +index 0000000000000..7275105071d27 --- /dev/null +++ b/drivers/platform/x86/legion-laptop.c -@@ -0,0 +1,2783 @@ +@@ -0,0 +1,5858 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * legion-laptop.c - Extra Lenovo Legion laptop support, in @@ -8721,12 +8742,14 @@ index 000000000000..d1268d239cc5 + * and commincation method with EC via ports + * - 0x1F9F1: additional reverse engineering for complete fan curve + */ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include ++#include +#include +#include +#include @@ -8753,6 +8776,12 @@ index 000000000000..d1268d239cc5 + ec_readonly, + "Only read from embedded controller but do not write or change settings."); + ++static bool enable_platformprofile = true; ++module_param(enable_platformprofile, bool, 0440); ++MODULE_PARM_DESC( ++ enable_platformprofile, ++ "Enable the platform profile sysfs API to read and write the power mode."); ++ +#define LEGIONFEATURES \ + "fancurve powermode platformprofile platformprofilenotify minifancurve" + @@ -8762,6 +8791,8 @@ index 000000000000..d1268d239cc5 +#define LEGION_DRVR_SHORTNAME "legion" +#define LEGION_HWMON_NAME LEGION_DRVR_SHORTNAME "_hwmon" + ++struct legion_private; ++ +/* =============================== */ +/* Embedded Controller Description */ +/* =============================== */ @@ -8839,6 +8870,16 @@ index 000000000000..d1268d239cc5 + u16 EXT_GPU_TEMP_INPUT; +}; + ++enum access_method { ++ ACCESS_METHOD_NO_ACCESS = 0, ++ ACCESS_METHOD_EC = 1, ++ ACCESS_METHOD_ACPI = 2, ++ ACCESS_METHOD_WMI = 3, ++ ACCESS_METHOD_WMI2 = 4, ++ ACCESS_METHOD_WMI3 = 5, ++ ACCESS_METHOD_EC2 = 10, // ideapad fancurve method ++}; ++ +struct model_config { + const struct ec_register_offsets *registers; + bool check_embedded_controller_id; @@ -8850,10 +8891,24 @@ index 000000000000..d1268d239cc5 + + // TODO: maybe use bitfield + bool has_minifancurve; ++ bool has_custom_powermode; ++ enum access_method access_method_powermode; ++ ++ enum access_method access_method_keyboard; ++ enum access_method access_method_temperature; ++ enum access_method access_method_fanspeed; ++ enum access_method access_method_fancurve; ++ enum access_method access_method_fanfullspeed; ++ bool three_state_keyboard; ++ ++ bool acpi_check_dev; ++ ++ phys_addr_t ramio_physical_start; ++ size_t ramio_size; +}; + +/* =================================== */ -+/* Coinfiguration for different models */ ++/* Configuration for different models */ +/* =================================== */ + +// Idea by SmokelesssCPU (modified) @@ -8892,13 +8947,312 @@ index 000000000000..d1268d239cc5 + .EXT_WHITE_KEYBOARD_BACKLIGHT = (0x3B + 0xC400) +}; + ++static const struct ec_register_offsets ec_register_offsets_v1 = { ++ .ECHIPID1 = 0x2000, ++ .ECHIPID2 = 0x2001, ++ .ECHIPVER = 0x2002, ++ .ECDEBUG = 
0x2003, ++ .EXT_FAN_CUR_POINT = 0xC534, ++ .EXT_FAN_POINTS_SIZE = 0xC535, ++ .EXT_FAN1_BASE = 0xC540, ++ .EXT_FAN2_BASE = 0xC550, ++ .EXT_FAN_ACC_BASE = 0xC560, ++ .EXT_FAN_DEC_BASE = 0xC570, ++ .EXT_CPU_TEMP = 0xC580, ++ .EXT_CPU_TEMP_HYST = 0xC590, ++ .EXT_GPU_TEMP = 0xC5A0, ++ .EXT_GPU_TEMP_HYST = 0xC5B0, ++ .EXT_VRM_TEMP = 0xC5C0, ++ .EXT_VRM_TEMP_HYST = 0xC5D0, ++ .EXT_FAN1_RPM_LSB = 0xC5E0, ++ .EXT_FAN1_RPM_MSB = 0xC5E1, ++ .EXT_FAN2_RPM_LSB = 0xC5E2, ++ .EXT_FAN2_RPM_MSB = 0xC5E3, ++ .EXT_MINIFANCURVE_ON_COOL = 0xC536, ++ .EXT_LOCKFANCONTROLLER = 0xc4AB, ++ .EXT_CPU_TEMP_INPUT = 0xc538, ++ .EXT_GPU_TEMP_INPUT = 0xc539, ++ .EXT_IC_TEMP_INPUT = 0xC5E8, ++ .EXT_POWERMODE = 0xc41D, ++ .EXT_FAN1_TARGET_RPM = 0xc600, ++ .EXT_FAN2_TARGET_RPM = 0xc601, ++ .EXT_MAXIMUMFANSPEED = 0xBD, ++ .EXT_WHITE_KEYBOARD_BACKLIGHT = (0x3B + 0xC400) ++}; ++ ++static const struct ec_register_offsets ec_register_offsets_ideapad_v0 = { ++ .ECHIPID1 = 0x2000, ++ .ECHIPID2 = 0x2001, ++ .ECHIPVER = 0x2002, ++ .ECDEBUG = 0x2003, ++ .EXT_FAN_CUR_POINT = 0xC5a0, // not found yet ++ .EXT_FAN_POINTS_SIZE = 0xC5a0, // constant 0 ++ .EXT_FAN1_BASE = 0xC5a0, ++ .EXT_FAN2_BASE = 0xC5a8, ++ .EXT_FAN_ACC_BASE = 0xC5a0, // not found yet ++ .EXT_FAN_DEC_BASE = 0xC5a0, // not found yet ++ .EXT_CPU_TEMP = 0xC550, // and repeated after 8 bytes ++ .EXT_CPU_TEMP_HYST = 0xC590, // and repeated after 8 bytes ++ .EXT_GPU_TEMP = 0xC5C0, // and repeated after 8 bytes ++ .EXT_GPU_TEMP_HYST = 0xC5D0, // and repeated after 8 bytes ++ .EXT_VRM_TEMP = 0xC5a0, // does not exists or not found ++ .EXT_VRM_TEMP_HYST = 0xC5a0, // does not exists ot not found yet ++ .EXT_FAN1_RPM_LSB = 0xC5a0, // not found yet ++ .EXT_FAN1_RPM_MSB = 0xC5a0, // not found yet ++ .EXT_FAN2_RPM_LSB = 0xC5a0, // not found yet ++ .EXT_FAN2_RPM_MSB = 0xC5a0, // not found yet ++ .EXT_MINIFANCURVE_ON_COOL = 0xC5a0, // does not exists or not found ++ .EXT_LOCKFANCONTROLLER = 0xC5a0, // does not exists or not found ++ .EXT_CPU_TEMP_INPUT = 0xC5a0, // not found yet ++ .EXT_GPU_TEMP_INPUT = 0xC5a0, // not found yet ++ .EXT_IC_TEMP_INPUT = 0xC5a0, // not found yet ++ .EXT_POWERMODE = 0xC5a0, // not found yet ++ .EXT_FAN1_TARGET_RPM = 0xC5a0, // not found yet ++ .EXT_FAN2_TARGET_RPM = 0xC5a0, // not found yet ++ .EXT_MAXIMUMFANSPEED = 0xC5a0, // not found yet ++ .EXT_WHITE_KEYBOARD_BACKLIGHT = 0xC5a0 // not found yet ++}; ++ ++static const struct ec_register_offsets ec_register_offsets_ideapad_v1 = { ++ .ECHIPID1 = 0x2000, ++ .ECHIPID2 = 0x2001, ++ .ECHIPVER = 0x2002, ++ .ECDEBUG = 0x2003, ++ .EXT_FAN_CUR_POINT = 0xC5a0, // not found yet ++ .EXT_FAN_POINTS_SIZE = 0xC5a0, // constant 0 ++ .EXT_FAN1_BASE = 0xC5a0, ++ .EXT_FAN2_BASE = 0xC5a8, ++ .EXT_FAN_ACC_BASE = 0xC5a0, // not found yet ++ .EXT_FAN_DEC_BASE = 0xC5a0, // not found yet ++ .EXT_CPU_TEMP = 0xC550, // and repeated after 8 bytes ++ .EXT_CPU_TEMP_HYST = 0xC590, // and repeated after 8 bytes ++ .EXT_GPU_TEMP = 0xC5C0, // and repeated after 8 bytes ++ .EXT_GPU_TEMP_HYST = 0xC5D0, // and repeated after 8 bytes ++ .EXT_VRM_TEMP = 0xC5a0, // does not exists or not found ++ .EXT_VRM_TEMP_HYST = 0xC5a0, // does not exists ot not found yet ++ .EXT_FAN1_RPM_LSB = 0xC5a0, // not found yet ++ .EXT_FAN1_RPM_MSB = 0xC5a0, // not found yet ++ .EXT_FAN2_RPM_LSB = 0xC5a0, // not found yet ++ .EXT_FAN2_RPM_MSB = 0xC5a0, // not found yet ++ .EXT_MINIFANCURVE_ON_COOL = 0xC5a0, // does not exists or not found ++ .EXT_LOCKFANCONTROLLER = 0xC5a0, // does not exists or not found ++ .EXT_CPU_TEMP_INPUT = 0xC5a0, // not found yet ++ 
.EXT_GPU_TEMP_INPUT = 0xC5a0, // not found yet ++ .EXT_IC_TEMP_INPUT = 0xC5a0, // not found yet ++ .EXT_POWERMODE = 0xC5a0, // not found yet ++ .EXT_FAN1_TARGET_RPM = 0xC5a0, // not found yet ++ .EXT_FAN2_TARGET_RPM = 0xC5a0, // not found yet ++ .EXT_MAXIMUMFANSPEED = 0xC5a0, // not found yet ++ .EXT_WHITE_KEYBOARD_BACKLIGHT = 0xC5a0 // not found yet ++}; ++ +static const struct model_config model_v0 = { + .registers = &ec_register_offsets_v0, + .check_embedded_controller_id = true, + .embedded_controller_id = 0x8227, + .memoryio_physical_ec_start = 0xC400, + .memoryio_size = 0x300, -+ .has_minifancurve = true ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_EC, ++ .access_method_temperature = ACCESS_METHOD_EC, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE00D400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_j2cn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_EC, ++ .access_method_temperature = ACCESS_METHOD_EC, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE00D400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_9vcn = { ++ .registers = &ec_register_offsets_ideapad_v1, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8226, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_WMI, ++ .access_method_temperature = ACCESS_METHOD_WMI, ++ .access_method_fancurve = ACCESS_METHOD_EC2, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = false, ++ .ramio_physical_start = 0xFE00D400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_v2022 = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_EC, ++ .access_method_temperature = ACCESS_METHOD_EC, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE00D400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_4gcn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8226, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ 
.access_method_fanspeed = ACCESS_METHOD_EC, ++ .access_method_temperature = ACCESS_METHOD_EC, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE00D400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_bvcn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = false, ++ .embedded_controller_id = 0x8226, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_WMI, ++ .access_method_temperature = ACCESS_METHOD_WMI, ++ .access_method_fancurve = ACCESS_METHOD_NO_ACCESS, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = false, ++ .ramio_physical_start = 0xFC7E0800, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_bhcn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8226, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true, ++ .has_custom_powermode = false, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_ACPI, ++ .access_method_fanspeed = ACCESS_METHOD_WMI, ++ .access_method_temperature = ACCESS_METHOD_ACPI, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFF00D400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_kwcn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x5507, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = false, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_WMI3, ++ .access_method_temperature = ACCESS_METHOD_WMI3, ++ .access_method_fancurve = ACCESS_METHOD_WMI3, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE0B0400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_m2cn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = false, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_WMI3, ++ .access_method_temperature = ACCESS_METHOD_WMI3, ++ .access_method_fancurve = ACCESS_METHOD_WMI3, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = false, ++ .ramio_physical_start = 0xFE0B0400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_k1cn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x5263, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = false, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_WMI3, ++ .access_method_temperature = 
ACCESS_METHOD_WMI3, ++ .access_method_fancurve = ACCESS_METHOD_WMI3, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE0B0400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_lpcn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x5507, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = false, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_WMI3, ++ .access_method_temperature = ACCESS_METHOD_WMI3, ++ .access_method_fancurve = ACCESS_METHOD_WMI3, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE0B0400, ++ .ramio_size = 0x600 +}; + +static const struct model_config model_kfcn = { @@ -8907,7 +9261,17 @@ index 000000000000..d1268d239cc5 + .embedded_controller_id = 0x8227, + .memoryio_physical_ec_start = 0xC400, + .memoryio_size = 0x300, -+ .has_minifancurve = false ++ .has_minifancurve = false, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_EC, ++ .access_method_temperature = ACCESS_METHOD_EC, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE00D400, ++ .ramio_size = 0x600 +}; + +static const struct model_config model_hacn = { @@ -8916,19 +9280,221 @@ index 000000000000..d1268d239cc5 + .embedded_controller_id = 0x8227, + .memoryio_physical_ec_start = 0xC400, + .memoryio_size = 0x300, -+ .has_minifancurve = false ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_EC, ++ .access_method_temperature = ACCESS_METHOD_EC, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE00D400, ++ .ramio_size = 0x600 +}; + -+ +static const struct model_config model_k9cn = { + .registers = &ec_register_offsets_v0, + .check_embedded_controller_id = false, + .embedded_controller_id = 0x8227, + .memoryio_physical_ec_start = 0xC400, // or replace 0xC400 by 0x0400 ? 
+ .memoryio_size = 0x300, -+ .has_minifancurve = false ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_EC, ++ .access_method_temperature = ACCESS_METHOD_EC, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE00D400, ++ .ramio_size = 0x600 +}; + ++static const struct model_config model_eucn = { ++ .registers = &ec_register_offsets_v1, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_EC, ++ .access_method_temperature = ACCESS_METHOD_EC, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE00D400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_fccn = { ++ .registers = &ec_register_offsets_ideapad_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = false, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_WMI, ++ .access_method_temperature = ACCESS_METHOD_ACPI, ++ .access_method_fancurve = ACCESS_METHOD_EC2, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE00D400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_h3cn = { ++ //0xFE0B0800 ++ .registers = &ec_register_offsets_v1, ++ .check_embedded_controller_id = false, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = false, ++ .has_custom_powermode = false, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ // not implemented (properly) in WMI, RGB conrolled by USB ++ .access_method_keyboard = ACCESS_METHOD_NO_ACCESS, ++ // accessing fan speed is not implemented in ACPI ++ // a variable in the operation region (or not found) ++ // and not per WMI (methods returns constant 0) ++ .access_method_fanspeed = ACCESS_METHOD_NO_ACCESS, ++ .access_method_temperature = ACCESS_METHOD_WMI, ++ .access_method_fancurve = ACCESS_METHOD_NO_ACCESS, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = false, ++ .ramio_physical_start = 0xFE0B0800, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_e9cn = { ++ //0xFE0B0800 ++ .registers = &ec_register_offsets_v1, ++ .check_embedded_controller_id = false, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, //0xFC7E0800 ++ .memoryio_size = 0x300, ++ .has_minifancurve = false, ++ .has_custom_powermode = false, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ // not implemented (properly) in WMI, RGB conrolled by USB ++ .access_method_keyboard = ACCESS_METHOD_NO_ACCESS, ++ // accessing fan speed is not implemented in ACPI ++ // a variable in the operation region (or not found) ++ // and not per WMI (methods returns constant 0) ++ .access_method_fanspeed = ACCESS_METHOD_WMI, ++ 
.access_method_temperature = ACCESS_METHOD_WMI, ++ .access_method_fancurve = ACCESS_METHOD_NO_ACCESS, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = false, ++ .ramio_physical_start = 0xFC7E0800, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_8jcn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8226, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_WMI, ++ .access_method_temperature = ACCESS_METHOD_WMI, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = false, ++ .ramio_physical_start = 0xFE00D400, ++ .ramio_size = 0x600 ++}; ++ ++static const struct model_config model_jncn = { ++ .registers = &ec_register_offsets_v1, ++ .check_embedded_controller_id = false, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = false, ++ .has_custom_powermode = false, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_NO_ACCESS, ++ .access_method_fanspeed = ACCESS_METHOD_WMI, ++ .access_method_temperature = ACCESS_METHOD_WMI, ++ .access_method_fancurve = ACCESS_METHOD_NO_ACCESS, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = false, ++ .ramio_physical_start = 0xFC7E0800, ++ .ramio_size = 0x600 ++}; ++ ++// Yoga Model! ++static const struct model_config model_j1cn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_EC, ++ .access_method_temperature = ACCESS_METHOD_EC, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE0B0400, ++ .ramio_size = 0x600 ++}; ++ ++// Yoga Model! ++static const struct model_config model_dmcn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_WMI, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_EC, ++ .access_method_temperature = ACCESS_METHOD_EC, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = true, ++ .ramio_physical_start = 0xFE700D00, ++ .ramio_size = 0x600 ++}; ++ ++// Yoga Model! 
++static const struct model_config model_khcn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = false, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true, ++ .has_custom_powermode = true, ++ .access_method_powermode = ACCESS_METHOD_EC, ++ .access_method_keyboard = ACCESS_METHOD_WMI, ++ .access_method_fanspeed = ACCESS_METHOD_EC, ++ .access_method_temperature = ACCESS_METHOD_EC, ++ .access_method_fancurve = ACCESS_METHOD_EC, ++ .access_method_fanfullspeed = ACCESS_METHOD_WMI, ++ .acpi_check_dev = false, ++ .ramio_physical_start = 0xFE0B0400, ++ .ramio_size = 0x600 ++}; + + +static const struct dmi_system_id denylist[] = { {} }; @@ -8953,7 +9519,7 @@ index 000000000000..d1268d239cc5 + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_BIOS_VERSION, "EUCN"), + }, -+ .driver_data = (void *)&model_v0 ++ .driver_data = (void *)&model_eucn + }, + { + // modelyear: 2020 @@ -9045,11 +9611,182 @@ index 000000000000..d1268d239cc5 + }, + .driver_data = (void *)&model_k9cn + }, ++ { ++ // e.g. IdeaPad Gaming 3 15ARH05 ++ .ident = "FCCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "FCCN"), ++ }, ++ .driver_data = (void *)&model_fccn ++ }, ++ { ++ // e.g. Ideapad Gaming 3 15ACH6 ++ .ident = "H3CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "H3CN"), ++ }, ++ .driver_data = (void *)&model_h3cn ++ }, ++ { ++ // e.g. IdeaPad Gaming 3 15ARH7 (2022) ++ .ident = "JNCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "JNCN"), ++ }, ++ .driver_data = (void *)&model_jncn ++ }, ++ { ++ // 2020, seems very different in ACPI dissassembly ++ .ident = "E9CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "E9CN"), ++ }, ++ .driver_data = (void *)&model_e9cn ++ }, ++ { ++ // e.g. Legion Y7000 (older version) ++ .ident = "8JCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "8JCN"), ++ }, ++ .driver_data = (void *)&model_8jcn ++ }, ++ { ++ // e.g. Legion 7i Pro 2023 ++ .ident = "KWCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "KWCN"), ++ }, ++ .driver_data = (void *)&model_kwcn ++ }, ++ { ++ // e.g. Legion Pro 5 2023 or R9000P ++ .ident = "LPCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "LPCN"), ++ }, ++ .driver_data = (void *)&model_lpcn ++ }, ++ { ++ // e.g. Lenovo Legion 5i/Y7000 2019 PG0 ++ .ident = "BHCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "BHCN"), ++ }, ++ .driver_data = (void *)&model_bhcn ++ }, ++ { ++ // e.g. Lenovo 7 16IAX7 ++ .ident = "K1CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "K1CN"), ++ }, ++ .driver_data = (void *)&model_k1cn ++ }, ++ { ++ // e.g. Legion Y720 ++ .ident = "4GCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "4GCN"), ++ }, ++ .driver_data = (void *)&model_4gcn ++ }, ++ { ++ // e.g. Legion Slim 5 16APH8 2023 ++ .ident = "M3CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "M3CN"), ++ }, ++ .driver_data = (void *)&model_lpcn ++ }, ++ { ++ // e.g. 
Legion Y7000p-1060 ++ .ident = "9VCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "9VCN"), ++ }, ++ .driver_data = (void *)&model_9vcn ++ }, ++ { ++ // e.g. Legion Y9000X ++ .ident = "JYCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "JYCN"), ++ }, ++ .driver_data = (void *)&model_v2022 ++ }, ++ { ++ // e.g. Legion Y740-15IRH, older model e.g. with GTX 1660 ++ .ident = "BVCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "BVCN"), ++ }, ++ .driver_data = (void *)&model_bvcn ++ }, ++ { ++ // e.g. Legion 5 Pro 16IAH7H with a RTX 3070 Ti ++ .ident = "J2CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "J2CN"), ++ }, ++ .driver_data = (void *)&model_j2cn ++ }, ++ { ++ // e.g. Lenovo Yoga 7 16IAH7 with GPU Intel DG2 Arc A370M ++ .ident = "J1CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "J1CN"), ++ }, ++ .driver_data = (void *)&model_j1cn ++ }, ++ { ++ // e.g. Legion Slim 5 16IRH8 (2023) with RTX 4070 ++ .ident = "M2CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "M2CN"), ++ }, ++ .driver_data = (void *)&model_m2cn ++ }, ++ { ++ // e.g. Yoga Slim 7-14ARE05 ++ .ident = "DMCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "DMCN"), ++ }, ++ .driver_data = (void *)&model_dmcn ++ }, ++ { ++ // e.g. Yoga Slim 7 Pro 14ARH7 ++ .ident = "KHCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "KHCN"), ++ }, ++ .driver_data = (void *)&model_khcn ++ }, + {} +}; + +/* ================================= */ -+/* ACPI access */ ++/* ACPI and WMI access */ +/* ================================= */ + +// function from ideapad-laptop.c @@ -9081,13 +9818,451 @@ index 000000000000..d1268d239cc5 +static int exec_sbmc(acpi_handle handle, unsigned long arg) +{ + // \_SB.PCI0.LPC0.EC0.VPC0.SBMC -+ return exec_simple_method(handle, "SBMC", arg); ++ return exec_simple_method(handle, "VPC0.SBMC", arg); +} + -+static int eval_qcho(acpi_handle handle, unsigned long *res) ++//static int eval_qcho(acpi_handle handle, unsigned long *res) ++//{ ++// // \_SB.PCI0.LPC0.EC0.QCHO ++// return eval_int(handle, "QCHO", res); ++//} ++ ++static int eval_gbmd(acpi_handle handle, unsigned long *res) ++{ ++ return eval_int(handle, "VPC0.GBMD", res); ++} ++ ++static int eval_spmo(acpi_handle handle, unsigned long *res) +{ + // \_SB.PCI0.LPC0.EC0.QCHO -+ return eval_int(handle, "QCHO", res); ++ return eval_int(handle, "VPC0.BTSM", res); ++} ++ ++static int acpi_process_buffer_to_ints(const char *id_name, int id_nr, ++ acpi_status status, ++ struct acpi_buffer *out_buffer, u8 *res, ++ size_t ressize) ++{ ++ // seto to NULL call kfree on NULL if next function call fails ++ union acpi_object *out = NULL; ++ size_t i; ++ int error = 0; ++ ++ if (ACPI_FAILURE(status)) { ++ pr_info("ACPI evaluation error for: %s:%d\n", id_name, id_nr); ++ error = -EFAULT; ++ goto err; ++ } ++ ++ out = out_buffer->pointer; ++ if (!out) { ++ pr_info("Unexpected ACPI result for %s:%d\n", id_name, id_nr); ++ error = -AE_ERROR; ++ goto err; ++ } ++ ++ if (out->type != ACPI_TYPE_BUFFER || out->buffer.length != ressize) { ++ pr_info("Unexpected ACPI result for %s:%d: expected type %d but got %d; expected length %lu but got %u;\n", ++ id_name, id_nr, ACPI_TYPE_BUFFER, out->type, ressize, ++ out->buffer.length); ++ error = -AE_ERROR; ++ 
goto err; ++ } ++ pr_info("ACPI result for %s:%d: ACPI buffer length: %u\n", id_name, ++ id_nr, out->buffer.length); ++ ++ for (i = 0; i < ressize; ++i) ++ res[i] = out->buffer.pointer[i]; ++ error = 0; ++ ++err: ++ kfree(out); ++ return error; ++} ++ ++//static int exec_ints(acpi_handle handle, const char *method_name, ++// struct acpi_object_list *params, u8 *res, size_t ressize) ++//{ ++// acpi_status status; ++// struct acpi_buffer out_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; ++ ++// status = acpi_evaluate_object(handle, (acpi_string)method_name, params, ++// &out_buffer); ++ ++// return acpi_process_buffer_to_ints(method_name, 0, status, &out_buffer, ++// res, ressize); ++//} ++ ++static int wmi_exec_ints(const char *guid, u8 instance, u32 method_id, ++ const struct acpi_buffer *params, u8 *res, ++ size_t ressize) ++{ ++ acpi_status status; ++ struct acpi_buffer out_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; ++ ++ status = wmi_evaluate_method(guid, instance, method_id, params, ++ &out_buffer); ++ return acpi_process_buffer_to_ints(guid, method_id, status, &out_buffer, ++ res, ressize); ++} ++ ++static int wmi_exec_int(const char *guid, u8 instance, u32 method_id, ++ const struct acpi_buffer *params, unsigned long *res) ++{ ++ acpi_status status; ++ struct acpi_buffer out_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; ++ // seto to NULL call kfree on NULL if next function call fails ++ union acpi_object *out = NULL; ++ int error = 0; ++ ++ status = wmi_evaluate_method(guid, instance, method_id, params, ++ &out_buffer); ++ ++ if (ACPI_FAILURE(status)) { ++ pr_info("WMI evaluation error for: %s:%d\n", guid, method_id); ++ error = -EFAULT; ++ goto err; ++ } ++ ++ out = out_buffer.pointer; ++ if (!out) { ++ pr_info("Unexpected ACPI result for %s:%d", guid, method_id); ++ error = -AE_ERROR; ++ goto err; ++ } ++ ++ if (out->type != ACPI_TYPE_INTEGER) { ++ pr_info("Unexpected ACPI result for %s:%d: expected type %d but got %d\n", ++ guid, method_id, ACPI_TYPE_INTEGER, out->type); ++ error = -AE_ERROR; ++ goto err; ++ } ++ ++ *res = out->integer.value; ++ error = 0; ++ ++err: ++ kfree(out); ++ return error; ++} ++ ++static int wmi_exec_noarg_int(const char *guid, u8 instance, u32 method_id, ++ unsigned long *res) ++{ ++ struct acpi_buffer params; ++ ++ params.length = 0; ++ params.pointer = NULL; ++ return wmi_exec_int(guid, instance, method_id, ¶ms, res); ++} ++ ++static int wmi_exec_noarg_ints(const char *guid, u8 instance, u32 method_id, ++ u8 *res, size_t ressize) ++{ ++ struct acpi_buffer params; ++ ++ params.length = 0; ++ params.pointer = NULL; ++ return wmi_exec_ints(guid, instance, method_id, ¶ms, res, ressize); ++} ++ ++static int wmi_exec_arg(const char *guid, u8 instance, u32 method_id, void *arg, ++ size_t arg_size) ++{ ++ struct acpi_buffer params; ++ acpi_status status; ++ ++ params.length = arg_size; ++ params.pointer = arg; ++ status = wmi_evaluate_method(guid, instance, method_id, ¶ms, NULL); ++ ++ if (ACPI_FAILURE(status)) ++ return -EIO; ++ return 0; ++} ++ ++/* ================================= */ ++/* Lenovo WMI config */ ++/* ================================= */ ++#define LEGION_WMI_GAMEZONE_GUID "887B54E3-DDDC-4B2C-8B88-68A26A8835D0" ++// GPU over clock ++#define WMI_METHOD_ID_ISSUPPORTGPUOC 4 ++ ++//Fan speed ++// only completely implemented only for some models here ++// often implemted also in other class and other method ++// below ++#define WMI_METHOD_ID_GETFAN1SPEED 8 ++#define WMI_METHOD_ID_GETFAN2SPEED 9 ++ ++// Version of ACPI ++#define WMI_METHOD_ID_GETVERSION 11 ++// 
Does it support CPU overclock? ++#define WMI_METHOD_ID_ISSUPPORTCPUOC 14 ++// Temperatures ++// only completely implemented only for some models here ++// often implemted also in other class and other method ++// below ++#define WMI_METHOD_ID_GETCPUTEMP 18 ++#define WMI_METHOD_ID_GETGPUTEMP 19 ++ ++// two state keyboard light ++#define WMI_METHOD_ID_GETKEYBOARDLIGHT 37 ++#define WMI_METHOD_ID_SETKEYBOARDLIGHT 36 ++// disable win key ++// 0 = win key enabled; 1 = win key disabled ++#define WMI_METHOD_ID_ISSUPPORTDISABLEWINKEY 21 ++#define WMI_METHOD_ID_GETWINKEYSTATUS 23 ++#define WMI_METHOD_ID_SETWINKEYSTATUS 22 ++// disable touchpad ++//0 = touchpad enabled; 1 = touchpad disabled ++#define WMI_METHOD_ID_ISSUPPORTDISABLETP 24 ++#define WMI_METHOD_ID_GETTPSTATUS 26 ++#define WMI_METHOD_ID_SETTPSTATUS 25 ++// gSync ++#define WMI_METHOD_ID_ISSUPPORTGSYNC 40 ++#define WMI_METHOD_ID_GETGSYNCSTATUS 41 ++#define WMI_METHOD_ID_SETGSYNCSTATUS 42 ++//smartFanMode = powermode ++#define WMI_METHOD_ID_ISSUPPORTSMARTFAN 49 ++#define WMI_METHOD_ID_GETSMARTFANMODE 45 ++#define WMI_METHOD_ID_SETSMARTFANMODE 44 ++// power charge mode ++#define WMI_METHOD_ID_GETPOWERCHARGEMODE 47 ++// overdrive of display to reduce latency ++// 0=off, 1=on ++#define WMI_METHOD_ID_ISSUPPORTOD 49 ++#define WMI_METHOD_ID_GETODSTATUS 50 ++#define WMI_METHOD_ID_SETODSTATUS 51 ++// thermal mode = power mode used for cooling ++#define WMI_METHOD_ID_GETTHERMALMODE 55 ++// get max frequency of core 0 ++#define WMI_METHOD_ID_GETCPUMAXFREQUENCY 60 ++// check if AC adapter has enough power to overclock ++#define WMI_METHOD_ID_ISACFITFOROC 62 ++// set iGPU (GPU packaged with CPU) state ++#define WMI_METHOD_ID_ISSUPPORTIGPUMODE 63 ++#define WMI_METHOD_ID_GETIGPUMODESTATUS 64 ++#define WMI_METHOD_ID_SETIGPUMODESTATUS 65 ++#define WMI_METHOD_ID_NOTIFYDGPUSTATUS 66 ++enum IGPUState { ++ IGPUState_default = 0, ++ IGPUState_iGPUOnly = 1, ++ IGPUState_auto = 2 ++}; ++ ++#define WMI_GUID_LENOVO_CPU_METHOD "14afd777-106f-4c9b-b334-d388dc7809be" ++#define WMI_METHOD_ID_CPU_GET_SUPPORT_OC_STATUS 15 ++#define WMI_METHOD_ID_CPU_GET_OC_STATUS 1 ++#define WMI_METHOD_ID_CPU_SET_OC_STATUS 2 ++ ++// ppt limit slow ++#define WMI_METHOD_ID_CPU_GET_SHORTTERM_POWERLIMIT 3 ++#define WMI_METHOD_ID_CPU_SET_SHORTTERM_POWERLIMIT 4 ++// ppt stapm ++#define WMI_METHOD_ID_CPU_GET_LONGTERM_POWERLIMIT 5 ++#define WMI_METHOD_ID_CPU_SET_LONGTERM_POWERLIMIT 6 ++// default power limit ++#define WMI_METHOD_ID_CPU_GET_DEFAULT_POWERLIMIT 7 ++// peak power limit ++#define WMI_METHOD_ID_CPU_GET_PEAK_POWERLIMIT 8 ++#define WMI_METHOD_ID_CPU_SET_PEAK_POWERLIMIT 9 ++// apu sppt powerlimit ++#define WMI_METHOD_ID_CPU_GET_APU_SPPT_POWERLIMIT 12 ++#define WMI_METHOD_ID_CPU_SET_APU_SPPT_POWERLIMIT 13 ++// cross loading powerlimit ++#define WMI_METHOD_ID_CPU_GET_CROSS_LOADING_POWERLIMIT 16 ++#define WMI_METHOD_ID_CPU_SET_CROSS_LOADING_POWERLIMIT 17 ++ ++#define WMI_GUID_LENOVO_GPU_METHOD "da7547f1-824d-405f-be79-d9903e29ced7" ++// overclock GPU possible ++#define WMI_METHOD_ID_GPU_GET_OC_STATUS 1 ++#define WMI_METHOD_ID_GPU_SET_OC_STATUS 2 ++// dynamic boost power ++#define WMI_METHOD_ID_GPU_GET_PPAB_POWERLIMIT 3 ++#define WMI_METHOD_ID_GPU_SET_PPAB_POWERLIMIT 4 ++// configurable TGP (power) ++#define WMI_METHOD_ID_GPU_GET_CTGP_POWERLIMIT 5 ++#define WMI_METHOD_ID_GPU_SET_CTGP_POWERLIMIT 6 ++// ppab/ctgp powerlimit ++#define WMI_METHOD_ID_GPU_GET_DEFAULT_PPAB_CTGP_POWERLIMIT 7 ++// temperature limit ++#define WMI_METHOD_ID_GPU_GET_TEMPERATURE_LIMIT 8 ++#define 
WMI_METHOD_ID_GPU_SET_TEMPERATURE_LIMIT 9 ++// boost clock ++#define WMI_METHOD_ID_GPU_GET_BOOST_CLOCK 10 ++ ++#define WMI_GUID_LENOVO_FAN_METHOD "92549549-4bde-4f06-ac04-ce8bf898dbaa" ++// set fan to maximal speed; dust cleaning mode ++// only works in custom power mode ++#define WMI_METHOD_ID_FAN_GET_FULLSPEED 1 ++#define WMI_METHOD_ID_FAN_SET_FULLSPEED 2 ++// max speed of fan ++#define WMI_METHOD_ID_FAN_GET_MAXSPEED 3 ++#define WMI_METHOD_ID_FAN_SET_MAXSPEED 4 ++// fan table in custom mode ++#define WMI_METHOD_ID_FAN_GET_TABLE 5 ++#define WMI_METHOD_ID_FAN_SET_TABLE 6 ++// get speed of fans ++#define WMI_METHOD_ID_FAN_GETCURRENTFANSPEED 7 ++// get temperatures of CPU and GPU used for controlling cooling ++#define WMI_METHOD_ID_FAN_GETCURRENTSENSORTEMPERATURE 8 ++ ++// do not implement following ++// #define WMI_METHOD_ID_Fan_SetCurrentFanSpeed 9 ++ ++#define LEGION_WMI_KBBACKLIGHT_GUID "8C5B9127-ECD4-4657-980F-851019F99CA5" ++// access the keyboard backlight with 3 states ++#define WMI_METHOD_ID_KBBACKLIGHTGET 0x1 ++#define WMI_METHOD_ID_KBBACKLIGHTSET 0x2 ++ ++// new method in newer methods to get or set most of the values ++// with the two methods GetFeatureValue or SetFeatureValue. ++// They are called like GetFeatureValue(feature_id) where ++// feature_id is a id for the feature ++#define LEGION_WMI_LENOVO_OTHER_METHOD_GUID \ ++ "dc2a8805-3a8c-41ba-a6f7-092e0089cd3b" ++#define WMI_METHOD_ID_GET_FEATURE_VALUE 17 ++#define WMI_METHOD_ID_SET_FEATURE_VALUE 18 ++ ++enum OtherMethodFeature { ++ OtherMethodFeature_U1 = 0x010000, //->PC00.LPCB.EC0.REJF ++ OtherMethodFeature_U2 = 0x0F0000, //->C00.PEG1.PXP._STA? ++ OtherMethodFeature_U3 = 0x030000, //->PC00.LPCB.EC0.FLBT? ++ OtherMethodFeature_CPU_SHORT_TERM_POWER_LIMIT = 0x01010000, ++ OtherMethodFeature_CPU_LONG_TERM_POWER_LIMIT = 0x01020000, ++ OtherMethodFeature_CPU_PEAK_POWER_LIMIT = 0x01030000, ++ OtherMethodFeature_CPU_TEMPERATURE_LIMIT = 0x01040000, ++ ++ OtherMethodFeature_APU_PPT_POWER_LIMIT = 0x01050000, ++ ++ OtherMethodFeature_CPU_CROSS_LOAD_POWER_LIMIT = 0x01060000, ++ OtherMethodFeature_CPU_L1_TAU = 0x01070000, ++ ++ OtherMethodFeature_GPU_POWER_BOOST = 0x02010000, ++ OtherMethodFeature_GPU_cTGP = 0x02020000, ++ OtherMethodFeature_GPU_TEMPERATURE_LIMIT = 0x02030000, ++ OtherMethodFeature_GPU_POWER_TARGET_ON_AC_OFFSET_FROM_BASELINE = ++ 0x02040000, ++ ++ OtherMethodFeature_FAN_SPEED_1 = 0x04030001, ++ OtherMethodFeature_FAN_SPEED_2 = 0x04030002, ++ ++ OtherMethodFeature_C_U1 = 0x05010000, ++ OtherMethodFeature_TEMP_CPU = 0x05040000, ++ OtherMethodFeature_TEMP_GPU = 0x05050000, ++}; ++ ++static ssize_t wmi_other_method_get_value(enum OtherMethodFeature feature_id, ++ int *value) ++{ ++ struct acpi_buffer params; ++ int error; ++ unsigned long res; ++ u32 param1 = feature_id; ++ ++ params.length = sizeof(param1); ++ params.pointer = ¶m1; ++ error = wmi_exec_int(LEGION_WMI_LENOVO_OTHER_METHOD_GUID, 0, ++ WMI_METHOD_ID_GET_FEATURE_VALUE, ¶ms, &res); ++ if (!error) ++ *value = res; ++ return error; ++} ++ ++/* =================================== */ ++/* EC RAM Access with memory mapped IO */ ++/* =================================== */ ++ ++struct ecram_memoryio { ++ // TODO: start of remapped memory in EC RAM is assumed to be 0 ++ // u16 ecram_start; ++ ++ // physical address of remapped IO, depends on model and firmware ++ phys_addr_t physical_start; ++ // start adress of region in ec memory ++ phys_addr_t physical_ec_start; ++ // virtual address of remapped IO ++ u8 *virtual_start; ++ // size of remapped access ++ size_t size; 
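++
++ /*
++  * Illustrative note, not additional driver logic: a byte at EC offset
++  * ec_offset is accessed at
++  *   virtual_start + (ec_offset - physical_ec_start)
++  * so ec_offset must not be below physical_ec_start; this is exactly
++  * what ecram_memoryio_read() and ecram_memoryio_write() check before
++  * dereferencing the remapped region.
++  */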
++}; ++ ++/** ++ * physical_start : corresponds to EC RAM 0 inside EC ++ * size: size of remapped region ++ * ++ * strong exception safety ++ */ ++static ssize_t ecram_memoryio_init(struct ecram_memoryio *ec_memoryio, ++ phys_addr_t physical_start, ++ phys_addr_t physical_ec_start, size_t size) ++{ ++ void *virtual_start = ioremap(physical_start, size); ++ ++ if (!IS_ERR_OR_NULL(virtual_start)) { ++ ec_memoryio->virtual_start = virtual_start; ++ ec_memoryio->physical_start = physical_start; ++ ec_memoryio->physical_ec_start = physical_ec_start; ++ ec_memoryio->size = size; ++ pr_info("Succeffuly mapped embedded controller: 0x%llx (in RAM)/0x%llx (in EC) to virtual 0x%p\n", ++ ec_memoryio->physical_start, ++ ec_memoryio->physical_ec_start, ++ ec_memoryio->virtual_start); ++ } else { ++ pr_info("Error mapping embedded controller memory at 0x%llx\n", ++ physical_start); ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++static void ecram_memoryio_exit(struct ecram_memoryio *ec_memoryio) ++{ ++ if (ec_memoryio->virtual_start != NULL) { ++ pr_info("Unmapping embedded controller memory at 0x%llx (in RAM)/0x%llx (in EC) at virtual 0x%p\n", ++ ec_memoryio->physical_start, ++ ec_memoryio->physical_ec_start, ++ ec_memoryio->virtual_start); ++ iounmap(ec_memoryio->virtual_start); ++ ec_memoryio->virtual_start = NULL; ++ } ++} ++ ++/* Read a byte from the EC RAM. ++ * ++ * Return status because of commong signature for alle ++ * methods to access EC RAM. ++ */ ++static ssize_t ecram_memoryio_read(const struct ecram_memoryio *ec_memoryio, ++ u16 ec_offset, u8 *value) ++{ ++ if (ec_offset < ec_memoryio->physical_ec_start) { ++ pr_info("Unexpected read at offset %d into EC RAM\n", ++ ec_offset); ++ return -1; ++ } ++ *value = *(ec_memoryio->virtual_start + ++ (ec_offset - ec_memoryio->physical_ec_start)); ++ return 0; ++} ++ ++/* Write a byte to the EC RAM. ++ * ++ * Return status because of commong signature for alle ++ * methods to access EC RAM. ++ */ ++ssize_t ecram_memoryio_write(const struct ecram_memoryio *ec_memoryio, ++ u16 ec_offset, u8 value) ++{ ++ if (ec_offset < ec_memoryio->physical_ec_start) { ++ pr_info("Unexpected write at offset %d into EC RAM\n", ++ ec_offset); ++ return -1; ++ } ++ *(ec_memoryio->virtual_start + ++ (ec_offset - ec_memoryio->physical_ec_start)) = value; ++ return 0; +} + +/* ================================= */ @@ -9135,7 +10310,7 @@ index 000000000000..d1268d239cc5 + struct mutex io_port_mutex; +}; + -+ssize_t ecram_portio_init(struct ecram_portio *ec_portio) ++static ssize_t ecram_portio_init(struct ecram_portio *ec_portio) +{ + if (!request_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE, + ECRAM_PORTIO_NAME)) { @@ -9148,7 +10323,7 @@ index 000000000000..d1268d239cc5 + return 0; +} + -+void ecram_portio_exit(struct ecram_portio *ec_portio) ++static void ecram_portio_exit(struct ecram_portio *ec_portio) +{ + release_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE); +} @@ -9158,7 +10333,8 @@ index 000000000000..d1268d239cc5 + * Return status because of commong signature for alle + * methods to access EC RAM. + */ -+ssize_t ecram_portio_read(struct ecram_portio *ec_portio, u16 offset, u8 *value) ++static ssize_t ecram_portio_read(struct ecram_portio *ec_portio, u16 offset, ++ u8 *value) +{ + mutex_lock(&ec_portio->io_port_mutex); + @@ -9188,7 +10364,8 @@ index 000000000000..d1268d239cc5 + * Return status because of commong signature for alle + * methods to access EC RAM. 
+ */ -+ssize_t ecram_portio_write(struct ecram_portio *ec_portio, u16 offset, u8 value) ++static ssize_t ecram_portio_write(struct ecram_portio *ec_portio, u16 offset, ++ u8 value) +{ + mutex_lock(&ec_portio->io_port_mutex); + @@ -9210,6 +10387,8 @@ index 000000000000..d1268d239cc5 + outb(value, ECRAM_PORTIO_DATA_PORT); + + mutex_unlock(&ec_portio->io_port_mutex); ++ // TODO: remove this ++ //pr_info("Writing %d to addr %x\n", value, offset); + return 0; +} + @@ -9221,8 +10400,9 @@ index 000000000000..d1268d239cc5 + struct ecram_portio portio; +}; + -+ssize_t ecram_init(struct ecram *ecram, phys_addr_t memoryio_ec_physical_start, -+ size_t region_size) ++static ssize_t ecram_init(struct ecram *ecram, ++ phys_addr_t memoryio_ec_physical_start, ++ size_t region_size) +{ + ssize_t err; + @@ -9238,14 +10418,14 @@ index 000000000000..d1268d239cc5 + return err; +} + -+void ecram_exit(struct ecram *ecram) ++static void ecram_exit(struct ecram *ecram) +{ + pr_info("Unloading legion ecram\n"); + ecram_portio_exit(&ecram->portio); + pr_info("Unloading legion ecram done\n"); +} + -+/** ++/** Read from EC RAM + * ecram_offset address on the EC + */ +static u8 ecram_read(struct ecram *ecram, u16 ecram_offset) @@ -9277,7 +10457,7 @@ index 000000000000..d1268d239cc5 +/* Reads from EC */ +/* =============================== */ + -+u16 read_ec_id(struct ecram *ecram, const struct model_config *model) ++static u16 read_ec_id(struct ecram *ecram, const struct model_config *model) +{ + u8 id1 = ecram_read(ecram, model->registers->ECHIPID1); + u8 id2 = ecram_read(ecram, model->registers->ECHIPID2); @@ -9285,7 +10465,8 @@ index 000000000000..d1268d239cc5 + return (id1 << 8) + id2; +} + -+u16 read_ec_version(struct ecram *ecram, const struct model_config *model) ++static u16 read_ec_version(struct ecram *ecram, ++ const struct model_config *model) +{ + u8 vers = ecram_read(ecram, model->registers->ECHIPVER); + u8 debug = ecram_read(ecram, model->registers->ECDEBUG); @@ -9295,7 +10476,7 @@ index 000000000000..d1268d239cc5 + +/* ============================= */ +/* Data model for sensor values */ -+/* ============================ */ ++/* ============================= */ + +struct sensor_values { + u16 fan1_rpm; // current speed in rpm of fan 1 @@ -9317,232 +10498,9 @@ index 000000000000..d1268d239cc5 + SENSOR_FAN2_TARGET_RPM_ID = 7 +}; + -+static int read_sensor_values(struct ecram *ecram, -+ const struct model_config *model, -+ struct sensor_values *values) -+{ -+ values->fan1_target_rpm = -+ 100 * ecram_read(ecram, model->registers->EXT_FAN1_TARGET_RPM); -+ values->fan2_target_rpm = -+ 100 * ecram_read(ecram, model->registers->EXT_FAN2_TARGET_RPM); -+ -+ values->fan1_rpm = -+ ecram_read(ecram, model->registers->EXT_FAN1_RPM_LSB) + -+ (((int)ecram_read(ecram, model->registers->EXT_FAN1_RPM_MSB)) -+ << 8); -+ values->fan2_rpm = -+ ecram_read(ecram, model->registers->EXT_FAN2_RPM_LSB) + -+ (((int)ecram_read(ecram, model->registers->EXT_FAN2_RPM_MSB)) -+ << 8); -+ -+ values->cpu_temp_celsius = -+ ecram_read(ecram, model->registers->EXT_CPU_TEMP_INPUT); -+ values->gpu_temp_celsius = -+ ecram_read(ecram, model->registers->EXT_GPU_TEMP_INPUT); -+ values->ic_temp_celsius = -+ ecram_read(ecram, model->registers->EXT_IC_TEMP_INPUT); -+ -+ values->cpu_temp_celsius = ecram_read(ecram, 0xC5E6); -+ values->gpu_temp_celsius = ecram_read(ecram, 0xC5E7); -+ values->ic_temp_celsius = ecram_read(ecram, 0xC5E8); -+ -+ return 0; -+} -+ -+/* =============================== */ -+/* Behaviour changing functions */ -+/* 
=============================== */ -+ -+int read_powermode(struct ecram *ecram, const struct model_config *model) -+{ -+ return ecram_read(ecram, model->registers->EXT_POWERMODE); -+} -+ -+ssize_t write_powermode(struct ecram *ecram, const struct model_config *model, -+ u8 value) -+{ -+ if (!(value >= 0 && value <= 2)) { -+ pr_info("Unexpected power mode value ignored: %d\n", value); -+ return -ENOMEM; -+ } -+ ecram_write(ecram, model->registers->EXT_POWERMODE, value); -+ return 0; -+} -+ -+/** -+ * Shortly toggle powermode to a different mode -+ * and switch back, e.g. to reset fan curve. -+ */ -+void toggle_powermode(struct ecram *ecram, const struct model_config *model) -+{ -+ int old_powermode = read_powermode(ecram, model); -+ int next_powermode = old_powermode == 0 ? 1 : 0; -+ -+ write_powermode(ecram, model, next_powermode); -+ mdelay(1500); -+ write_powermode(ecram, model, old_powermode); -+} -+ -+#define lockfancontroller_ON 8 -+#define lockfancontroller_OFF 0 -+ -+ssize_t write_lockfancontroller(struct ecram *ecram, -+ const struct model_config *model, bool state) -+{ -+ u8 val = state ? lockfancontroller_ON : lockfancontroller_OFF; -+ -+ ecram_write(ecram, model->registers->EXT_LOCKFANCONTROLLER, val); -+ return 0; -+} -+ -+int read_lockfancontroller(struct ecram *ecram, -+ const struct model_config *model, bool *state) -+{ -+ int value = ecram_read(ecram, model->registers->EXT_LOCKFANCONTROLLER); -+ -+ switch (value) { -+ case lockfancontroller_ON: -+ *state = true; -+ break; -+ case lockfancontroller_OFF: -+ *state = false; -+ break; -+ default: -+ pr_info("Unexpected value in lockfanspeed register:%d\n", -+ value); -+ return -1; -+ } -+ return 0; -+} -+ -+#define MAXIMUMFANSPEED_ON 0x40 -+#define MAXIMUMFANSPEED_OFF 0x00 -+ -+int read_maximumfanspeed(struct ecram *ecram, const struct model_config *model, -+ bool *state) -+{ -+ int value = ecram_read(ecram, model->registers->EXT_MAXIMUMFANSPEED); -+ -+ switch (value) { -+ case MAXIMUMFANSPEED_ON: -+ *state = true; -+ break; -+ case MAXIMUMFANSPEED_OFF: -+ *state = false; -+ break; -+ default: -+ pr_info("Unexpected value in maximumfanspeed register:%d\n", -+ value); -+ return -1; -+ } -+ return 0; -+} -+ -+ssize_t write_maximumfanspeed(struct ecram *ecram, -+ const struct model_config *model, bool state) -+{ -+ u8 val = state ? MAXIMUMFANSPEED_ON : MAXIMUMFANSPEED_OFF; -+ -+ ecram_write(ecram, model->registers->EXT_MAXIMUMFANSPEED, val); -+ return 0; -+} -+ -+#define MINIFANCUVE_ON_COOL_ON 0x04 -+#define MINIFANCUVE_ON_COOL_OFF 0xA0 -+ -+int read_minifancurve(struct ecram *ecram, const struct model_config *model, -+ bool *state) -+{ -+ int value = -+ ecram_read(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL); -+ -+ switch (value) { -+ case MINIFANCUVE_ON_COOL_ON: -+ *state = true; -+ break; -+ case MINIFANCUVE_ON_COOL_OFF: -+ *state = false; -+ break; -+ default: -+ pr_info("Unexpected value in MINIFANCURVE register:%d\n", -+ value); -+ return -1; -+ } -+ return 0; -+} -+ -+ssize_t write_minifancurve(struct ecram *ecram, -+ const struct model_config *model, bool state) -+{ -+ u8 val = state ? 
MINIFANCUVE_ON_COOL_ON : MINIFANCUVE_ON_COOL_OFF; -+ -+ ecram_write(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL, val); -+ return 0; -+} -+ -+#define KEYBOARD_BACKLIGHT_OFF 18 -+#define KEYBOARD_BACKLIGHT_ON1 21 -+#define KEYBOARD_BACKLIGHT_ON2 23 -+ -+int read_keyboard_backlight(struct ecram *ecram, -+ const struct model_config *model, int *state) -+{ -+ int value = ecram_read(ecram, -+ model->registers->EXT_WHITE_KEYBOARD_BACKLIGHT); -+ -+ //switch (value) { -+ //case MINIFANCUVE_ON_COOL_ON: -+ // *state = true; -+ // break; -+ //case MINIFANCUVE_ON_COOL_OFF: -+ // *state = false; -+ // break; -+ //default: -+ // pr_info("Unexpected value in MINIFANCURVE register:%d\n", -+ // value); -+ // return -1; -+ //} -+ *state = value; -+ return 0; -+} -+ -+int write_keyboard_backlight(struct ecram *ecram, -+ const struct model_config *model, int state) -+{ -+ u8 val = state > 0 ? KEYBOARD_BACKLIGHT_ON1 : KEYBOARD_BACKLIGHT_OFF; -+ -+ ecram_write(ecram, model->registers->EXT_WHITE_KEYBOARD_BACKLIGHT, val); -+ return 0; -+} -+ -+#define FCT_RAPID_CHARGE_ON 0x07 -+#define FCT_RAPID_CHARGE_OFF 0x08 -+#define RAPID_CHARGE_ON 0x0 -+#define RAPID_CHARGE_OFF 0x1 -+ -+int read_rapidcharge(acpi_handle acpihandle, int *state) -+{ -+ unsigned long result; -+ int err; -+ -+ err = eval_qcho(acpihandle, &result); -+ if (err) -+ return err; -+ -+ *state = result; -+ return 0; -+} -+ -+int write_rapidcharge(acpi_handle acpihandle, bool state) -+{ -+ unsigned long fct_nr = state > 0 ? FCT_RAPID_CHARGE_ON : -+ FCT_RAPID_CHARGE_OFF; -+ return exec_sbmc(acpihandle, fct_nr); -+} -+ +/* ============================= */ +/* Data model for fan curve */ -+/* ============================ */ ++/* ============================= */ + +struct fancurve_point { + // rpm1 devided by 100 @@ -9597,34 +10555,14 @@ index 000000000000..d1268d239cc5 + size_t current_point_i; +}; + -+// calculate derived values -+ -+int fancurve_get_cpu_deltahyst(struct fancurve_point *point) -+{ -+ return ((int)point->cpu_max_temp_celsius) - -+ ((int)point->cpu_min_temp_celsius); -+} -+ -+int fancurve_get_gpu_deltahyst(struct fancurve_point *point) -+{ -+ return ((int)point->gpu_max_temp_celsius) - -+ ((int)point->gpu_min_temp_celsius); -+} -+ -+int fancurve_get_ic_deltahyst(struct fancurve_point *point) -+{ -+ return ((int)point->ic_max_temp_celsius) - -+ ((int)point->ic_min_temp_celsius); -+} -+ +// validation functions + -+bool fancurve_is_valid_min_temp(int min_temp) ++static bool fancurve_is_valid_min_temp(int min_temp) +{ + return min_temp >= 0 && min_temp <= 127; +} + -+bool fancurve_is_valid_max_temp(int max_temp) ++static bool fancurve_is_valid_max_temp(int max_temp) +{ + return max_temp >= 0 && max_temp <= 127; +} @@ -9633,7 +10571,7 @@ index 000000000000..d1268d239cc5 +// - make hwmon implementation easier +// - keep fancurve valid, otherwise EC will not properly control fan + -+bool fancurve_set_rpm1(struct fancurve *fancurve, int point_id, int rpm) ++static bool fancurve_set_rpm1(struct fancurve *fancurve, int point_id, int rpm) +{ + bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500); + @@ -9642,7 +10580,7 @@ index 000000000000..d1268d239cc5 + return valid; +} + -+bool fancurve_set_rpm2(struct fancurve *fancurve, int point_id, int rpm) ++static bool fancurve_set_rpm2(struct fancurve *fancurve, int point_id, int rpm) +{ + bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500); + @@ -9653,7 +10591,8 @@ index 000000000000..d1268d239cc5 + +// TODO: remove { ... 
} from single line if body + -+bool fancurve_set_accel(struct fancurve *fancurve, int point_id, int accel) ++static bool fancurve_set_accel(struct fancurve *fancurve, int point_id, ++ int accel) +{ + bool valid = accel >= 2 && accel <= 5; + @@ -9662,7 +10601,8 @@ index 000000000000..d1268d239cc5 + return valid; +} + -+bool fancurve_set_decel(struct fancurve *fancurve, int point_id, int decel) ++static bool fancurve_set_decel(struct fancurve *fancurve, int point_id, ++ int decel) +{ + bool valid = decel >= 2 && decel <= 5; + @@ -9671,8 +10611,8 @@ index 000000000000..d1268d239cc5 + return valid; +} + -+bool fancurve_set_cpu_temp_max(struct fancurve *fancurve, int point_id, -+ int value) ++static bool fancurve_set_cpu_temp_max(struct fancurve *fancurve, int point_id, ++ int value) +{ + bool valid = fancurve_is_valid_max_temp(value); + @@ -9682,8 +10622,8 @@ index 000000000000..d1268d239cc5 + return valid; +} + -+bool fancurve_set_gpu_temp_max(struct fancurve *fancurve, int point_id, -+ int value) ++static bool fancurve_set_gpu_temp_max(struct fancurve *fancurve, int point_id, ++ int value) +{ + bool valid = fancurve_is_valid_max_temp(value); + @@ -9692,8 +10632,8 @@ index 000000000000..d1268d239cc5 + return valid; +} + -+bool fancurve_set_ic_temp_max(struct fancurve *fancurve, int point_id, -+ int value) ++static bool fancurve_set_ic_temp_max(struct fancurve *fancurve, int point_id, ++ int value) +{ + bool valid = fancurve_is_valid_max_temp(value); + @@ -9702,8 +10642,8 @@ index 000000000000..d1268d239cc5 + return valid; +} + -+bool fancurve_set_cpu_temp_min(struct fancurve *fancurve, int point_id, -+ int value) ++static bool fancurve_set_cpu_temp_min(struct fancurve *fancurve, int point_id, ++ int value) +{ + bool valid = fancurve_is_valid_max_temp(value); + @@ -9712,27 +10652,28 @@ index 000000000000..d1268d239cc5 + return valid; +} + -+bool fancurve_set_gpu_temp_min(struct fancurve *fancurve, int point_id, -+ int value) ++static bool fancurve_set_gpu_temp_min(struct fancurve *fancurve, int point_id, ++ int value) +{ -+ bool valid = fancurve_is_valid_max_temp(value); ++ bool valid = fancurve_is_valid_min_temp(value); + + if (valid) + fancurve->points[point_id].gpu_min_temp_celsius = value; + return valid; +} + -+bool fancurve_set_ic_temp_min(struct fancurve *fancurve, int point_id, -+ int value) ++static bool fancurve_set_ic_temp_min(struct fancurve *fancurve, int point_id, ++ int value) +{ -+ bool valid = fancurve_is_valid_max_temp(value); ++ bool valid = fancurve_is_valid_min_temp(value); + + if (valid) + fancurve->points[point_id].ic_min_temp_celsius = value; + return valid; +} + -+bool fancurve_set_size(struct fancurve *fancurve, int size, bool init_values) ++static bool fancurve_set_size(struct fancurve *fancurve, int size, ++ bool init_values) +{ + bool valid = size >= 1 && size <= MAXFANCURVESIZE; + @@ -9756,6 +10697,612 @@ index 000000000000..d1268d239cc5 + return true; +} + ++static ssize_t fancurve_print_seqfile(const struct fancurve *fancurve, ++ struct seq_file *s) ++{ ++ int i; ++ ++ seq_printf( ++ s, ++ "rpm1|rpm2|acceleration|deceleration|cpu_min_temp|cpu_max_temp|gpu_min_temp|gpu_max_temp|ic_min_temp|ic_max_temp\n"); ++ for (i = 0; i < fancurve->size; ++i) { ++ const struct fancurve_point *point = &fancurve->points[i]; ++ ++ seq_printf( ++ s, "%d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\n", ++ point->rpm1_raw * 100, point->rpm2_raw * 100, ++ point->accel, point->decel, point->cpu_min_temp_celsius, ++ point->cpu_max_temp_celsius, ++ point->gpu_min_temp_celsius, ++ 
point->gpu_max_temp_celsius, point->ic_min_temp_celsius, ++ point->ic_max_temp_celsius); ++ } ++ return 0; ++} ++ ++struct light { ++ bool initialized; ++ struct led_classdev led; ++ unsigned int last_brightness; ++ u8 light_id; ++ unsigned int lower_limit; ++ unsigned int upper_limit; ++}; ++ ++/* ============================= */ ++/* Global and shared data between */ ++/* all calls to this module */ ++/* ============================= */ ++// Implemented like ideapad-laptop.c but currenlty still ++// wihtout dynamic memory allocation (instead global _priv) ++struct legion_private { ++ struct platform_device *platform_device; ++ // TODO: remove or keep? init? ++ struct acpi_device *adev; ++ ++ // Method to access ECRAM ++ struct ecram ecram; ++ // Configuration with registers an ECRAM access method ++ const struct model_config *conf; ++ ++ // TODO: maybe refactor an keep only local to each function ++ // last known fan curve ++ struct fancurve fancurve; ++ // configured fan curve from user space ++ struct fancurve fancurve_configured; ++ ++ // update lock, when partial values of fancurve are changed ++ struct mutex fancurve_mutex; ++ ++ //interfaces ++ struct dentry *debugfs_dir; ++ struct device *hwmon_dev; ++ struct platform_profile_handler platform_profile_handler; ++ ++ struct light kbd_bl; ++ struct light ylogo_light; ++ struct light iport_light; ++ ++ // TODO: remove? ++ bool loaded; ++ ++ // TODO: remove, only for reverse enginnering ++ struct ecram_memoryio ec_memoryio; ++}; ++ ++// shared between different drivers: WMI, platform and proteced by mutex ++static struct legion_private *legion_shared; ++static struct legion_private _priv; ++static DEFINE_MUTEX(legion_shared_mutex); ++ ++static int legion_shared_init(struct legion_private *priv) ++{ ++ int ret; ++ ++ mutex_lock(&legion_shared_mutex); ++ ++ if (!legion_shared) { ++ legion_shared = priv; ++ mutex_init(&legion_shared->fancurve_mutex); ++ ret = 0; ++ } else { ++ pr_warn("Found multiple platform devices\n"); ++ ret = -EINVAL; ++ } ++ ++ priv->loaded = true; ++ mutex_unlock(&legion_shared_mutex); ++ ++ return ret; ++} ++ ++static void legion_shared_exit(struct legion_private *priv) ++{ ++ pr_info("Unloading legion shared\n"); ++ mutex_lock(&legion_shared_mutex); ++ ++ if (legion_shared == priv) ++ legion_shared = NULL; ++ ++ mutex_unlock(&legion_shared_mutex); ++ pr_info("Unloading legion shared done\n"); ++} ++ ++static int get_simple_wmi_attribute(struct legion_private *priv, ++ const char *guid, u8 instance, ++ u32 method_id, bool invert, ++ unsigned long scale, unsigned long *value) ++{ ++ unsigned long state = 0; ++ int err; ++ ++ if (scale == 0) { ++ pr_info("Scale cannot be 0\n"); ++ return -EINVAL; ++ } ++ err = wmi_exec_noarg_int(guid, instance, method_id, &state); ++ if (err) ++ return -EINVAL; ++ ++ // TODO: remove later ++ pr_info("%swith raw value: %ld\n", __func__, state); ++ ++ state = state * scale; ++ ++ if (invert) ++ state = !state; ++ *value = state; ++ return 0; ++} ++ ++static int get_simple_wmi_attribute_bool(struct legion_private *priv, ++ const char *guid, u8 instance, ++ u32 method_id, bool invert, ++ unsigned long scale, bool *value) ++{ ++ unsigned long int_val = *value; ++ int err = get_simple_wmi_attribute(priv, guid, instance, method_id, ++ invert, scale, &int_val); ++ *value = int_val; ++ return err; ++} ++ ++static int set_simple_wmi_attribute(struct legion_private *priv, ++ const char *guid, u8 instance, ++ u32 method_id, bool invert, int scale, ++ int state) ++{ ++ int err; ++ u8 in_param; 
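++
++ /*
++  * Usage sketch only (it mirrors wmi_write_fanfullspeed() further below
++  * and adds no new behaviour):
++  *   set_simple_wmi_attribute(priv, WMI_GUID_LENOVO_FAN_METHOD, 0,
++  *                            WMI_METHOD_ID_FAN_SET_FULLSPEED,
++  *                            false, 1, state);
++  * The state is optionally inverted, divided by scale and handed to the
++  * WMI method as a single byte argument.
++  */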
++ ++ if (scale == 0) { ++ pr_info("Scale cannot be 0\n"); ++ return -EINVAL; ++ } ++ ++ if (invert) ++ state = !state; ++ ++ in_param = state / scale; ++ ++ err = wmi_exec_arg(guid, instance, method_id, &in_param, ++ sizeof(in_param)); ++ return err; ++} ++ ++/* ============================= */ ++/* Sensor values reading/writing */ ++/* ============================= */ ++ ++static int ec_read_sensor_values(struct ecram *ecram, ++ const struct model_config *model, ++ struct sensor_values *values) ++{ ++ values->fan1_target_rpm = ++ 100 * ecram_read(ecram, model->registers->EXT_FAN1_TARGET_RPM); ++ values->fan2_target_rpm = ++ 100 * ecram_read(ecram, model->registers->EXT_FAN2_TARGET_RPM); ++ ++ values->fan1_rpm = ++ ecram_read(ecram, model->registers->EXT_FAN1_RPM_LSB) + ++ (((int)ecram_read(ecram, model->registers->EXT_FAN1_RPM_MSB)) ++ << 8); ++ values->fan2_rpm = ++ ecram_read(ecram, model->registers->EXT_FAN2_RPM_LSB) + ++ (((int)ecram_read(ecram, model->registers->EXT_FAN2_RPM_MSB)) ++ << 8); ++ ++ values->cpu_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_CPU_TEMP_INPUT); ++ values->gpu_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_GPU_TEMP_INPUT); ++ values->ic_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_IC_TEMP_INPUT); ++ ++ values->cpu_temp_celsius = ecram_read(ecram, 0xC5E6); ++ values->gpu_temp_celsius = ecram_read(ecram, 0xC5E7); ++ values->ic_temp_celsius = ecram_read(ecram, 0xC5E8); ++ ++ return 0; ++} ++ ++static ssize_t ec_read_temperature(struct ecram *ecram, ++ const struct model_config *model, ++ int sensor_id, int *temperature) ++{ ++ int err = 0; ++ unsigned long res; ++ ++ if (sensor_id == 0) { ++ res = ecram_read(ecram, 0xC5E6); ++ } else if (sensor_id == 1) { ++ res = ecram_read(ecram, 0xC5E7); ++ } else { ++ // TODO: use all correct error codes ++ return -EEXIST; ++ } ++ if (!err) ++ *temperature = res; ++ return err; ++} ++ ++static ssize_t ec_read_fanspeed(struct ecram *ecram, ++ const struct model_config *model, int fan_id, ++ int *fanspeed_rpm) ++{ ++ int err = 0; ++ unsigned long res; ++ ++ if (fan_id == 0) { ++ res = ecram_read(ecram, model->registers->EXT_FAN1_RPM_LSB) + ++ (((int)ecram_read(ecram, ++ model->registers->EXT_FAN1_RPM_MSB)) ++ << 8); ++ } else if (fan_id == 1) { ++ res = ecram_read(ecram, model->registers->EXT_FAN2_RPM_LSB) + ++ (((int)ecram_read(ecram, ++ model->registers->EXT_FAN2_RPM_MSB)) ++ << 8); ++ } else { ++ // TODO: use all correct error codes ++ return -EEXIST; ++ } ++ if (!err) ++ *fanspeed_rpm = res; ++ return err; ++} ++ ++// '\_SB.PCI0.LPC0.EC0.FANS ++#define ACPI_PATH_FAN_SPEED1 "FANS" ++// '\_SB.PCI0.LPC0.EC0.FA2S ++#define ACPI_PATH_FAN_SPEED2 "FA2S" ++ ++static ssize_t acpi_read_fanspeed(struct legion_private *priv, int fan_id, ++ int *value) ++{ ++ int err; ++ unsigned long acpi_value; ++ const char *acpi_path; ++ ++ if (fan_id == 0) { ++ acpi_path = ACPI_PATH_FAN_SPEED1; ++ } else if (fan_id == 1) { ++ acpi_path = ACPI_PATH_FAN_SPEED2; ++ } else { ++ // TODO: use all correct error codes ++ return -EEXIST; ++ } ++ err = eval_int(priv->adev->handle, acpi_path, &acpi_value); ++ if (!err) ++ *value = (int)acpi_value * 100; ++ return err; ++} ++ ++// '\_SB.PCI0.LPC0.EC0.CPUT ++#define ACPI_PATH_CPU_TEMP "CPUT" ++// '\_SB.PCI0.LPC0.EC0.GPUT ++#define ACPI_PATH_GPU_TEMP "GPUT" ++ ++static ssize_t acpi_read_temperature(struct legion_private *priv, int fan_id, ++ int *value) ++{ ++ int err; ++ unsigned long acpi_value; ++ const char *acpi_path; ++ ++ if (fan_id == 0) { ++ acpi_path = 
ACPI_PATH_CPU_TEMP; ++ } else if (fan_id == 1) { ++ acpi_path = ACPI_PATH_GPU_TEMP; ++ } else { ++ // TODO: use all correct error codes ++ return -EEXIST; ++ } ++ err = eval_int(priv->adev->handle, acpi_path, &acpi_value); ++ if (!err) ++ *value = (int)acpi_value; ++ return err; ++} ++ ++// fan_id: 0 or 1 ++static ssize_t wmi_read_fanspeed(int fan_id, int *fanspeed_rpm) ++{ ++ int err; ++ unsigned long res; ++ struct acpi_buffer params; ++ ++ params.length = 1; ++ params.pointer = &fan_id; ++ ++ err = wmi_exec_int(WMI_GUID_LENOVO_FAN_METHOD, 0, ++ WMI_METHOD_ID_FAN_GETCURRENTFANSPEED, ¶ms, &res); ++ ++ if (!err) ++ *fanspeed_rpm = res; ++ return err; ++} ++ ++//sensor_id: cpu = 0, gpu = 1 ++static ssize_t wmi_read_temperature(int sensor_id, int *temperature) ++{ ++ int err; ++ unsigned long res; ++ struct acpi_buffer params; ++ ++ if (sensor_id == 0) ++ sensor_id = 0x03; ++ else if (sensor_id == 1) ++ sensor_id = 0x04; ++ else { ++ // TODO: use all correct error codes ++ return -EEXIST; ++ } ++ ++ params.length = 1; ++ params.pointer = &sensor_id; ++ ++ err = wmi_exec_int(WMI_GUID_LENOVO_FAN_METHOD, 0, ++ WMI_METHOD_ID_FAN_GETCURRENTSENSORTEMPERATURE, ++ ¶ms, &res); ++ ++ if (!err) ++ *temperature = res; ++ return err; ++} ++ ++// fan_id: 0 or 1 ++static ssize_t wmi_read_fanspeed_gz(int fan_id, int *fanspeed_rpm) ++{ ++ int err; ++ u32 method_id; ++ unsigned long res; ++ ++ if (fan_id == 0) ++ method_id = WMI_METHOD_ID_GETFAN1SPEED; ++ else if (fan_id == 1) ++ method_id = WMI_METHOD_ID_GETFAN2SPEED; ++ else { ++ // TODO: use all correct error codes ++ return -EEXIST; ++ } ++ err = wmi_exec_noarg_int(LEGION_WMI_GAMEZONE_GUID, 0, method_id, &res); ++ ++ if (!err) ++ *fanspeed_rpm = res; ++ return err; ++} ++ ++//sensor_id: cpu = 0, gpu = 1 ++static ssize_t wmi_read_temperature_gz(int sensor_id, int *temperature) ++{ ++ int err; ++ u32 method_id; ++ unsigned long res; ++ ++ if (sensor_id == 0) ++ method_id = WMI_METHOD_ID_GETCPUTEMP; ++ else if (sensor_id == 1) ++ method_id = WMI_METHOD_ID_GETGPUTEMP; ++ else { ++ // TODO: use all correct error codes ++ return -EEXIST; ++ } ++ ++ err = wmi_exec_noarg_int(LEGION_WMI_GAMEZONE_GUID, 0, method_id, &res); ++ ++ if (!err) ++ *temperature = res; ++ return err; ++} ++ ++// fan_id: 0 or 1 ++static ssize_t wmi_read_fanspeed_other(int fan_id, int *fanspeed_rpm) ++{ ++ int err; ++ enum OtherMethodFeature featured_id; ++ int res; ++ ++ if (fan_id == 0) ++ featured_id = OtherMethodFeature_FAN_SPEED_1; ++ else if (fan_id == 1) ++ featured_id = OtherMethodFeature_FAN_SPEED_2; ++ else { ++ // TODO: use all correct error codes ++ return -EEXIST; ++ } ++ ++ err = wmi_other_method_get_value(featured_id, &res); ++ ++ if (!err) ++ *fanspeed_rpm = res; ++ return err; ++} ++ ++//sensor_id: cpu = 0, gpu = 1 ++static ssize_t wmi_read_temperature_other(int sensor_id, int *temperature) ++{ ++ int err; ++ enum OtherMethodFeature featured_id; ++ int res; ++ ++ if (sensor_id == 0) ++ featured_id = OtherMethodFeature_TEMP_CPU; ++ else if (sensor_id == 1) ++ featured_id = OtherMethodFeature_TEMP_GPU; ++ else { ++ // TODO: use all correct error codes ++ return -EEXIST; ++ } ++ ++ err = wmi_other_method_get_value(featured_id, &res); ++ if (!err) ++ *temperature = res; ++ return err; ++} ++ ++static ssize_t read_fanspeed(struct legion_private *priv, int fan_id, ++ int *speed_rpm) ++{ ++ // TODO: use enums or function pointers? 
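++ //
++ // Mapping used by the switch below (descriptive only):
++ //   ACCESS_METHOD_EC   -> ec_read_fanspeed()        (raw EC registers)
++ //   ACCESS_METHOD_ACPI -> acpi_read_fanspeed()      (FANS/FA2S methods)
++ //   ACCESS_METHOD_WMI  -> wmi_read_fanspeed_gz()    (gamezone GUID)
++ //   ACCESS_METHOD_WMI2 -> wmi_read_fanspeed()       (fan method GUID)
++ //   ACCESS_METHOD_WMI3 -> wmi_read_fanspeed_other() (feature IDs)
++ // A typical caller, as in the debugfs code below, is
++ //   err = read_fanspeed(priv, 0, &fanspeed);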
++ switch (priv->conf->access_method_fanspeed) { ++ case ACCESS_METHOD_EC: ++ return ec_read_fanspeed(&priv->ecram, priv->conf, fan_id, ++ speed_rpm); ++ case ACCESS_METHOD_ACPI: ++ return acpi_read_fanspeed(priv, fan_id, speed_rpm); ++ case ACCESS_METHOD_WMI: ++ return wmi_read_fanspeed_gz(fan_id, speed_rpm); ++ case ACCESS_METHOD_WMI2: ++ return wmi_read_fanspeed(fan_id, speed_rpm); ++ case ACCESS_METHOD_WMI3: ++ return wmi_read_fanspeed_other(fan_id, speed_rpm); ++ default: ++ pr_info("No access method for fanspeed: %d\n", ++ priv->conf->access_method_fanspeed); ++ return -EINVAL; ++ } ++} ++ ++static ssize_t read_temperature(struct legion_private *priv, int sensor_id, ++ int *temperature) ++{ ++ // TODO: use enums or function pointers? ++ switch (priv->conf->access_method_temperature) { ++ case ACCESS_METHOD_EC: ++ return ec_read_temperature(&priv->ecram, priv->conf, sensor_id, ++ temperature); ++ case ACCESS_METHOD_ACPI: ++ return acpi_read_temperature(priv, sensor_id, temperature); ++ case ACCESS_METHOD_WMI: ++ return wmi_read_temperature_gz(sensor_id, temperature); ++ case ACCESS_METHOD_WMI2: ++ return wmi_read_temperature(sensor_id, temperature); ++ case ACCESS_METHOD_WMI3: ++ return wmi_read_temperature_other(sensor_id, temperature); ++ default: ++ pr_info("No access method for temperature: %d\n", ++ priv->conf->access_method_temperature); ++ return -EINVAL; ++ } ++} ++ ++/* ============================= */ ++/* Fancurve reading/writing */ ++/* ============================= */ ++ ++/* Fancurve from WMI ++ * This allows changing fewer parameters. ++ * It is only available on newer models. ++ */ ++ ++struct WMIFanTable { ++ u8 FSTM; //FSMD ++ u8 FSID; ++ u32 FSTL; //FSST ++ u16 FSS0; ++ u16 FSS1; ++ u16 FSS2; ++ u16 FSS3; ++ u16 FSS4; ++ u16 FSS5; ++ u16 FSS6; ++ u16 FSS7; ++ u16 FSS8; ++ u16 FSS9; ++} __packed; ++ ++struct WMIFanTableRead { ++ u32 FSFL; ++ u32 FSS0; ++ u32 FSS1; ++ u32 FSS2; ++ u32 FSS3; ++ u32 FSS4; ++ u32 FSS5; ++ u32 FSS6; ++ u32 FSS7; ++ u32 FSS8; ++ u32 FSS9; ++ u32 FSSA; ++} __packed; ++ ++static ssize_t wmi_read_fancurve_custom(const struct model_config *model, ++ struct fancurve *fancurve) ++{ ++ u8 buffer[88]; ++ int err; ++ ++ // The output buffer from the ACPI call is 88 bytes and larger ++ // than the returned object ++ pr_info("Size of object: %lu\n", sizeof(struct WMIFanTableRead)); ++ err = wmi_exec_noarg_ints(WMI_GUID_LENOVO_FAN_METHOD, 0, ++ WMI_METHOD_ID_FAN_GET_TABLE, buffer, ++ sizeof(buffer)); ++ print_hex_dump(KERN_INFO, "legion_laptop fan table wmi buffer", ++ DUMP_PREFIX_ADDRESS, 16, 1, buffer, sizeof(buffer), ++ true); ++ if (!err) { ++ struct WMIFanTableRead *fantable = ++ (struct WMIFanTableRead *)&buffer[0]; ++ fancurve->current_point_i = 0; ++ fancurve->size = 10; ++ fancurve->points[0].rpm1_raw = fantable->FSS0; ++ fancurve->points[1].rpm1_raw = fantable->FSS1; ++ fancurve->points[2].rpm1_raw = fantable->FSS2; ++ fancurve->points[3].rpm1_raw = fantable->FSS3; ++ fancurve->points[4].rpm1_raw = fantable->FSS4; ++ fancurve->points[5].rpm1_raw = fantable->FSS5; ++ fancurve->points[6].rpm1_raw = fantable->FSS6; ++ fancurve->points[7].rpm1_raw = fantable->FSS7; ++ fancurve->points[8].rpm1_raw = fantable->FSS8; ++ fancurve->points[9].rpm1_raw = fantable->FSS9; ++ //fancurve->points[10].rpm1_raw = fantable->FSSA; ++ } ++ return err; ++} ++ ++static ssize_t wmi_write_fancurve_custom(const struct model_config *model, ++ const struct fancurve *fancurve) ++{ ++ u8 buffer[0x20]; ++ int err; ++ ++ // The buffer is read like this in ACPI firmware ++ 
// ++ // CreateByteField (Arg2, Zero, FSTM) ++ // CreateByteField (Arg2, One, FSID) ++ // CreateDWordField (Arg2, 0x02, FSTL) ++ // CreateByteField (Arg2, 0x06, FSS0) ++ // CreateByteField (Arg2, 0x08, FSS1) ++ // CreateByteField (Arg2, 0x0A, FSS2) ++ // CreateByteField (Arg2, 0x0C, FSS3) ++ // CreateByteField (Arg2, 0x0E, FSS4) ++ // CreateByteField (Arg2, 0x10, FSS5) ++ // CreateByteField (Arg2, 0x12, FSS6) ++ // CreateByteField (Arg2, 0x14, FSS7) ++ // CreateByteField (Arg2, 0x16, FSS8) ++ // CreateByteField (Arg2, 0x18, FSS9) ++ ++ memset(buffer, 0, sizeof(buffer)); ++ buffer[0x06] = fancurve->points[0].rpm1_raw; ++ buffer[0x08] = fancurve->points[1].rpm1_raw; ++ buffer[0x0A] = fancurve->points[2].rpm1_raw; ++ buffer[0x0C] = fancurve->points[3].rpm1_raw; ++ buffer[0x0E] = fancurve->points[4].rpm1_raw; ++ buffer[0x10] = fancurve->points[5].rpm1_raw; ++ buffer[0x12] = fancurve->points[6].rpm1_raw; ++ buffer[0x14] = fancurve->points[7].rpm1_raw; ++ buffer[0x16] = fancurve->points[8].rpm1_raw; ++ buffer[0x18] = fancurve->points[9].rpm1_raw; ++ ++ print_hex_dump(KERN_INFO, "legion_laptop fan table wmi write buffer", ++ DUMP_PREFIX_ADDRESS, 16, 1, buffer, sizeof(buffer), ++ true); ++ err = wmi_exec_arg(WMI_GUID_LENOVO_FAN_METHOD, 0, ++ WMI_METHOD_ID_FAN_SET_TABLE, buffer, sizeof(buffer)); ++ return err; ++} ++ +/* Read the fan curve from the EC. + * + * In newer models (>=2022) there is an ACPI/WMI to read fan curve as @@ -9765,8 +11312,9 @@ index 000000000000..d1268d239cc5 + * It reads all points from EC memory, even if stored fancurve is smaller, so + * it can contain 0 entries. + */ -+static int read_fancurve(struct ecram *ecram, const struct model_config *model, -+ struct fancurve *fancurve) ++static int ec_read_fancurve_legion(struct ecram *ecram, ++ const struct model_config *model, ++ struct fancurve *fancurve) +{ + size_t i = 0; + @@ -9809,10 +11357,16 @@ index 000000000000..d1268d239cc5 + return 0; +} + -+static int write_fancurve(struct ecram *ecram, const struct model_config *model, -+ const struct fancurve *fancurve, bool write_size) ++static int ec_write_fancurve_legion(struct ecram *ecram, ++ const struct model_config *model, ++ const struct fancurve *fancurve, ++ bool write_size) +{ + size_t i; ++ ++ //TODO: remove again ++ pr_info("Set fancurve\n"); ++ + // Reset fan update counters (try to avoid any race conditions) + ecram_write(ecram, 0xC5FE, 0); + ecram_write(ecram, 0xC5FF, 0); @@ -9865,100 +11419,604 @@ index 000000000000..d1268d239cc5 + return 0; +} + -+static ssize_t fancurve_print_seqfile(const struct fancurve *fancurve, -+ struct seq_file *s) -+{ -+ int i; ++#define FANCURVESIZE_IDEAPDAD 8 + -+ seq_printf( -+ s, -+ "rpm1|rpm2|acceleration|deceleration|cpu_min_temp|cpu_max_temp|gpu_min_temp|gpu_max_temp|ic_min_temp|ic_max_temp\n"); -+ for (i = 0; i < fancurve->size; ++i) { ++static int ec_read_fancurve_ideapad(struct ecram *ecram, ++ const struct model_config *model, ++ struct fancurve *fancurve) ++{ ++ size_t i = 0; ++ ++ for (i = 0; i < FANCURVESIZE_IDEAPDAD; ++i) { ++ struct fancurve_point *point = &fancurve->points[i]; ++ ++ point->rpm1_raw = ++ ecram_read(ecram, model->registers->EXT_FAN1_BASE + i); ++ point->rpm2_raw = ++ ecram_read(ecram, model->registers->EXT_FAN2_BASE + i); ++ ++ point->accel = 0; ++ point->decel = 0; ++ point->cpu_max_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_CPU_TEMP + i); ++ point->cpu_min_temp_celsius = ecram_read( ++ ecram, model->registers->EXT_CPU_TEMP_HYST + i); ++ point->gpu_max_temp_celsius = ++ ecram_read(ecram, 
model->registers->EXT_GPU_TEMP + i); ++ point->gpu_min_temp_celsius = ecram_read( ++ ecram, model->registers->EXT_GPU_TEMP_HYST + i); ++ point->ic_max_temp_celsius = 0; ++ point->ic_min_temp_celsius = 0; ++ } ++ ++ // Do not trust that hardware; It might suddendly report ++ // a larger size, so clamp it. ++ fancurve->size = FANCURVESIZE_IDEAPDAD; ++ fancurve->current_point_i = ++ ecram_read(ecram, model->registers->EXT_FAN_CUR_POINT); ++ fancurve->current_point_i = ++ min(fancurve->current_point_i, fancurve->size); ++ return 0; ++} ++ ++static int ec_write_fancurve_ideapad(struct ecram *ecram, ++ const struct model_config *model, ++ const struct fancurve *fancurve) ++{ ++ size_t i; ++ int valr1; ++ int valr2; ++ ++ // add this later: maybe other addresses needed ++ // therefore, fan curve might not be effective immediatley but ++ // only after temp change ++ // Reset fan update counters (try to avoid any race conditions) ++ ecram_write(ecram, 0xC5FE, 0); ++ ecram_write(ecram, 0xC5FF, 0); ++ for (i = 0; i < FANCURVESIZE_IDEAPDAD; ++i) { + const struct fancurve_point *point = &fancurve->points[i]; + -+ seq_printf( -+ s, "%d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\n", -+ point->rpm1_raw * 100, point->rpm2_raw * 100, -+ point->accel, point->decel, point->cpu_min_temp_celsius, -+ point->cpu_max_temp_celsius, -+ point->gpu_min_temp_celsius, -+ point->gpu_max_temp_celsius, point->ic_min_temp_celsius, -+ point->ic_max_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_FAN1_BASE + i, ++ point->rpm1_raw); ++ valr1 = ecram_read(ecram, model->registers->EXT_FAN1_BASE + i); ++ ecram_write(ecram, model->registers->EXT_FAN2_BASE + i, ++ point->rpm2_raw); ++ valr2 = ecram_read(ecram, model->registers->EXT_FAN2_BASE + i); ++ pr_info("Writing fan1: %d; reading fan1: %d\n", point->rpm1_raw, ++ valr1); ++ pr_info("Writing fan2: %d; reading fan2: %d\n", point->rpm2_raw, ++ valr2); ++ ++ // write to memory and repeat 8 bytes later again ++ ecram_write(ecram, model->registers->EXT_CPU_TEMP + i, ++ point->cpu_max_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_CPU_TEMP + 8 + i, ++ point->cpu_max_temp_celsius); ++ // write to memory and repeat 8 bytes later again ++ ecram_write(ecram, model->registers->EXT_CPU_TEMP_HYST + i, ++ point->cpu_min_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_CPU_TEMP_HYST + 8 + i, ++ point->cpu_min_temp_celsius); ++ // write to memory and repeat 8 bytes later again ++ ecram_write(ecram, model->registers->EXT_GPU_TEMP + i, ++ point->gpu_max_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_GPU_TEMP + 8 + i, ++ point->gpu_max_temp_celsius); ++ // write to memory and repeat 8 bytes later again ++ ecram_write(ecram, model->registers->EXT_GPU_TEMP_HYST + i, ++ point->gpu_min_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_GPU_TEMP_HYST + 8 + i, ++ point->gpu_min_temp_celsius); ++ } ++ ++ // add this later: maybe other addresses needed ++ // therefore, fan curve might not be effective immediatley but ++ // only after temp change ++ // // Reset current fan level to 0, so algorithm in EC ++ // // selects fan curve point again and resetting hysterisis ++ // // effects ++ // ecram_write(ecram, model->registers->EXT_FAN_CUR_POINT, 0); ++ ++ // // Reset internal fan levels ++ // ecram_write(ecram, 0xC634, 0); // CPU ++ // ecram_write(ecram, 0xC635, 0); // GPU ++ // ecram_write(ecram, 0xC636, 0); // SENSOR ++ ++ return 0; ++} ++ ++static int read_fancurve(struct legion_private *priv, struct fancurve *fancurve) ++{ ++ // TODO: use enums or 
function pointers? ++ switch (priv->conf->access_method_fancurve) { ++ case ACCESS_METHOD_EC: ++ return ec_read_fancurve_legion(&priv->ecram, priv->conf, ++ fancurve); ++ case ACCESS_METHOD_EC2: ++ return ec_read_fancurve_ideapad(&priv->ecram, priv->conf, ++ fancurve); ++ case ACCESS_METHOD_WMI3: ++ return wmi_read_fancurve_custom(priv->conf, fancurve); ++ default: ++ pr_info("No access method for fancurve:%d\n", ++ priv->conf->access_method_fancurve); ++ return -EINVAL; ++ } ++} ++ ++static int write_fancurve(struct legion_private *priv, ++ const struct fancurve *fancurve, bool write_size) ++{ ++ // TODO: use enums or function pointers? ++ switch (priv->conf->access_method_fancurve) { ++ case ACCESS_METHOD_EC: ++ return ec_write_fancurve_legion(&priv->ecram, priv->conf, ++ fancurve, write_size); ++ case ACCESS_METHOD_EC2: ++ return ec_write_fancurve_ideapad(&priv->ecram, priv->conf, ++ fancurve); ++ case ACCESS_METHOD_WMI3: ++ return wmi_write_fancurve_custom(priv->conf, fancurve); ++ default: ++ pr_info("No access method for fancurve:%d\n", ++ priv->conf->access_method_fancurve); ++ return -EINVAL; ++ } ++} ++ ++#define MINIFANCUVE_ON_COOL_ON 0x04 ++#define MINIFANCUVE_ON_COOL_OFF 0xA0 ++ ++static int ec_read_minifancurve(struct ecram *ecram, ++ const struct model_config *model, bool *state) ++{ ++ int value = ++ ecram_read(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL); ++ ++ switch (value) { ++ case MINIFANCUVE_ON_COOL_ON: ++ *state = true; ++ break; ++ case MINIFANCUVE_ON_COOL_OFF: ++ *state = false; ++ break; ++ default: ++ pr_info("Unexpected value in MINIFANCURVE register:%d\n", ++ value); ++ return -1; + } + return 0; +} + -+/* ============================= */ -+/* Global and shared data between */ -+/* all calls to this module */ -+/* ============================ */ -+// Implemented like ideapad-laptop.c but currenlty still -+// wihtout dynamic memory allocation (instaed global _priv) -+ -+struct legion_private { -+ struct platform_device *platform_device; -+ // TODO: remove or keep? init? -+ // struct acpi_device *adev; -+ -+ // Method to access ECRAM -+ struct ecram ecram; -+ // Configuration with registers an ECRAM access method -+ const struct model_config *conf; -+ -+ // TODO: maybe refactor an keep only local to each function -+ // last known fan curve -+ struct fancurve fancurve; -+ // configured fan curve from user space -+ struct fancurve fancurve_configured; -+ -+ // update lock, when partial values of fancurve are changed -+ struct mutex fancurve_mutex; -+ -+ //interfaces -+ struct dentry *debugfs_dir; -+ struct device *hwmon_dev; -+ struct platform_profile_handler platform_profile_handler; -+ -+ // TODO: remove? -+ bool loaded; -+}; -+ -+// shared between different drivers: WMI, platform and proteced by mutex -+static struct legion_private *legion_shared; -+static struct legion_private _priv; -+static DEFINE_MUTEX(legion_shared_mutex); -+ -+static int legion_shared_init(struct legion_private *priv) ++static ssize_t ec_write_minifancurve(struct ecram *ecram, ++ const struct model_config *model, ++ bool state) +{ -+ int ret; ++ u8 val = state ? 
MINIFANCUVE_ON_COOL_ON : MINIFANCUVE_ON_COOL_OFF; + -+ mutex_lock(&legion_shared_mutex); -+ -+ if (!legion_shared) { -+ legion_shared = priv; -+ mutex_init(&legion_shared->fancurve_mutex); -+ ret = 0; -+ } else { -+ pr_warn("Found multiple platform devices\n"); -+ ret = -EINVAL; -+ } -+ -+ priv->loaded = true; -+ mutex_unlock(&legion_shared_mutex); -+ -+ return ret; ++ ecram_write(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL, val); ++ return 0; +} + -+static void legion_shared_exit(struct legion_private *priv) ++#define EC_LOCKFANCONTROLLER_ON 8 ++#define EC_LOCKFANCONTROLLER_OFF 0 ++ ++static ssize_t ec_write_lockfancontroller(struct ecram *ecram, ++ const struct model_config *model, ++ bool state) +{ -+ pr_info("Unloading legion shared\n"); -+ mutex_lock(&legion_shared_mutex); ++ u8 val = state ? EC_LOCKFANCONTROLLER_ON : EC_LOCKFANCONTROLLER_OFF; + -+ if (legion_shared == priv) -+ legion_shared = NULL; ++ ecram_write(ecram, model->registers->EXT_LOCKFANCONTROLLER, val); ++ return 0; ++} + -+ mutex_unlock(&legion_shared_mutex); -+ pr_info("Unloading legion shared done\n"); ++static int ec_read_lockfancontroller(struct ecram *ecram, ++ const struct model_config *model, ++ bool *state) ++{ ++ int value = ecram_read(ecram, model->registers->EXT_LOCKFANCONTROLLER); ++ ++ switch (value) { ++ case EC_LOCKFANCONTROLLER_ON: ++ *state = true; ++ break; ++ case EC_LOCKFANCONTROLLER_OFF: ++ *state = false; ++ break; ++ default: ++ pr_info("Unexpected value in lockfanspeed register:%d\n", ++ value); ++ return -1; ++ } ++ return 0; ++} ++ ++#define EC_FANFULLSPEED_ON 0x40 ++#define EC_FANFULLSPEED_OFF 0x00 ++ ++static int ec_read_fanfullspeed(struct ecram *ecram, ++ const struct model_config *model, bool *state) ++{ ++ int value = ecram_read(ecram, model->registers->EXT_MAXIMUMFANSPEED); ++ ++ switch (value) { ++ case EC_FANFULLSPEED_ON: ++ *state = true; ++ break; ++ case EC_FANFULLSPEED_OFF: ++ *state = false; ++ break; ++ default: ++ pr_info("Unexpected value in maximumfanspeed register:%d\n", ++ value); ++ return -1; ++ } ++ return 0; ++} ++ ++static ssize_t ec_write_fanfullspeed(struct ecram *ecram, ++ const struct model_config *model, ++ bool state) ++{ ++ u8 val = state ? EC_FANFULLSPEED_ON : EC_FANFULLSPEED_OFF; ++ ++ ecram_write(ecram, model->registers->EXT_MAXIMUMFANSPEED, val); ++ return 0; ++} ++ ++static ssize_t wmi_read_fanfullspeed(struct legion_private *priv, bool *state) ++{ ++ return get_simple_wmi_attribute_bool(priv, WMI_GUID_LENOVO_FAN_METHOD, ++ 0, WMI_METHOD_ID_FAN_GET_FULLSPEED, ++ false, 1, state); ++} ++ ++static ssize_t wmi_write_fanfullspeed(struct legion_private *priv, bool state) ++{ ++ return set_simple_wmi_attribute(priv, WMI_GUID_LENOVO_FAN_METHOD, 0, ++ WMI_METHOD_ID_FAN_SET_FULLSPEED, false, ++ 1, state); ++} ++ ++static ssize_t read_fanfullspeed(struct legion_private *priv, bool *state) ++{ ++ // TODO: use enums or function pointers? 
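++ //
++ // Only the EC and WMI backends exist for the full speed (dust
++ // cleaning) toggle, see the switch below. The WMI path is a thin
++ // wrapper around get_simple_wmi_attribute_bool() with
++ // WMI_GUID_LENOVO_FAN_METHOD / WMI_METHOD_ID_FAN_GET_FULLSPEED, and
++ // write_fanfullspeed() right after this function is the symmetric
++ // setter.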
++ switch (priv->conf->access_method_fanfullspeed) { ++ case ACCESS_METHOD_EC: ++ return ec_read_fanfullspeed(&priv->ecram, priv->conf, state); ++ case ACCESS_METHOD_WMI: ++ return wmi_read_fanfullspeed(priv, state); ++ default: ++ pr_info("No access method for fan full speed: %d\n", ++ priv->conf->access_method_fanfullspeed); ++ return -EINVAL; ++ } ++} ++ ++static ssize_t write_fanfullspeed(struct legion_private *priv, bool state) ++{ ++ ssize_t res; ++ ++ switch (priv->conf->access_method_fanfullspeed) { ++ case ACCESS_METHOD_EC: ++ res = ec_write_fanfullspeed(&priv->ecram, priv->conf, state); ++ return res; ++ case ACCESS_METHOD_WMI: ++ return wmi_write_fanfullspeed(priv, state); ++ default: ++ pr_info("No access method for fan full speed:%d\n", ++ priv->conf->access_method_fanfullspeed); ++ return -EINVAL; ++ } ++} ++ ++/* ============================= */ ++/* Power mode reading/writing */ ++/* ============================= */ ++ ++enum legion_ec_powermode { ++ LEGION_EC_POWERMODE_QUIET = 2, ++ LEGION_EC_POWERMODE_BALANCED = 0, ++ LEGION_EC_POWERMODE_PERFORMANCE = 1, ++ LEGION_EC_POWERMODE_CUSTOM = 3 ++}; ++ ++enum legion_wmi_powermode { ++ LEGION_WMI_POWERMODE_QUIET = 1, ++ LEGION_WMI_POWERMODE_BALANCED = 2, ++ LEGION_WMI_POWERMODE_PERFORMANCE = 3, ++ LEGION_WMI_POWERMODE_CUSTOM = 255 ++}; ++ ++enum legion_wmi_powermode ec_to_wmi_powermode(int ec_mode) ++{ ++ switch (ec_mode) { ++ case LEGION_EC_POWERMODE_QUIET: ++ return LEGION_WMI_POWERMODE_QUIET; ++ case LEGION_EC_POWERMODE_BALANCED: ++ return LEGION_WMI_POWERMODE_BALANCED; ++ case LEGION_EC_POWERMODE_PERFORMANCE: ++ return LEGION_WMI_POWERMODE_PERFORMANCE; ++ case LEGION_EC_POWERMODE_CUSTOM: ++ return LEGION_WMI_POWERMODE_CUSTOM; ++ default: ++ return LEGION_WMI_POWERMODE_BALANCED; ++ } ++} ++ ++enum legion_ec_powermode wmi_to_ec_powermode(enum legion_wmi_powermode wmi_mode) ++{ ++ switch (wmi_mode) { ++ case LEGION_WMI_POWERMODE_QUIET: ++ return LEGION_EC_POWERMODE_QUIET; ++ case LEGION_WMI_POWERMODE_BALANCED: ++ return LEGION_EC_POWERMODE_BALANCED; ++ case LEGION_WMI_POWERMODE_PERFORMANCE: ++ return LEGION_EC_POWERMODE_PERFORMANCE; ++ case LEGION_WMI_POWERMODE_CUSTOM: ++ return LEGION_EC_POWERMODE_CUSTOM; ++ default: ++ return LEGION_EC_POWERMODE_BALANCED; ++ } ++} ++ ++static ssize_t ec_read_powermode(struct legion_private *priv, int *powermode) ++{ ++ *powermode = ++ ecram_read(&priv->ecram, priv->conf->registers->EXT_POWERMODE); ++ return 0; ++} ++ ++static ssize_t ec_write_powermode(struct legion_private *priv, u8 value) ++{ ++ if (!((value >= 0 && value <= 2) || value == 255)) { ++ pr_info("Unexpected power mode value ignored: %d\n", value); ++ return -ENOMEM; ++ } ++ ecram_write(&priv->ecram, priv->conf->registers->EXT_POWERMODE, value); ++ return 0; ++} ++ ++static ssize_t acpi_read_powermode(struct legion_private *priv, int *powermode) ++{ ++ unsigned long acpi_powermode; ++ int err; ++ ++ // spmo method not alwasy available ++ // \_SB.PCI0.LPC0.EC0.SPMO ++ err = eval_spmo(priv->adev->handle, &acpi_powermode); ++ *powermode = (int)acpi_powermode; ++ return err; ++} ++ ++static ssize_t wmi_read_powermode(int *powermode) ++{ ++ int err; ++ unsigned long res; ++ ++ err = wmi_exec_noarg_int(LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_GETSMARTFANMODE, &res); ++ ++ if (!err) ++ *powermode = res; ++ return err; ++} ++ ++static ssize_t wmi_write_powermode(u8 value) ++{ ++ if (!((value >= LEGION_WMI_POWERMODE_QUIET && ++ value <= LEGION_WMI_POWERMODE_PERFORMANCE) || ++ value == LEGION_WMI_POWERMODE_CUSTOM)) { ++ 
pr_info("Unexpected power mode value ignored: %d\n", value); ++ return -ENOMEM; ++ } ++ return wmi_exec_arg(LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_SETSMARTFANMODE, &value, ++ sizeof(value)); ++} ++ ++static ssize_t read_powermode(struct legion_private *priv, int *powermode) ++{ ++ ssize_t res; ++ ++ switch (priv->conf->access_method_powermode) { ++ case ACCESS_METHOD_EC: ++ res = ec_read_powermode(priv, powermode); ++ *powermode = ec_to_wmi_powermode(*powermode); ++ return res; ++ case ACCESS_METHOD_ACPI: ++ return acpi_read_powermode(priv, powermode); ++ case ACCESS_METHOD_WMI: ++ return wmi_read_powermode(powermode); ++ default: ++ pr_info("No access method for powermode:%d\n", ++ priv->conf->access_method_powermode); ++ return -EINVAL; ++ } ++} ++ ++static ssize_t write_powermode(struct legion_private *priv, ++ enum legion_wmi_powermode value) ++{ ++ ssize_t res; ++ ++ //TODO: remove again ++ pr_info("Set powermode\n"); ++ ++ switch (priv->conf->access_method_powermode) { ++ case ACCESS_METHOD_EC: ++ res = ec_write_powermode(priv, wmi_to_ec_powermode(value)); ++ return res; ++ case ACCESS_METHOD_WMI: ++ return wmi_write_powermode(value); ++ default: ++ pr_info("No access method for powermode:%d\n", ++ priv->conf->access_method_powermode); ++ return -EINVAL; ++ } ++} ++ ++/** ++ * Shortly toggle powermode to a different mode ++ * and switch back, e.g. to reset fan curve. ++ */ ++static void toggle_powermode(struct legion_private *priv) ++{ ++ int old_powermode; ++ int next_powermode; ++ ++ read_powermode(priv, &old_powermode); ++ next_powermode = old_powermode == 0 ? 1 : 0; ++ ++ write_powermode(priv, next_powermode); ++ mdelay(1500); ++ write_powermode(priv, old_powermode); ++} ++ ++/* ============================= */ ++/* Charging mode reading/writing */ ++/* ============================- */ ++ ++#define FCT_RAPID_CHARGE_ON 0x07 ++#define FCT_RAPID_CHARGE_OFF 0x08 ++#define RAPID_CHARGE_ON 0x0 ++#define RAPID_CHARGE_OFF 0x1 ++ ++static int acpi_read_rapidcharge(struct acpi_device *adev, bool *state) ++{ ++ unsigned long result; ++ int err; ++ ++ //also works? what is better? ++ /* ++ * err = eval_qcho(adev->handle, &result); ++ * if (err) ++ * return err; ++ * state = result; ++ * return 0; ++ */ ++ ++ err = eval_gbmd(adev->handle, &result); ++ if (err) ++ return err; ++ ++ *state = result & 0x04; ++ return 0; ++} ++ ++static int acpi_write_rapidcharge(struct acpi_device *adev, bool state) ++{ ++ int err; ++ unsigned long fct_nr = state > 0 ? 
FCT_RAPID_CHARGE_ON : ++ FCT_RAPID_CHARGE_OFF; ++ ++ err = exec_sbmc(adev->handle, fct_nr); ++ pr_info("Set rapidcharge to %d by calling %lu: result: %d\n", state, ++ fct_nr, err); ++ return err; ++} ++ ++/* ============================= */ ++/* Keyboard backlight read/write */ ++/* ============================= */ ++ ++static ssize_t legion_kbd_bl2_brightness_get(struct legion_private *priv) ++{ ++ unsigned long state = 0; ++ int err; ++ ++ err = wmi_exec_noarg_int(LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_GETKEYBOARDLIGHT, &state); ++ if (err) ++ return -EINVAL; ++ ++ return state; ++} ++ ++//static int legion_kbd_bl2_brightness_set(struct legion_private *priv, ++// unsigned int brightness) ++//{ ++// u8 in_param = brightness; ++ ++// return wmi_exec_arg(LEGION_WMI_GAMEZONE_GUID, 0, ++// WMI_METHOD_ID_SETKEYBOARDLIGHT, &in_param, ++// sizeof(in_param)); ++//} ++ ++//min: 1, max: 3 ++#define LIGHT_ID_KEYBOARD 0x00 ++//min: 0, max: 1 ++#define LIGHT_ID_YLOGO 0x03 ++//min: 1, max: 2 ++#define LIGHT_ID_IOPORT 0x05 ++ ++static int legion_wmi_light_get(struct legion_private *priv, u8 light_id, ++ unsigned int min_value, unsigned int max_value) ++{ ++ struct acpi_buffer params; ++ u8 in; ++ u8 result[2]; ++ u8 value; ++ int err; ++ ++ params.length = 1; ++ params.pointer = ∈ ++ in = light_id; ++ err = wmi_exec_ints(LEGION_WMI_KBBACKLIGHT_GUID, 0, ++ WMI_METHOD_ID_KBBACKLIGHTGET, ¶ms, result, ++ ARRAY_SIZE(result)); ++ if (err) { ++ pr_info("Error for WMI method call to get brightness\n"); ++ return -EIO; ++ } ++ ++ value = result[1]; ++ if (!(value >= min_value && value <= max_value)) { ++ pr_info("Error WMI call for reading brightness: expected a value between %u and %u, but got %d\n", ++ min_value, max_value, value); ++ return -EFAULT; ++ } ++ ++ return value - min_value; ++} ++ ++static int legion_wmi_light_set(struct legion_private *priv, u8 light_id, ++ unsigned int min_value, unsigned int max_value, ++ unsigned int brightness) ++{ ++ struct acpi_buffer buffer; ++ u8 in_buffer_param[8]; ++ unsigned long result; ++ int err; ++ ++ buffer.length = 3; ++ buffer.pointer = &in_buffer_param[0]; ++ in_buffer_param[0] = light_id; ++ in_buffer_param[1] = 0x01; ++ in_buffer_param[2] = ++ clamp(brightness + min_value, min_value, max_value); ++ ++ err = wmi_exec_int(LEGION_WMI_KBBACKLIGHT_GUID, 0, ++ WMI_METHOD_ID_KBBACKLIGHTSET, &buffer, &result); ++ if (err) { ++ pr_info("Error for WMI method call to set brightness on light: %d\n", ++ light_id); ++ return -EIO; ++ } ++ ++ return 0; ++} ++ ++static int legion_kbd_bl_brightness_get(struct legion_private *priv) ++{ ++ return legion_wmi_light_get(priv, LIGHT_ID_KEYBOARD, 1, 3); ++} ++ ++static int legion_kbd_bl_brightness_set(struct legion_private *priv, ++ unsigned int brightness) ++{ ++ return legion_wmi_light_set(priv, LIGHT_ID_KEYBOARD, 1, 3, brightness); +} + +/* ============================= */ @@ -9982,43 +12040,180 @@ index 000000000000..d1268d239cc5 + +DEFINE_SHOW_ATTRIBUTE(debugfs_ecmemory); + ++static int debugfs_ecmemoryram_show(struct seq_file *s, void *unused) ++{ ++ struct legion_private *priv = s->private; ++ size_t offset; ++ ssize_t err; ++ u8 value; ++ ++ for (offset = 0; offset < priv->conf->ramio_size; ++offset) { ++ err = ecram_memoryio_read(&priv->ec_memoryio, offset, &value); ++ if (!err) ++ seq_write(s, &value, 1); ++ else ++ return -EACCES; ++ } ++ return 0; ++} ++ ++DEFINE_SHOW_ATTRIBUTE(debugfs_ecmemoryram); ++ ++//TODO: make (almost) all methods static ++ ++static void seq_file_print_with_error(struct seq_file *s, const 
char *name, ++ ssize_t err, int value) ++{ ++ seq_printf(s, "%s error: %ld\n", name, err); ++ seq_printf(s, "%s: %d\n", name, value); ++} ++ +static int debugfs_fancurve_show(struct seq_file *s, void *unused) +{ + struct legion_private *priv = s->private; + bool is_minifancurve; + bool is_lockfancontroller; + bool is_maximumfanspeed; ++ bool is_rapidcharge = false; ++ int powermode; ++ int temperature; ++ int fanspeed; + int err; ++ unsigned long cfg; ++ struct fancurve wmi_fancurve; ++ //int kb_backlight; ++ ++ mutex_lock(&priv->fancurve_mutex); + + seq_printf(s, "EC Chip ID: %x\n", read_ec_id(&priv->ecram, priv->conf)); + seq_printf(s, "EC Chip Version: %x\n", + read_ec_version(&priv->ecram, priv->conf)); + seq_printf(s, "legion_laptop features: %s\n", LEGIONFEATURES); + seq_printf(s, "legion_laptop ec_readonly: %d\n", ec_readonly); -+ read_fancurve(&priv->ecram, priv->conf, &priv->fancurve); + -+ seq_printf(s, "minifancurve feature enabled: %d\n", ++ err = eval_int(priv->adev->handle, "VPC0._CFG", &cfg); ++ seq_printf(s, "ACPI CFG error: %d\n", err); ++ seq_printf(s, "ACPI CFG: %lu\n", cfg); ++ ++ seq_printf(s, "temperature access method: %d\n", ++ priv->conf->access_method_temperature); ++ err = read_temperature(priv, 0, &temperature); ++ seq_file_print_with_error(s, "CPU temperature", err, temperature); ++ err = ec_read_temperature(&priv->ecram, priv->conf, 0, &temperature); ++ seq_file_print_with_error(s, "CPU temperature EC", err, temperature); ++ err = acpi_read_temperature(priv, 0, &temperature); ++ seq_file_print_with_error(s, "CPU temperature ACPI", err, temperature); ++ err = wmi_read_temperature_gz(0, &temperature); ++ seq_file_print_with_error(s, "CPU temperature WMI", err, temperature); ++ err = wmi_read_temperature(0, &temperature); ++ seq_file_print_with_error(s, "CPU temperature WMI2", err, temperature); ++ err = wmi_read_temperature_other(0, &temperature); ++ seq_file_print_with_error(s, "CPU temperature WMI3", err, temperature); ++ ++ err = read_temperature(priv, 1, &temperature); ++ seq_file_print_with_error(s, "GPU temperature", err, temperature); ++ err = ec_read_temperature(&priv->ecram, priv->conf, 1, &temperature); ++ seq_file_print_with_error(s, "GPU temperature EC", err, temperature); ++ err = acpi_read_temperature(priv, 1, &temperature); ++ seq_file_print_with_error(s, "GPU temperature ACPI", err, temperature); ++ err = wmi_read_temperature_gz(1, &temperature); ++ seq_file_print_with_error(s, "GPU temperature WMI", err, temperature); ++ err = wmi_read_temperature(1, &temperature); ++ seq_file_print_with_error(s, "GPU temperature WMI2", err, temperature); ++ err = wmi_read_temperature_other(1, &temperature); ++ seq_file_print_with_error(s, "GPU temperature WMI3", err, temperature); ++ ++ seq_printf(s, "fan speed access method: %d\n", ++ priv->conf->access_method_fanspeed); ++ err = read_fanspeed(priv, 0, &fanspeed); ++ seq_file_print_with_error(s, "1 fanspeed", err, fanspeed); ++ err = ec_read_fanspeed(&priv->ecram, priv->conf, 0, &fanspeed); ++ seq_file_print_with_error(s, "1 fanspeed EC", err, fanspeed); ++ err = acpi_read_fanspeed(priv, 0, &fanspeed); ++ seq_file_print_with_error(s, "1 fanspeed ACPI", err, fanspeed); ++ err = wmi_read_fanspeed_gz(0, &fanspeed); ++ seq_file_print_with_error(s, "1 fanspeed WMI", err, fanspeed); ++ err = wmi_read_fanspeed(0, &fanspeed); ++ seq_file_print_with_error(s, "1 fanspeed WMI2", err, fanspeed); ++ err = wmi_read_fanspeed_other(0, &fanspeed); ++ seq_file_print_with_error(s, "1 fanspeed WMI3", err, fanspeed); ++ ++ err = 
read_fanspeed(priv, 1, &fanspeed); ++ seq_file_print_with_error(s, "2 fanspeed", err, fanspeed); ++ err = ec_read_fanspeed(&priv->ecram, priv->conf, 1, &fanspeed); ++ seq_file_print_with_error(s, "2 fanspeed EC", err, fanspeed); ++ err = acpi_read_fanspeed(priv, 1, &fanspeed); ++ seq_file_print_with_error(s, "2 fanspeed ACPI", err, fanspeed); ++ err = wmi_read_fanspeed_gz(1, &fanspeed); ++ seq_file_print_with_error(s, "2 fanspeed WMI", err, fanspeed); ++ err = wmi_read_fanspeed(1, &fanspeed); ++ seq_file_print_with_error(s, "2 fanspeed WMI2", err, fanspeed); ++ err = wmi_read_fanspeed_other(1, &fanspeed); ++ seq_file_print_with_error(s, "2 fanspeed WMI3", err, fanspeed); ++ ++ seq_printf(s, "powermode access method: %d\n", ++ priv->conf->access_method_powermode); ++ err = read_powermode(priv, &powermode); ++ seq_file_print_with_error(s, "powermode", err, powermode); ++ err = ec_read_powermode(priv, &powermode); ++ seq_file_print_with_error(s, "powermode EC", err, powermode); ++ err = acpi_read_powermode(priv, &powermode); ++ seq_file_print_with_error(s, "powermode ACPI", err, powermode); ++ err = wmi_read_powermode(&powermode); ++ seq_file_print_with_error(s, "powermode WMI", err, powermode); ++ seq_printf(s, "has custom powermode: %d\n", ++ priv->conf->has_custom_powermode); ++ ++ err = acpi_read_rapidcharge(priv->adev, &is_rapidcharge); ++ seq_printf(s, "ACPI rapidcharge error: %d\n", err); ++ seq_printf(s, "ACPI rapidcharge: %d\n", is_rapidcharge); ++ ++ seq_printf(s, "WMI backlight 2 state: %ld\n", ++ legion_kbd_bl2_brightness_get(priv)); ++ seq_printf(s, "WMI backlight 3 state: %d\n", ++ legion_kbd_bl_brightness_get(priv)); ++ ++ seq_printf(s, "WMI light IO port: %d\n", ++ legion_wmi_light_get(priv, LIGHT_ID_IOPORT, 0, 4)); ++ ++ seq_printf(s, "WMI light y logo/lid: %d\n", ++ legion_wmi_light_get(priv, LIGHT_ID_YLOGO, 0, 4)); ++ ++ seq_printf(s, "EC minifancurve feature enabled: %d\n", + priv->conf->has_minifancurve); -+ err = read_minifancurve(&priv->ecram, priv->conf, &is_minifancurve); -+ seq_printf(s, "minifancurve on cool: %s\n", ++ err = ec_read_minifancurve(&priv->ecram, priv->conf, &is_minifancurve); ++ seq_printf(s, "EC minifancurve on cool: %s\n", + err ? "error" : (is_minifancurve ? "true" : "false")); -+ err = read_lockfancontroller(&priv->ecram, priv->conf, -+ &is_lockfancontroller); -+ seq_printf(s, "lock fan controller: %s\n", ++ ++ err = ec_read_lockfancontroller(&priv->ecram, priv->conf, ++ &is_lockfancontroller); ++ seq_printf(s, "EC lockfancontroller error: %d\n", err); ++ seq_printf(s, "EC lockfancontroller: %s\n", + err ? "error" : (is_lockfancontroller ? "true" : "false")); -+ err = read_maximumfanspeed(&priv->ecram, priv->conf, ++ ++ err = read_fanfullspeed(priv, &is_maximumfanspeed); ++ seq_file_print_with_error(s, "fanfullspeed", err, is_maximumfanspeed); ++ ++ err = ec_read_fanfullspeed(&priv->ecram, priv->conf, + &is_maximumfanspeed); -+ seq_printf(s, "enable maximumfanspeed: %s\n", -+ err ? "error" : (is_maximumfanspeed ? 
"true" : "false")); -+ seq_printf(s, "enable maximumfanspeed status: %d\n", err); ++ seq_file_print_with_error(s, "fanfullspeed EC", err, ++ is_maximumfanspeed); + -+ seq_printf(s, "fan curve current point id: %ld\n", ++ read_fancurve(priv, &priv->fancurve); ++ seq_printf(s, "EC fan curve current point id: %ld\n", + priv->fancurve.current_point_i); -+ seq_printf(s, "fan curve points size: %ld\n", priv->fancurve.size); ++ seq_printf(s, "EC fan curve points size: %ld\n", priv->fancurve.size); + -+ seq_puts(s, "Current fan curve in hardware (embedded controller):\n"); ++ seq_puts(s, "Current fan curve in hardware:\n"); + fancurve_print_seqfile(&priv->fancurve, s); + seq_puts(s, "=====================\n"); ++ mutex_unlock(&priv->fancurve_mutex); ++ ++ seq_puts(s, "Current fan curve in hardware (WMI; might be empty)\n"); ++ wmi_fancurve.size = 0; ++ err = wmi_read_fancurve_custom(priv->conf, &wmi_fancurve); ++ fancurve_print_seqfile(&wmi_fancurve, s); ++ seq_puts(s, "=====================\n"); + return 0; +} + @@ -10040,6 +12235,8 @@ index 000000000000..d1268d239cc5 + &debugfs_fancurve_fops); + debugfs_create_file("ecmemory", 0444, dir, priv, + &debugfs_ecmemory_fops); ++ debugfs_create_file("ecmemoryram", 0444, dir, priv, ++ &debugfs_ecmemoryram_fops); + + priv->debugfs_dir = dir; +} @@ -10057,42 +12254,77 @@ index 000000000000..d1268d239cc5 +/* sysfs interface */ +/* ============================ */ + -+static ssize_t powermode_show(struct device *dev, struct device_attribute *attr, -+ char *buf) ++static int show_simple_wmi_attribute(struct device *dev, ++ struct device_attribute *attr, char *buf, ++ const char *guid, u8 instance, ++ u32 method_id, bool invert, ++ unsigned long scale) +{ -+ struct legion_private *priv = dev_get_drvdata(dev); -+ int power_mode = read_powermode(&priv->ecram, priv->conf); -+ -+ return sysfs_emit(buf, "%d\n", power_mode); -+} -+ -+static ssize_t powermode_store(struct device *dev, -+ struct device_attribute *attr, const char *buf, -+ size_t count) -+{ -+ struct legion_private *priv = dev_get_drvdata(dev); -+ int powermode; ++ unsigned long state = 0; + int err; ++ struct legion_private *priv = dev_get_drvdata(dev); + -+ err = kstrtouint(buf, 0, &powermode); -+ if (err) -+ return err; ++ mutex_lock(&priv->fancurve_mutex); ++ err = get_simple_wmi_attribute(priv, guid, instance, method_id, invert, ++ scale, &state); ++ mutex_unlock(&priv->fancurve_mutex); + -+ err = write_powermode(&priv->ecram, priv->conf, powermode); + if (err) + return -EINVAL; + -+ // TODO: better? 
-+ // we have to wait a bit before change is done in hardware and -+ // readback done after notifying returns correct value, otherwise -+ // the notified reader will read old value -+ msleep(500); -+ platform_profile_notify(); -+ -+ return count; ++ return sysfs_emit(buf, "%lu\n", state); +} + -+static DEVICE_ATTR_RW(powermode); ++static int show_simple_wmi_attribute_from_buffer(struct device *dev, ++ struct device_attribute *attr, ++ char *buf, const char *guid, ++ u8 instance, u32 method_id, ++ size_t ressize, size_t i, ++ int scale) ++{ ++ u8 res[16]; ++ int err; ++ int out; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ if (ressize > ARRAY_SIZE(res)) { ++ pr_info("Buffer to small for WMI result\n"); ++ return -EINVAL; ++ } ++ if (i >= ressize) { ++ pr_info("Index not within buffer size\n"); ++ return -EINVAL; ++ } ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = wmi_exec_noarg_ints(guid, instance, method_id, res, ressize); ++ mutex_unlock(&priv->fancurve_mutex); ++ if (err) ++ return -EINVAL; ++ ++ out = scale * res[i]; ++ return sysfs_emit(buf, "%d\n", out); ++} ++ ++static int store_simple_wmi_attribute(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count, ++ const char *guid, u8 instance, ++ u32 method_id, bool invert, int scale) ++{ ++ int state; ++ int err; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ err = kstrtouint(buf, 0, &state); ++ if (err) ++ return err; ++ err = set_simple_wmi_attribute(priv, guid, instance, method_id, invert, ++ scale, state); ++ if (err) ++ return err; ++ return count; ++} + +static ssize_t lockfancontroller_show(struct device *dev, + struct device_attribute *attr, char *buf) @@ -10102,8 +12334,8 @@ index 000000000000..d1268d239cc5 + int err; + + mutex_lock(&priv->fancurve_mutex); -+ err = read_lockfancontroller(&priv->ecram, priv->conf, -+ &is_lockfancontroller); ++ err = ec_read_lockfancontroller(&priv->ecram, priv->conf, ++ &is_lockfancontroller); + mutex_unlock(&priv->fancurve_mutex); + if (err) + return -EINVAL; @@ -10124,8 +12356,8 @@ index 000000000000..d1268d239cc5 + return err; + + mutex_lock(&priv->fancurve_mutex); -+ err = write_lockfancontroller(&priv->ecram, priv->conf, -+ is_lockfancontroller); ++ err = ec_write_lockfancontroller(&priv->ecram, priv->conf, ++ is_lockfancontroller); + mutex_unlock(&priv->fancurve_mutex); + if (err) + return -EINVAL; @@ -10135,19 +12367,25 @@ index 000000000000..d1268d239cc5 + +static DEVICE_ATTR_RW(lockfancontroller); + -+static ssize_t keyboard_backlight_show(struct device *dev, -+ struct device_attribute *attr, char *buf) ++static ssize_t rapidcharge_show(struct device *dev, ++ struct device_attribute *attr, char *buf) +{ -+ int state; ++ bool state = false; ++ int err; + struct legion_private *priv = dev_get_drvdata(dev); + -+ read_keyboard_backlight(&priv->ecram, priv->conf, &state); ++ mutex_lock(&priv->fancurve_mutex); ++ err = acpi_read_rapidcharge(priv->adev, &state); ++ mutex_unlock(&priv->fancurve_mutex); ++ if (err) ++ return -EINVAL; ++ + return sysfs_emit(buf, "%d\n", state); +} + -+static ssize_t keyboard_backlight_store(struct device *dev, -+ struct device_attribute *attr, -+ const char *buf, size_t count) ++static ssize_t rapidcharge_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, ++ size_t count) +{ + struct legion_private *priv = dev_get_drvdata(dev); + int state; @@ -10157,18 +12395,571 @@ index 000000000000..d1268d239cc5 + if (err) + return err; + -+ err = write_keyboard_backlight(&priv->ecram, 
priv->conf, state); ++ mutex_lock(&priv->fancurve_mutex); ++ err = acpi_write_rapidcharge(priv->adev, state); ++ mutex_unlock(&priv->fancurve_mutex); + if (err) + return -EINVAL; + + return count; +} + -+static DEVICE_ATTR_RW(keyboard_backlight); ++static DEVICE_ATTR_RW(rapidcharge); ++ ++static ssize_t issupportgpuoc_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_ISSUPPORTGPUOC, false, ++ 1); ++} ++ ++static DEVICE_ATTR_RO(issupportgpuoc); ++ ++static ssize_t aslcodeversion_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_GETVERSION, false, 1); ++} ++ ++static DEVICE_ATTR_RO(aslcodeversion); ++ ++static ssize_t issupportcpuoc_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_ISSUPPORTCPUOC, false, ++ 1); ++} ++ ++static DEVICE_ATTR_RO(issupportcpuoc); ++ ++static ssize_t winkey_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_GETWINKEYSTATUS, true, ++ 1); ++} ++ ++static ssize_t winkey_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute(dev, attr, buf, count, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_SETWINKEYSTATUS, true, ++ 1); ++} ++ ++static DEVICE_ATTR_RW(winkey); ++ ++// on newer models the touchpad feature in ideapad does not work anymore, so ++// we need this ++static ssize_t touchpad_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_GETTPSTATUS, true, 1); ++} ++ ++static ssize_t touchpad_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute(dev, attr, buf, count, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_SETTPSTATUS, true, 1); ++} ++ ++static DEVICE_ATTR_RW(touchpad); ++ ++static ssize_t gsync_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_GETGSYNCSTATUS, true, 1); ++} ++ ++static ssize_t gsync_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute(dev, attr, buf, count, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_SETGSYNCSTATUS, true, ++ 1); ++} ++ ++static DEVICE_ATTR_RW(gsync); ++ ++static ssize_t powerchargemode_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_GETPOWERCHARGEMODE, ++ false, 1); ++} ++static DEVICE_ATTR_RO(powerchargemode); ++ ++static ssize_t overdrive_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_GETODSTATUS, false, 1); ++} ++ ++static ssize_t overdrive_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, ++ size_t count) ++{ ++ return store_simple_wmi_attribute(dev, attr, buf, count, ++ 
LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_SETODSTATUS, false, 1); ++} ++ ++static DEVICE_ATTR_RW(overdrive); ++ ++static ssize_t thermalmode_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_GETTHERMALMODE, false, ++ 1); ++} ++static DEVICE_ATTR_RO(thermalmode); ++ ++// TOOD: probably remove again because provided by other means; only useful for overclocking ++static ssize_t cpumaxfrequency_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_GETCPUMAXFREQUENCY, ++ false, 1); ++} ++static DEVICE_ATTR_RO(cpumaxfrequency); ++ ++static ssize_t isacfitforoc_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_ISACFITFOROC, false, 1); ++} ++static DEVICE_ATTR_RO(isacfitforoc); ++ ++static ssize_t igpumode_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_GETIGPUMODESTATUS, false, ++ 1); ++} ++ ++static ssize_t igpumode_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute(dev, attr, buf, count, ++ LEGION_WMI_GAMEZONE_GUID, 0, ++ WMI_METHOD_ID_SETIGPUMODESTATUS, ++ false, 1); ++} ++ ++static DEVICE_ATTR_RW(igpumode); ++ ++static ssize_t cpu_oc_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute_from_buffer( ++ dev, attr, buf, WMI_GUID_LENOVO_CPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_GET_OC_STATUS, 16, 0, 1); ++} ++ ++static ssize_t cpu_oc_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute(dev, attr, buf, count, ++ WMI_GUID_LENOVO_CPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_SET_OC_STATUS, ++ false, 1); ++} ++ ++static DEVICE_ATTR_RW(cpu_oc); ++ ++static ssize_t cpu_shortterm_powerlimit_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute_from_buffer( ++ dev, attr, buf, WMI_GUID_LENOVO_CPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_GET_SHORTTERM_POWERLIMIT, 16, 0, 1); ++} ++ ++static ssize_t cpu_shortterm_powerlimit_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute( ++ dev, attr, buf, count, WMI_GUID_LENOVO_CPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_SET_SHORTTERM_POWERLIMIT, false, 1); ++} ++ ++static DEVICE_ATTR_RW(cpu_shortterm_powerlimit); ++ ++static ssize_t cpu_longterm_powerlimit_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute_from_buffer( ++ dev, attr, buf, WMI_GUID_LENOVO_CPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_GET_LONGTERM_POWERLIMIT, 16, 0, 1); ++} ++ ++static ssize_t cpu_longterm_powerlimit_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute( ++ dev, attr, buf, count, WMI_GUID_LENOVO_CPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_SET_LONGTERM_POWERLIMIT, false, 1); ++} ++ ++static DEVICE_ATTR_RW(cpu_longterm_powerlimit); ++ ++static ssize_t cpu_default_powerlimit_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ 
return show_simple_wmi_attribute( ++ dev, attr, buf, WMI_GUID_LENOVO_CPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_GET_DEFAULT_POWERLIMIT, false, 1); ++} ++ ++static DEVICE_ATTR_RO(cpu_default_powerlimit); ++ ++static ssize_t cpu_peak_powerlimit_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_GET_PEAK_POWERLIMIT, ++ false, 1); ++} ++ ++static ssize_t cpu_peak_powerlimit_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute(dev, attr, buf, count, ++ WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_SET_PEAK_POWERLIMIT, ++ false, 1); ++} ++ ++static DEVICE_ATTR_RW(cpu_peak_powerlimit); ++ ++static ssize_t cpu_apu_sppt_powerlimit_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute( ++ dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_GET_APU_SPPT_POWERLIMIT, false, 1); ++} ++ ++static ssize_t cpu_apu_sppt_powerlimit_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute( ++ dev, attr, buf, count, WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_SET_APU_SPPT_POWERLIMIT, false, 1); ++} ++ ++static DEVICE_ATTR_RW(cpu_apu_sppt_powerlimit); ++ ++static ssize_t cpu_cross_loading_powerlimit_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute( ++ dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_GET_CROSS_LOADING_POWERLIMIT, false, 1); ++} ++ ++static ssize_t cpu_cross_loading_powerlimit_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute( ++ dev, attr, buf, count, WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_CPU_SET_CROSS_LOADING_POWERLIMIT, false, 1); ++} ++ ++static DEVICE_ATTR_RW(cpu_cross_loading_powerlimit); ++ ++static ssize_t gpu_oc_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_GPU_GET_OC_STATUS, false, ++ 1); ++} ++ ++static ssize_t gpu_oc_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute(dev, attr, buf, count, ++ WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_GPU_SET_OC_STATUS, ++ false, 1); ++} ++ ++static DEVICE_ATTR_RW(gpu_oc); ++ ++static ssize_t gpu_ppab_powerlimit_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute_from_buffer( ++ dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_GPU_GET_PPAB_POWERLIMIT, 16, 0, 1); ++} ++ ++static ssize_t gpu_ppab_powerlimit_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute(dev, attr, buf, count, ++ WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_GPU_SET_PPAB_POWERLIMIT, ++ false, 1); ++} ++ ++static DEVICE_ATTR_RW(gpu_ppab_powerlimit); ++ ++static ssize_t gpu_ctgp_powerlimit_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute_from_buffer( ++ dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_GPU_GET_CTGP_POWERLIMIT, 16, 0, 1); ++} ++ ++static ssize_t gpu_ctgp_powerlimit_store(struct 
device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute(dev, attr, buf, count, ++ WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_GPU_SET_CTGP_POWERLIMIT, ++ false, 1); ++} ++ ++static DEVICE_ATTR_RW(gpu_ctgp_powerlimit); ++ ++static ssize_t gpu_ctgp2_powerlimit_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute_from_buffer( ++ dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_GPU_GET_CTGP_POWERLIMIT, 16, 0x0C, 1); ++} ++ ++static DEVICE_ATTR_RO(gpu_ctgp2_powerlimit); ++ ++// TOOD: probably remove again because provided by other means; only useful for overclocking ++static ssize_t ++gpu_default_ppab_ctrgp_powerlimit_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return show_simple_wmi_attribute( ++ dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_GPU_GET_DEFAULT_PPAB_CTGP_POWERLIMIT, false, 1); ++} ++static DEVICE_ATTR_RO(gpu_default_ppab_ctrgp_powerlimit); ++ ++static ssize_t gpu_temperature_limit_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ return show_simple_wmi_attribute( ++ dev, attr, buf, WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_GPU_GET_TEMPERATURE_LIMIT, false, 1); ++} ++ ++static ssize_t gpu_temperature_limit_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute( ++ dev, attr, buf, count, WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_GPU_SET_TEMPERATURE_LIMIT, false, 1); ++} ++ ++static DEVICE_ATTR_RW(gpu_temperature_limit); ++ ++// TOOD: probably remove again because provided by other means; only useful for overclocking ++static ssize_t gpu_boost_clock_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ WMI_GUID_LENOVO_GPU_METHOD, 0, ++ WMI_METHOD_ID_GPU_GET_BOOST_CLOCK, ++ false, 1); ++} ++static DEVICE_ATTR_RO(gpu_boost_clock); ++ ++static ssize_t fan_fullspeed_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ bool state = false; ++ int err; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = read_fanfullspeed(priv, &state); ++ mutex_unlock(&priv->fancurve_mutex); ++ if (err) ++ return -EINVAL; ++ ++ return sysfs_emit(buf, "%d\n", state); ++} ++ ++static ssize_t fan_fullspeed_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int state; ++ int err; ++ ++ err = kstrtouint(buf, 0, &state); ++ if (err) ++ return err; ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = write_fanfullspeed(priv, state); ++ mutex_unlock(&priv->fancurve_mutex); ++ if (err) ++ return -EINVAL; ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(fan_fullspeed); ++ ++static ssize_t fan_maxspeed_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return show_simple_wmi_attribute(dev, attr, buf, ++ WMI_GUID_LENOVO_FAN_METHOD, 0, ++ WMI_METHOD_ID_FAN_GET_MAXSPEED, false, ++ 1); ++} ++ ++static ssize_t fan_maxspeed_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ return store_simple_wmi_attribute(dev, attr, buf, count, ++ WMI_GUID_LENOVO_FAN_METHOD, 0, ++ WMI_METHOD_ID_FAN_SET_MAXSPEED, false, ++ 1); ++} ++ ++static DEVICE_ATTR_RW(fan_maxspeed); ++ ++static ssize_t 
powermode_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int power_mode; ++ ++ mutex_lock(&priv->fancurve_mutex); ++ read_powermode(priv, &power_mode); ++ mutex_unlock(&priv->fancurve_mutex); ++ return sysfs_emit(buf, "%d\n", power_mode); ++} ++ ++static void legion_platform_profile_notify(void); ++ ++static ssize_t powermode_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, ++ size_t count) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int powermode; ++ int err; ++ ++ err = kstrtouint(buf, 0, &powermode); ++ if (err) ++ return err; ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = write_powermode(priv, powermode); ++ mutex_unlock(&priv->fancurve_mutex); ++ if (err) ++ return -EINVAL; ++ ++ // TODO: better? ++ // we have to wait a bit before change is done in hardware and ++ // readback done after notifying returns correct value, otherwise ++ // the notified reader will read old value ++ msleep(500); ++ legion_platform_profile_notify(); ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(powermode); + +static struct attribute *legion_sysfs_attributes[] = { -+ &dev_attr_powermode.attr, &dev_attr_lockfancontroller.attr, -+ &dev_attr_keyboard_backlight.attr, NULL ++ &dev_attr_powermode.attr, ++ &dev_attr_lockfancontroller.attr, ++ &dev_attr_rapidcharge.attr, ++ &dev_attr_winkey.attr, ++ &dev_attr_touchpad.attr, ++ &dev_attr_gsync.attr, ++ &dev_attr_powerchargemode.attr, ++ &dev_attr_overdrive.attr, ++ &dev_attr_cpumaxfrequency.attr, ++ &dev_attr_isacfitforoc.attr, ++ &dev_attr_cpu_oc.attr, ++ &dev_attr_cpu_shortterm_powerlimit.attr, ++ &dev_attr_cpu_longterm_powerlimit.attr, ++ &dev_attr_cpu_apu_sppt_powerlimit.attr, ++ &dev_attr_cpu_default_powerlimit.attr, ++ &dev_attr_cpu_peak_powerlimit.attr, ++ &dev_attr_cpu_cross_loading_powerlimit.attr, ++ &dev_attr_gpu_oc.attr, ++ &dev_attr_gpu_ppab_powerlimit.attr, ++ &dev_attr_gpu_ctgp_powerlimit.attr, ++ &dev_attr_gpu_ctgp2_powerlimit.attr, ++ &dev_attr_gpu_default_ppab_ctrgp_powerlimit.attr, ++ &dev_attr_gpu_temperature_limit.attr, ++ &dev_attr_gpu_boost_clock.attr, ++ &dev_attr_fan_fullspeed.attr, ++ &dev_attr_fan_maxspeed.attr, ++ &dev_attr_thermalmode.attr, ++ &dev_attr_issupportcpuoc.attr, ++ &dev_attr_issupportgpuoc.attr, ++ &dev_attr_aslcodeversion.attr, ++ &dev_attr_igpumode.attr, ++ NULL +}; + +static const struct attribute_group legion_attribute_group = { @@ -10233,7 +13024,7 @@ index 000000000000..d1268d239cc5 + pr_info("Fan event: legion type: %d; acpi type: %d (%d=integer)", + wpriv->event, data->type, ACPI_TYPE_INTEGER); + // TODO: here it is too early (first unlock mutext, then wait a bit) -+ //platform_profile_notify(); ++ //legion_platform_profile_notify(); + break; + default: + pr_info("Event: legion type: %d; acpi type: %d (%d=integer)", @@ -10247,7 +13038,7 @@ index 000000000000..d1268d239cc5 + // problem: we get a event just before the powermode change (from the key?), + // so if we notify to early, it will read the old power mode/platform profile + msleep(500); -+ platform_profile_notify(); ++ legion_platform_profile_notify(); +} + +static int legion_wmi_probe(struct wmi_device *wdev, const void *context) @@ -10287,9 +13078,6 @@ index 000000000000..d1268d239cc5 + .event = LEGION_EVENT_F +}; + -+// check if really a method -+#define LEGION_WMI_GAMEZONE_GUID "887B54E3-DDDC-4B2C-8B88-68A26A8835D0" -+ +#define LEGION_WMI_GUID_FAN_EVENT "D320289E-8FEA-41E0-86F9-611D83151B5F" +#define 
LEGION_WMI_GUID_FAN2_EVENT "bc72a435-e8c1-4275-b3e2-d8b8074aba59" +#define LEGION_WMI_GUID_GAMEZONE_KEY_EVENT \ @@ -10350,11 +13138,13 @@ index 000000000000..d1268d239cc5 +/* Platform profile */ +/* ============================ */ + -+enum LEGION_POWERMODE { -+ LEGION_POWERMODE_BALANCED = 0, -+ LEGION_POWERMODE_PERFORMANCE = 1, -+ LEGION_POWERMODE_QUIET = 2, -+}; ++static void legion_platform_profile_notify(void) ++{ ++ if (!enable_platformprofile) ++ pr_info("Skipping platform_profile_notify because enable_platformprofile is false\n"); ++ ++ platform_profile_notify(); ++} + +static int legion_platform_profile_get(struct platform_profile_handler *pprof, + enum platform_profile_option *profile) @@ -10364,18 +13154,21 @@ index 000000000000..d1268d239cc5 + + priv = container_of(pprof, struct legion_private, + platform_profile_handler); -+ powermode = read_powermode(&priv->ecram, priv->conf); ++ read_powermode(priv, &powermode); + + switch (powermode) { -+ case LEGION_POWERMODE_BALANCED: ++ case LEGION_WMI_POWERMODE_BALANCED: + *profile = PLATFORM_PROFILE_BALANCED; + break; -+ case LEGION_POWERMODE_PERFORMANCE: ++ case LEGION_WMI_POWERMODE_PERFORMANCE: + *profile = PLATFORM_PROFILE_PERFORMANCE; + break; -+ case LEGION_POWERMODE_QUIET: ++ case LEGION_WMI_POWERMODE_QUIET: + *profile = PLATFORM_PROFILE_QUIET; + break; ++ case LEGION_WMI_POWERMODE_CUSTOM: ++ *profile = PLATFORM_PROFILE_BALANCED_PERFORMANCE; ++ break; + default: + return -EINVAL; + } @@ -10393,25 +13186,33 @@ index 000000000000..d1268d239cc5 + + switch (profile) { + case PLATFORM_PROFILE_BALANCED: -+ powermode = LEGION_POWERMODE_BALANCED; ++ powermode = LEGION_WMI_POWERMODE_BALANCED; + break; + case PLATFORM_PROFILE_PERFORMANCE: -+ powermode = LEGION_POWERMODE_PERFORMANCE; ++ powermode = LEGION_WMI_POWERMODE_PERFORMANCE; + break; + case PLATFORM_PROFILE_QUIET: -+ powermode = LEGION_POWERMODE_QUIET; ++ powermode = LEGION_WMI_POWERMODE_QUIET; ++ break; ++ case PLATFORM_PROFILE_BALANCED_PERFORMANCE: ++ powermode = LEGION_WMI_POWERMODE_CUSTOM; + break; + default: + return -EOPNOTSUPP; + } + -+ return write_powermode(&priv->ecram, priv->conf, powermode); ++ return write_powermode(priv, powermode); +} + +static int legion_platform_profile_init(struct legion_private *priv) +{ + int err; + ++ if (!enable_platformprofile) { ++ pr_info("Skipping creating platform profile support because enable_platformprofile is false\n"); ++ return 0; ++ } ++ + priv->platform_profile_handler.profile_get = + legion_platform_profile_get; + priv->platform_profile_handler.profile_set = @@ -10422,6 +13223,11 @@ index 000000000000..d1268d239cc5 + priv->platform_profile_handler.choices); + set_bit(PLATFORM_PROFILE_PERFORMANCE, + priv->platform_profile_handler.choices); ++ if (priv->conf->has_custom_powermode && ++ priv->conf->access_method_powermode == ACCESS_METHOD_WMI) { ++ set_bit(PLATFORM_PROFILE_BALANCED_PERFORMANCE, ++ priv->platform_profile_handler.choices); ++ } + + err = platform_profile_register(&priv->platform_profile_handler); + if (err) @@ -10432,6 +13238,10 @@ index 000000000000..d1268d239cc5 + +static void legion_platform_profile_exit(struct legion_private *priv) +{ ++ if (!enable_platformprofile) { ++ pr_info("Skipping unloading platform profile support because enable_platformprofile is false\n"); ++ return; ++ } + pr_info("Unloading legion platform profile\n"); + platform_profile_remove(); + pr_info("Unloading legion platform profile done\n"); @@ -10489,35 +13299,44 @@ index 000000000000..d1268d239cc5 + int sensor_id = 
(to_sensor_dev_attr(devattr))->index; + struct sensor_values values; + int outval; -+ -+ read_sensor_values(&priv->ecram, priv->conf, &values); ++ int err = -EIO; + + switch (sensor_id) { + case SENSOR_CPU_TEMP_ID: -+ outval = 1000 * values.cpu_temp_celsius; ++ err = read_temperature(priv, 0, &outval); ++ outval *= 1000; + break; + case SENSOR_GPU_TEMP_ID: -+ outval = 1000 * values.gpu_temp_celsius; ++ err = read_temperature(priv, 1, &outval); ++ outval *= 1000; + break; + case SENSOR_IC_TEMP_ID: ++ ec_read_sensor_values(&priv->ecram, priv->conf, &values); + outval = 1000 * values.ic_temp_celsius; ++ err = 0; + break; + case SENSOR_FAN1_RPM_ID: -+ outval = values.fan1_rpm; ++ err = read_fanspeed(priv, 0, &outval); + break; + case SENSOR_FAN2_RPM_ID: -+ outval = values.fan2_rpm; ++ err = read_fanspeed(priv, 1, &outval); + break; + case SENSOR_FAN1_TARGET_RPM_ID: ++ ec_read_sensor_values(&priv->ecram, priv->conf, &values); + outval = values.fan1_target_rpm; ++ err = 0; + break; + case SENSOR_FAN2_TARGET_RPM_ID: ++ ec_read_sensor_values(&priv->ecram, priv->conf, &values); + outval = values.fan2_target_rpm; ++ err = 0; + break; + default: + pr_info("Error reading sensor value with id %d\n", sensor_id); + return -EOPNOTSUPP; + } ++ if (err) ++ return err; + + return sprintf(buf, "%d\n", outval); +} @@ -10562,7 +13381,7 @@ index 000000000000..d1268d239cc5 + int point_id = to_sensor_dev_attr_2(devattr)->index; + + mutex_lock(&priv->fancurve_mutex); -+ err = read_fancurve(&priv->ecram, priv->conf, &fancurve); ++ err = read_fancurve(priv, &fancurve); + mutex_unlock(&priv->fancurve_mutex); + + if (err) { @@ -10629,6 +13448,7 @@ index 000000000000..d1268d239cc5 + struct legion_private *priv = dev_get_drvdata(dev); + int fancurve_attr_id = to_sensor_dev_attr_2(devattr)->nr; + int point_id = to_sensor_dev_attr_2(devattr)->index; ++ bool write_fancurve_size = false; + + if (!(point_id >= 0 && point_id < MAXFANCURVESIZE)) { + pr_info("Reading fancurve failed due to wrong point id: %d\n", @@ -10645,7 +13465,7 @@ index 000000000000..d1268d239cc5 + } + + mutex_lock(&priv->fancurve_mutex); -+ err = read_fancurve(&priv->ecram, priv->conf, &fancurve); ++ err = read_fancurve(priv, &fancurve); + + if (err) { + pr_info("Reading fancurve failed\n"); @@ -10686,6 +13506,7 @@ index 000000000000..d1268d239cc5 + break; + case FANCURVE_SIZE: + valid = fancurve_set_size(&fancurve, value, true); ++ write_fancurve_size = true; + break; + default: + pr_info("Writing fancurve failed due to wrong attribute id: %d\n", @@ -10701,7 +13522,7 @@ index 000000000000..d1268d239cc5 + goto error_mutex; + } + -+ err = write_fancurve(&priv->ecram, priv->conf, &fancurve, false); ++ err = write_fancurve(priv, &fancurve, write_fancurve_size); + if (err) { + pr_info("Writing fancurve failed for accessing hwmon at point_id: %d\n", + point_id); @@ -10939,7 +13760,7 @@ index 000000000000..d1268d239cc5 + struct legion_private *priv = dev_get_drvdata(dev); + + mutex_lock(&priv->fancurve_mutex); -+ err = read_minifancurve(&priv->ecram, priv->conf, &value); ++ err = ec_read_minifancurve(&priv->ecram, priv->conf, &value); + if (err) { + err = -1; + pr_info("Reading minifancurve not succesful\n"); @@ -10970,7 +13791,7 @@ index 000000000000..d1268d239cc5 + } + + mutex_lock(&priv->fancurve_mutex); -+ err = write_minifancurve(&priv->ecram, priv->conf, value); ++ err = ec_write_minifancurve(&priv->ecram, priv->conf, value); + if (err) { + err = -1; + pr_info("Writing minifancurve not succesful\n"); @@ -10995,7 +13816,7 @@ index 000000000000..d1268d239cc5 
+ struct legion_private *priv = dev_get_drvdata(dev); + + mutex_lock(&priv->fancurve_mutex); -+ err = read_maximumfanspeed(&priv->ecram, priv->conf, &value); ++ err = ec_read_fanfullspeed(&priv->ecram, priv->conf, &value); + if (err) { + err = -1; + pr_info("Reading pwm1_mode/maximumfanspeed not succesful\n"); @@ -11009,6 +13830,7 @@ index 000000000000..d1268d239cc5 + return -1; +} + ++// TODO: remove? or use WMI method? +static ssize_t pwm1_mode_store(struct device *dev, + struct device_attribute *devattr, + const char *buf, size_t count) @@ -11028,7 +13850,7 @@ index 000000000000..d1268d239cc5 + is_maximumfanspeed = value == 0; + + mutex_lock(&priv->fancurve_mutex); -+ err = write_maximumfanspeed(&priv->ecram, priv->conf, ++ err = ec_write_fanfullspeed(&priv->ecram, priv->conf, + is_maximumfanspeed); + if (err) { + err = -1; @@ -11153,8 +13975,8 @@ index 000000000000..d1268d239cc5 + &sensor_dev_attr_pwm1_mode.dev_attr.attr, NULL +}; + -+static umode_t legion_is_visible(struct kobject *kobj, struct attribute *attr, -+ int idx) ++static umode_t legion_hwmon_is_visible(struct kobject *kobj, ++ struct attribute *attr, int idx) +{ + bool supported = true; + struct device *dev = kobj_to_dev(kobj); @@ -11163,6 +13985,9 @@ index 000000000000..d1268d239cc5 + if (attr == &sensor_dev_attr_minifancurve.dev_attr.attr) + supported = priv->conf->has_minifancurve; + ++ supported = supported && (priv->conf->access_method_fancurve != ++ ACCESS_METHOD_NO_ACCESS); ++ + return supported ? attr->mode : 0; +} + @@ -11173,14 +13998,14 @@ index 000000000000..d1268d239cc5 + +static const struct attribute_group legion_hwmon_fancurve_group = { + .attrs = fancurve_hwmon_attributes, -+ .is_visible = legion_is_visible, ++ .is_visible = legion_hwmon_is_visible, +}; + +static const struct attribute_group *legion_hwmon_groups[] = { + &legion_hwmon_sensor_group, &legion_hwmon_fancurve_group, NULL +}; + -+ssize_t legion_hwmon_init(struct legion_private *priv) ++static ssize_t legion_hwmon_init(struct legion_private *priv) +{ + //TODO: use hwmon_device_register_with_groups or + // hwmon_device_register_with_info (latter means all hwmon functions have to be @@ -11200,7 +14025,7 @@ index 000000000000..d1268d239cc5 + return 0; +} + -+void legion_hwmon_exit(struct legion_private *priv) ++static void legion_hwmon_exit(struct legion_private *priv) +{ + pr_info("Unloading legion hwon\n"); + if (priv->hwmon_dev) { @@ -11210,16 +14035,223 @@ index 000000000000..d1268d239cc5 + pr_info("Unloading legion hwon done\n"); +} + ++/* ACPI*/ ++ ++static int acpi_init(struct legion_private *priv, struct acpi_device *adev) ++{ ++ int err; ++ unsigned long cfg; ++ bool skip_acpi_sta_check; ++ struct device *dev = &priv->platform_device->dev; ++ ++ priv->adev = adev; ++ if (!priv->adev) { ++ dev_info(dev, "Could not get ACPI handle\n"); ++ goto err_acpi_init; ++ } ++ ++ skip_acpi_sta_check = force || (!priv->conf->acpi_check_dev); ++ if (!skip_acpi_sta_check) { ++ err = eval_int(priv->adev->handle, "_STA", &cfg); ++ if (err) { ++ dev_info(dev, "Could not evaluate ACPI _STA\n"); ++ goto err_acpi_init; ++ } ++ ++ err = eval_int(priv->adev->handle, "VPC0._CFG", &cfg); ++ if (err) { ++ dev_info(dev, "Could not evaluate ACPI _CFG\n"); ++ goto err_acpi_init; ++ } ++ dev_info(dev, "ACPI CFG: %lu\n", cfg); ++ } else { ++ dev_info(dev, "Skipping ACPI _STA check"); ++ } ++ ++ return 0; ++ ++err_acpi_init: ++ return err; ++} ++ ++/* ============================= */ ++/* White Keyboard Backlight */ ++/* ============================ */ ++// In style of 
ideapad-driver and with code modified from ideapad-driver. ++ ++static enum led_brightness ++legion_kbd_bl_led_cdev_brightness_get(struct led_classdev *led_cdev) ++{ ++ struct legion_private *priv = ++ container_of(led_cdev, struct legion_private, kbd_bl.led); ++ ++ return legion_kbd_bl_brightness_get(priv); ++} ++ ++static int legion_kbd_bl_led_cdev_brightness_set(struct led_classdev *led_cdev, ++ enum led_brightness brightness) ++{ ++ struct legion_private *priv = ++ container_of(led_cdev, struct legion_private, kbd_bl.led); ++ ++ return legion_kbd_bl_brightness_set(priv, brightness); ++} ++ ++static int legion_kbd_bl_init(struct legion_private *priv) ++{ ++ int brightness, err; ++ ++ if (WARN_ON(priv->kbd_bl.initialized)) { ++ pr_info("Keyboard backlight already initialized\n"); ++ return -EEXIST; ++ } ++ ++ if (priv->conf->access_method_keyboard == ACCESS_METHOD_NO_ACCESS) { ++ pr_info("Keyboard backlight handling disabled by this driver\n"); ++ return -ENODEV; ++ } ++ ++ brightness = legion_kbd_bl_brightness_get(priv); ++ if (brightness < 0) { ++ pr_info("Error reading keyboard brighntess\n"); ++ return brightness; ++ } ++ ++ priv->kbd_bl.last_brightness = brightness; ++ ++ // will be renamed to "platform::kbd_backlight_1" if it exists already ++ priv->kbd_bl.led.name = "platform::" LED_FUNCTION_KBD_BACKLIGHT; ++ priv->kbd_bl.led.max_brightness = 2; ++ priv->kbd_bl.led.brightness_get = legion_kbd_bl_led_cdev_brightness_get; ++ priv->kbd_bl.led.brightness_set_blocking = ++ legion_kbd_bl_led_cdev_brightness_set; ++ priv->kbd_bl.led.flags = LED_BRIGHT_HW_CHANGED; ++ ++ err = led_classdev_register(&priv->platform_device->dev, ++ &priv->kbd_bl.led); ++ if (err) ++ return err; ++ ++ priv->kbd_bl.initialized = true; ++ ++ return 0; ++} ++ ++/** ++ * Deinit keyboard backlight. ++ * ++ * Can also be called if init was not successful. 
++ * ++ */ ++static void legion_kbd_bl_exit(struct legion_private *priv) ++{ ++ if (!priv->kbd_bl.initialized) ++ return; ++ ++ priv->kbd_bl.initialized = false; ++ ++ led_classdev_unregister(&priv->kbd_bl.led); ++} ++ ++/* ============================= */ ++/* Additional light driver */ ++/* ============================ */ ++ ++static enum led_brightness ++legion_wmi_cdev_brightness_get(struct led_classdev *led_cdev) ++{ ++ struct legion_private *priv = ++ container_of(led_cdev, struct legion_private, kbd_bl.led); ++ struct light *light_ins = container_of(led_cdev, struct light, led); ++ ++ return legion_wmi_light_get(priv, light_ins->light_id, ++ light_ins->lower_limit, ++ light_ins->upper_limit); ++} ++ ++static int legion_wmi_cdev_brightness_set(struct led_classdev *led_cdev, ++ enum led_brightness brightness) ++{ ++ struct legion_private *priv = ++ container_of(led_cdev, struct legion_private, kbd_bl.led); ++ struct light *light_ins = container_of(led_cdev, struct light, led); ++ ++ return legion_wmi_light_set(priv, light_ins->light_id, ++ light_ins->lower_limit, ++ light_ins->upper_limit, brightness); ++} ++ ++static int legion_light_init(struct legion_private *priv, ++ struct light *light_ins, u8 light_id, ++ u8 lower_limit, u8 upper_limit, const char *name) ++{ ++ int brightness, err; ++ ++ if (WARN_ON(light_ins->initialized)) { ++ pr_info("Light already initialized for light: %u\n", ++ light_ins->light_id); ++ return -EEXIST; ++ } ++ ++ light_ins->light_id = light_id; ++ light_ins->lower_limit = lower_limit; ++ light_ins->upper_limit = upper_limit; ++ ++ brightness = legion_wmi_light_get(priv, light_ins->light_id, ++ light_ins->lower_limit, ++ light_ins->upper_limit); ++ if (brightness < 0) { ++ pr_info("Error reading brighntess for light: %u\n", ++ light_ins->light_id); ++ return brightness; ++ } ++ ++ light_ins->led.name = name; ++ light_ins->led.max_brightness = ++ light_ins->upper_limit - light_ins->lower_limit; ++ light_ins->led.brightness_get = legion_wmi_cdev_brightness_get; ++ light_ins->led.brightness_set_blocking = legion_wmi_cdev_brightness_set; ++ light_ins->led.flags = LED_BRIGHT_HW_CHANGED; ++ ++ err = led_classdev_register(&priv->platform_device->dev, ++ &light_ins->led); ++ if (err) ++ return err; ++ ++ light_ins->initialized = true; ++ ++ return 0; ++} ++ ++/** ++ * Deinit light. ++ * ++ * Can also be called if init was not successful. 
++ * ++ */ ++static void legion_light_exit(struct legion_private *priv, ++ struct light *light_ins) ++{ ++ if (!light_ins->initialized) ++ return; ++ ++ light_ins->initialized = false; ++ ++ led_classdev_unregister(&light_ins->led); ++} ++ +/* ============================= */ +/* Platform driver */ +/* ============================ */ + -+int legion_add(struct platform_device *pdev) ++static int legion_add(struct platform_device *pdev) +{ + struct legion_private *priv; + const struct dmi_system_id *dmi_sys; + int err; + u16 ec_read_id; ++ bool skip_ec_id_check; ++ bool is_ec_id_valid; + bool is_denied = true; + bool is_allowed = false; + bool do_load_by_list = false; @@ -11227,7 +14259,9 @@ index 000000000000..d1268d239cc5 + //struct legion_private *priv = dev_get_drvdata(&pdev->dev); + dev_info(&pdev->dev, "legion_laptop platform driver probing\n"); + -+ dev_info(&pdev->dev, "Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n", ++ dev_info( ++ &pdev->dev, ++ "Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n", + dmi_get_system_info(DMI_SYS_VENDOR), + dmi_get_system_info(DMI_PRODUCT_NAME), + dmi_get_system_info(DMI_BIOS_VERSION)); @@ -11284,24 +14318,46 @@ index 000000000000..d1268d239cc5 + + priv->conf = dmi_sys->driver_data; + ++ err = acpi_init(priv, ACPI_COMPANION(&pdev->dev)); ++ if (err) { ++ dev_info(&pdev->dev, "Could not init ACPI access: %d\n", err); ++ goto err_acpi_init; ++ } ++ ++ // TODO: remove; only used for reverse engineering ++ pr_info("Creating RAM access to embedded controller\n"); ++ err = ecram_memoryio_init(&priv->ec_memoryio, ++ priv->conf->ramio_physical_start, 0, ++ priv->conf->ramio_size); ++ if (err) { ++ dev_info( ++ &pdev->dev, ++ "Could not init RAM access to embedded controller: %d\n", ++ err); ++ goto err_ecram_memoryio_init; ++ } ++ + err = ecram_init(&priv->ecram, priv->conf->memoryio_physical_ec_start, + priv->conf->memoryio_size); + if (err) { + dev_info(&pdev->dev, -+ "Could not init access to embedded controller\n"); ++ "Could not init access to embedded controller: %d\n", ++ err); + goto err_ecram_init; + } + + ec_read_id = read_ec_id(&priv->ecram, priv->conf); + dev_info(&pdev->dev, "Read embedded controller ID 0x%x\n", ec_read_id); -+ if (priv->conf->check_embedded_controller_id && -+ !(ec_read_id == priv->conf->embedded_controller_id)) { ++ skip_ec_id_check = force || (!priv->conf->check_embedded_controller_id); ++ is_ec_id_valid = skip_ec_id_check || ++ (ec_read_id == priv->conf->embedded_controller_id); ++ if (!is_ec_id_valid) { + err = -ENOMEM; + dev_info(&pdev->dev, "Expected EC chip id 0x%x but read 0x%x\n", + priv->conf->embedded_controller_id, ec_read_id); + goto err_ecram_id; + } -+ if (!priv->conf->check_embedded_controller_id) { ++ if (skip_ec_id_check) { + dev_info(&pdev->dev, + "Skipped checking embedded controller id\n"); + } @@ -11312,33 +14368,65 @@ index 000000000000..d1268d239cc5 + pr_info("Creating sysfs inteface\n"); + err = legion_sysfs_init(priv); + if (err) { -+ dev_info(&pdev->dev, "Creating sysfs interface failed\n"); ++ dev_info(&pdev->dev, "Creating sysfs interface failed: %d\n", ++ err); + goto err_sysfs_init; + } + + pr_info("Creating hwmon interface"); + err = legion_hwmon_init(priv); -+ if (err) ++ if (err) { ++ dev_info(&pdev->dev, "Creating hwmon interface failed: %d\n", ++ err); + goto err_hwmon_init; ++ } + + pr_info("Creating platform profile support\n"); + err = legion_platform_profile_init(priv); + if (err) { -+ 
dev_info(&pdev->dev, "Creating platform profile failed\n"); ++ dev_info(&pdev->dev, "Creating platform profile failed: %d\n", ++ err); + goto err_platform_profile; + } + + pr_info("Init WMI driver support\n"); + err = legion_wmi_init(); + if (err) { -+ dev_info(&pdev->dev, "Init WMI driver failed\n"); ++ dev_info(&pdev->dev, "Init WMI driver failed: %d\n", err); + goto err_wmi; + } + ++ pr_info("Init keyboard backlight LED driver\n"); ++ err = legion_kbd_bl_init(priv); ++ if (err) { ++ dev_info( ++ &pdev->dev, ++ "Init keyboard backlight LED driver failed. Skipping ...\n"); ++ } ++ ++ pr_info("Init Y-Logo LED driver\n"); ++ err = legion_light_init(priv, &priv->ylogo_light, LIGHT_ID_YLOGO, 0, 1, ++ "platform::ylogo"); ++ if (err) { ++ dev_info(&pdev->dev, ++ "Init Y-Logo LED driver failed. Skipping ...\n"); ++ } ++ ++ pr_info("Init IO-Port LED driver\n"); ++ err = legion_light_init(priv, &priv->iport_light, LIGHT_ID_IOPORT, 1, 2, ++ "platform::ioport"); ++ if (err) { ++ dev_info(&pdev->dev, ++ "Init IO-Port LED driver failed. Skipping ...\n"); ++ } ++ + dev_info(&pdev->dev, "legion_laptop loaded for this device\n"); + return 0; + + // TODO: remove eventually ++ legion_light_exit(priv, &priv->iport_light); ++ legion_light_exit(priv, &priv->ylogo_light); ++ legion_kbd_bl_exit(priv); + legion_wmi_exit(); +err_wmi: + legion_platform_profile_exit(priv); @@ -11351,6 +14439,9 @@ index 000000000000..d1268d239cc5 +err_ecram_id: + ecram_exit(&priv->ecram); +err_ecram_init: ++ ecram_memoryio_exit(&priv->ec_memoryio); ++err_ecram_memoryio_init: ++err_acpi_init: + legion_shared_exit(priv); +err_legion_shared_init: +err_model_mismtach: @@ -11358,7 +14449,7 @@ index 000000000000..d1268d239cc5 + return err; +} + -+int legion_remove(struct platform_device *pdev) ++static int legion_remove(struct platform_device *pdev) +{ + struct legion_private *priv = dev_get_drvdata(&pdev->dev); + @@ -11366,6 +14457,9 @@ index 000000000000..d1268d239cc5 + priv->loaded = false; + mutex_unlock(&legion_shared_mutex); + ++ legion_light_exit(priv, &priv->iport_light); ++ legion_light_exit(priv, &priv->ylogo_light); ++ legion_kbd_bl_exit(priv); + // first unregister wmi, so toggling powermode does not + // generate events anymore that even might be delayed + legion_wmi_exit(); @@ -11373,19 +14467,20 @@ index 000000000000..d1268d239cc5 + + // toggle power mode to load default setting from embedded controller + // again -+ toggle_powermode(&priv->ecram, priv->conf); ++ toggle_powermode(priv); + + legion_hwmon_exit(priv); + legion_sysfs_exit(priv); + legion_debugfs_exit(priv); + ecram_exit(&priv->ecram); ++ ecram_memoryio_exit(&priv->ec_memoryio); + legion_shared_exit(priv); + + pr_info("Legion platform unloaded\n"); + return 0; +} + -+int legion_resume(struct platform_device *pdev) ++static int legion_resume(struct platform_device *pdev) +{ + //struct legion_private *priv = dev_get_drvdata(&pdev->dev); + dev_info(&pdev->dev, "Resumed in legion-laptop\n"); @@ -11406,7 +14501,8 @@ index 000000000000..d1268d239cc5 + +// same as ideapad +static const struct acpi_device_id legion_device_ids[] = { -+ { "PNP0C09", 0 }, // todo: change to "VPC2004" ++ // todo: change to "VPC2004", and also ACPI paths ++ { "PNP0C09", 0 }, + { "", 0 }, +}; +MODULE_DEVICE_TABLE(acpi, legion_device_ids); @@ -11422,7 +14518,7 @@ index 000000000000..d1268d239cc5 + }, +}; + -+int __init legion_init(void) ++static int __init legion_init(void) +{ + int err; + @@ -11438,7 +14534,7 @@ index 000000000000..d1268d239cc5 + +module_init(legion_init); + -+void __exit 
legion_exit(void) ++static void __exit legion_exit(void) +{ + platform_driver_unregister(&legion_driver); + pr_info("legion_laptop exit\n"); @@ -11447,7 +14543,7 @@ index 000000000000..d1268d239cc5 +module_exit(legion_exit); diff --git a/drivers/platform/x86/steamdeck.c b/drivers/platform/x86/steamdeck.c new file mode 100644 -index 000000000000..77a6677ec19e +index 0000000000000..77a6677ec19e6 --- /dev/null +++ b/drivers/platform/x86/steamdeck.c @@ -0,0 +1,523 @@ @@ -11975,7 +15071,7 @@ index 000000000000..77a6677ec19e +MODULE_DESCRIPTION("Steam Deck ACPI platform driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mm.h b/include/linux/mm.h -index 34f9dba17c1a..4527f319019a 100644 +index 34f9dba17c1a7..4527f319019aa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -191,7 +191,7 @@ static inline void __mm_zero_struct_page(struct page *page) @@ -11988,7 +15084,7 @@ index 34f9dba17c1a..4527f319019a 100644 extern int sysctl_max_map_count; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index 716953ee1ebd..dace360dc38d 100644 +index 716953ee1ebdb..dace360dc38d7 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1181,7 +1181,7 @@ struct readahead_control { @@ -12001,7 +15097,7 @@ index 716953ee1ebd..dace360dc38d 100644 void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h -index 45f09bec02c4..87b20e2ee274 100644 +index 45f09bec02c48..87b20e2ee2744 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -148,6 +148,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, @@ -12023,7 +15119,7 @@ index 45f09bec02c4..87b20e2ee274 100644 { return &init_user_ns; diff --git a/init/Kconfig b/init/Kconfig -index f7f65af4ee12..71755cc8ed3e 100644 +index 5e7d4885d1bf8..25193a9d5c617 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -123,6 +123,10 @@ config THREAD_INFO_IN_TASK @@ -12037,7 +15133,7 @@ index f7f65af4ee12..71755cc8ed3e 100644 config BROKEN bool -@@ -1225,6 +1229,22 @@ config USER_NS +@@ -1226,6 +1230,22 @@ config USER_NS If unsure, say N. @@ -12060,7 +15156,7 @@ index f7f65af4ee12..71755cc8ed3e 100644 config PID_NS bool "PID Namespaces" default y -@@ -1367,6 +1387,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE +@@ -1368,6 +1388,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE with the "-O2" compiler flag for best performance and most helpful compile-time warnings. 
@@ -12074,7 +15170,7 @@ index f7f65af4ee12..71755cc8ed3e 100644 bool "Optimize for size (-Os)" help diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 38ef6d06888e..0f78364efd4f 100644 +index 38ef6d06888ef..0f78364efd4f2 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -40,6 +40,27 @@ choice @@ -12116,7 +15212,7 @@ index 38ef6d06888e..0f78364efd4f 100644 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c -index d2e12b6d2b18..95ca80492a37 100644 +index f81149739eb9f..36fb0b711541d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,10 @@ @@ -12130,7 +15226,7 @@ index d2e12b6d2b18..95ca80492a37 100644 #include #include #include -@@ -2263,6 +2267,10 @@ __latent_entropy struct task_struct *copy_process( +@@ -2271,6 +2275,10 @@ __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -12141,7 +15237,7 @@ index d2e12b6d2b18..95ca80492a37 100644 /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. -@@ -3416,6 +3424,12 @@ int ksys_unshare(unsigned long unshare_flags) +@@ -3424,6 +3432,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -12155,7 +15251,7 @@ index d2e12b6d2b18..95ca80492a37 100644 if (err) goto bad_unshare_out; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index b3e25be58e2b..2c335df30171 100644 +index 1d9c2482c5a35..ff33866916263 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -69,9 +69,13 @@ @@ -12215,7 +15311,7 @@ index b3e25be58e2b..2c335df30171 100644 #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 354a2d294f52..4dc780aa3bcc 100644 +index 354a2d294f526..4dc780aa3bcc8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -95,6 +95,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); @@ -12245,7 +15341,7 @@ index 354a2d294f52..4dc780aa3bcc 100644 { .procname = "tainted", diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 1d8e47bed3f1..fec01d016a35 100644 +index 1d8e47bed3f11..fec01d016a351 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,6 +22,13 @@ @@ -12262,8 +15358,46 @@ index 1d8e47bed3f1..fec01d016a35 100644 static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); +diff --git a/lib/scatterlist.c b/lib/scatterlist.c +index c65566b4dc662..d3c8aaa68c5d3 100644 +--- a/lib/scatterlist.c ++++ b/lib/scatterlist.c +@@ -150,31 +150,12 @@ EXPORT_SYMBOL(sg_init_one); + */ + static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask) + { +- if (nents == SG_MAX_SINGLE_ALLOC) { +- /* +- * Kmemleak doesn't track page allocations as they are not +- * commonly used (in a raw form) for kernel data structures. +- * As we chain together a list of pages and then a normal +- * kmalloc (tracked by kmemleak), in order to for that last +- * allocation not to become decoupled (and thus a +- * false-positive) we need to inform kmemleak of all the +- * intermediate allocations. 
+- */ +- void *ptr = (void *) __get_free_page(gfp_mask); +- kmemleak_alloc(ptr, PAGE_SIZE, 1, gfp_mask); +- return ptr; +- } else +- return kmalloc_array(nents, sizeof(struct scatterlist), +- gfp_mask); ++ return kmalloc_array(nents, sizeof(struct scatterlist), gfp_mask); + } + + static void sg_kfree(struct scatterlist *sg, unsigned int nents) + { +- if (nents == SG_MAX_SINGLE_ALLOC) { +- kmemleak_free(sg); +- free_page((unsigned long) sg); +- } else +- kfree(sg); ++ kfree(sg); + } + + /** diff --git a/mm/Kconfig b/mm/Kconfig -index 09130434e30d..f772ba88df87 100644 +index 09130434e30d3..f772ba88df878 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -631,7 +631,7 @@ config COMPACTION @@ -12275,8 +15409,42 @@ index 09130434e30d..f772ba88df87 100644 default 1 # +diff --git a/mm/internal.h b/mm/internal.h +index 8ed127c1c808c..2f3040ec707d7 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -421,6 +421,7 @@ extern void prep_compound_page(struct page *page, unsigned int order); + extern void post_alloc_hook(struct page *page, unsigned int order, + gfp_t gfp_flags); + extern int user_min_free_kbytes; ++extern atomic_long_t kswapd_waiters; + + extern void free_unref_page(struct page *page, unsigned int order); + extern void free_unref_page_list(struct list_head *list); +diff --git a/mm/list_lru.c b/mm/list_lru.c +index a05e5bef3b400..0ead8e6651df0 100644 +--- a/mm/list_lru.c ++++ b/mm/list_lru.c +@@ -178,6 +178,7 @@ EXPORT_SYMBOL_GPL(list_lru_isolate_move); + unsigned long list_lru_count_one(struct list_lru *lru, + int nid, struct mem_cgroup *memcg) + { ++#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) + struct list_lru_one *l; + long count; + +@@ -190,6 +191,9 @@ unsigned long list_lru_count_one(struct list_lru *lru, + count = 0; + + return count; ++#else ++ return READ_ONCE(lru->node[nid].lru.nr_items); ++#endif + } + EXPORT_SYMBOL_GPL(list_lru_count_one); + diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index d3f42009bb70..39b9fd060630 100644 +index d3f42009bb702..39b9fd0606304 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -71,7 +71,11 @@ static long ratelimit_pages = 32; @@ -12303,8 +15471,118 @@ index d3f42009bb70..39b9fd060630 100644 EXPORT_SYMBOL_GPL(dirty_writeback_interval); +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 7d3460c7a480b..bd2a12f4e04de 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -204,6 +204,8 @@ EXPORT_SYMBOL(node_states); + + gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; + ++atomic_long_t kswapd_waiters = ATOMIC_LONG_INIT(0); ++ + /* + * A cached value of the page's pageblock's migratetype, used when the page is + * put on a pcplist. Used to avoid the pageblock migratetype lookup when +@@ -297,7 +299,7 @@ static compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = { + + int min_free_kbytes = 1024; + int user_min_free_kbytes = -1; +-static int watermark_boost_factor __read_mostly = 15000; ++static int watermark_boost_factor __read_mostly; + static int watermark_scale_factor = 10; + + /* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */ +@@ -2152,16 +2154,17 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, + } + + /* +- * Obtain a specified number of elements from the buddy allocator, all under +- * a single hold of the lock, for efficiency. Add them to the supplied list. +- * Returns the number of new pages which were placed at *list. ++ * Obtain a specified number of elements from the buddy allocator, and relax the ++ * zone lock when needed. 
Add them to the supplied list. Returns the number of ++ * new pages which were placed at *list. + */ + static int rmqueue_bulk(struct zone *zone, unsigned int order, + unsigned long count, struct list_head *list, + int migratetype, unsigned int alloc_flags) + { ++ const bool can_resched = !preempt_count() && !irqs_disabled(); + unsigned long flags; +- int i; ++ int i, last_mod = 0; + + spin_lock_irqsave(&zone->lock, flags); + for (i = 0; i < count; ++i) { +@@ -2170,6 +2173,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + if (unlikely(page == NULL)) + break; + ++ /* Reschedule and ease the contention on the lock if needed */ ++ if (i + 1 < count && ((can_resched && need_resched()) || ++ spin_needbreak(&zone->lock))) { ++ __mod_zone_page_state(zone, NR_FREE_PAGES, ++ -((i + 1 - last_mod) << order)); ++ last_mod = i + 1; ++ spin_unlock_irqrestore(&zone->lock, flags); ++ if (can_resched) ++ cond_resched(); ++ spin_lock_irqsave(&zone->lock, flags); ++ } ++ + /* + * Split buddy pages returned by expand() are received here in + * physical page order. The page is added to the tail of +@@ -2186,7 +2201,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + -(1 << order)); + } + +- __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); ++ __mod_zone_page_state(zone, NR_FREE_PAGES, -((i - last_mod) << order)); + spin_unlock_irqrestore(&zone->lock, flags); + + return i; +@@ -3962,6 +3977,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + unsigned int cpuset_mems_cookie; + unsigned int zonelist_iter_cookie; + int reserve_flags; ++ bool woke_kswapd = false; + + restart: + compaction_retries = 0; +@@ -4001,8 +4017,13 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + goto nopage; + } + +- if (alloc_flags & ALLOC_KSWAPD) ++ if (alloc_flags & ALLOC_KSWAPD) { ++ if (!woke_kswapd) { ++ atomic_long_inc(&kswapd_waiters); ++ woke_kswapd = true; ++ } + wake_all_kswapds(order, gfp_mask, ac); ++ } + + /* + * The adjusted alloc_flags might result in immediate success, so try +@@ -4217,9 +4238,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, + goto retry; + } + fail: +- warn_alloc(gfp_mask, ac->nodemask, +- "page allocation failure: order:%u", order); + got_pg: ++ if (woke_kswapd) ++ atomic_long_dec(&kswapd_waiters); ++ if (!page) ++ warn_alloc(gfp_mask, ac->nodemask, ++ "page allocation failure: order:%u", order); + return page; + } + diff --git a/mm/swap.c b/mm/swap.c -index cd8f0150ba3a..42c405a4f114 100644 +index cd8f0150ba3aa..42c405a4f114c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -1090,6 +1090,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) @@ -12325,7 +15603,7 @@ index cd8f0150ba3a..42c405a4f114 100644 +#endif } diff --git a/mm/vmpressure.c b/mm/vmpressure.c -index b52644771cc4..11a4b0e3b583 100644 +index 22c6689d93027..bf65bd9abdf34 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; @@ -12341,7 +15619,7 @@ index b52644771cc4..11a4b0e3b583 100644 /* diff --git a/mm/vmscan.c b/mm/vmscan.c -index 2fe4a11d63f4..445ce9324b01 100644 +index da152407bc2b1..a958dc4e00245 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -186,7 +186,11 @@ struct scan_control { @@ -12356,7 +15634,7 @@ index 2fe4a11d63f4..445ce9324b01 100644 LIST_HEAD(shrinker_list); DECLARE_RWSEM(shrinker_rwsem); -@@ -4594,7 +4598,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc +@@ -4595,7 +4599,11 @@ static bool 
lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ @@ -12368,12 +15646,76 @@ index 2fe4a11d63f4..445ce9324b01 100644 static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { +@@ -6908,7 +6916,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + return 0; + } + +-static bool allow_direct_reclaim(pg_data_t *pgdat) ++static bool allow_direct_reclaim(pg_data_t *pgdat, bool using_kswapd) + { + struct zone *zone; + unsigned long pfmemalloc_reserve = 0; +@@ -6937,6 +6945,10 @@ static bool allow_direct_reclaim(pg_data_t *pgdat) + + wmark_ok = free_pages > pfmemalloc_reserve / 2; + ++ /* The throttled direct reclaimer is now a kswapd waiter */ ++ if (unlikely(!using_kswapd && !wmark_ok)) ++ atomic_long_inc(&kswapd_waiters); ++ + /* kswapd must be awake if processes are being throttled */ + if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { + if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL) +@@ -7002,7 +7014,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; +- if (allow_direct_reclaim(pgdat)) ++ if (allow_direct_reclaim(pgdat, gfp_mask & __GFP_KSWAPD_RECLAIM)) + goto out; + break; + } +@@ -7024,11 +7036,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, + */ + if (!(gfp_mask & __GFP_FS)) + wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, +- allow_direct_reclaim(pgdat), HZ); ++ allow_direct_reclaim(pgdat, true), HZ); + else + /* Throttle until kswapd wakes the process */ + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, +- allow_direct_reclaim(pgdat)); ++ allow_direct_reclaim(pgdat, true)); ++ ++ if (unlikely(!(gfp_mask & __GFP_KSWAPD_RECLAIM))) ++ atomic_long_dec(&kswapd_waiters); + + if (fatal_signal_pending(current)) + return true; +@@ -7526,14 +7541,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) + * able to safely make forward progress. 
Wake them + */ + if (waitqueue_active(&pgdat->pfmemalloc_wait) && +- allow_direct_reclaim(pgdat)) ++ allow_direct_reclaim(pgdat, true)) + wake_up_all(&pgdat->pfmemalloc_wait); + + /* Check if kswapd should be suspending */ + __fs_reclaim_release(_THIS_IP_); + ret = try_to_freeze(); + __fs_reclaim_acquire(_THIS_IP_); +- if (ret || kthread_should_stop()) ++ if (ret || kthread_should_stop() || ++ !atomic_long_read(&kswapd_waiters)) + break; + + /* -- 2.42.0 -From b05442522d6f62443d6bbd57d68868d96910ee2e Mon Sep 17 00:00:00 2001 +From 2ca2b1765d7ca43358ccdd33d4f1c5c29a7b507f Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 28 Aug 2023 14:02:22 +0200 +Date: Sat, 23 Sep 2023 13:08:08 +0200 Subject: [PATCH 5/7] fixes Signed-off-by: Peter Jung @@ -12382,25 +15724,29 @@ Signed-off-by: Peter Jung .../testing/sysfs-class-led-trigger-blkdev | 78 ++ Documentation/leds/index.rst | 1 + Documentation/leds/ledtrig-blkdev.rst | 158 +++ - block/mq-deadline.c | 3 +- drivers/bluetooth/btusb.c | 2 +- - drivers/char/tpm/tpm_crb.c | 33 +- + drivers/gpu/drm/i915/gem/i915_gem_shmem.c | 11 +- drivers/leds/trigger/Kconfig | 9 + drivers/leds/trigger/Makefile | 1 + drivers/leds/trigger/ledtrig-blkdev.c | 1218 +++++++++++++++++ - drivers/pinctrl/pinctrl-amd.c | 4 +- + .../net/wireless/mediatek/mt76/mt7921/init.c | 9 +- + fs/btrfs/extent-tree.c | 61 +- + fs/btrfs/extent-tree.h | 13 +- + fs/btrfs/inode.c | 29 +- include/linux/pageblock-flags.h | 2 +- kernel/padata.c | 4 +- - mm/readahead.c | 10 +- + mm/slab_common.c | 12 +- scripts/Makefile.vmlinux_o | 2 +- sound/pci/hda/cs35l41_hda.c | 2 +- - 16 files changed, 1502 insertions(+), 35 deletions(-) + sound/pci/hda/patch_realtek.c | 1 + + .../intel/common/soc-acpi-intel-adl-match.c | 12 +- + 20 files changed, 1548 insertions(+), 87 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev create mode 100644 Documentation/leds/ledtrig-blkdev.rst create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block -index c57e5b7cb532..2d1df6c9b463 100644 +index c57e5b7cb5326..2d1df6c9b4635 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -101,6 +101,16 @@ Description: @@ -12422,7 +15768,7 @@ index c57e5b7cb532..2d1df6c9b463 100644 Contact: Martin K. Petersen diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev new file mode 100644 -index 000000000000..28ce8c814fb7 +index 0000000000000..28ce8c814fb76 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev @@ -0,0 +1,78 @@ @@ -12505,7 +15851,7 @@ index 000000000000..28ce8c814fb7 + may not match the device special file paths written to + link_device and unlink_device.) 
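The new ABI file above describes the blkdev trigger entirely in terms of sysfs attribute reads and writes, so driving it needs nothing beyond ordinary file I/O. A small illustrative sketch, not part of the patch: the LED name and /dev/sda are placeholders, and it assumes the usual LED-class convention of selecting the trigger by writing its name to the LED's trigger attribute.

#include <stdio.h>

/* Illustrative only: associate an LED with a block device through the
 * blkdev trigger attributes documented above. The LED name and the
 * /dev/sda path are placeholders; error handling is reduced to a
 * pass/fail return value. */
static int write_attr(const char *led, const char *attr, const char *val)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path), "/sys/class/leds/%s/%s", led, attr);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	const char *led = "example::writelog";		/* placeholder LED name */

	if (write_attr(led, "trigger", "blkdev"))	/* select the trigger */
		return 1;
	if (write_attr(led, "link_device", "/dev/sda"))	/* associate a device */
		return 1;
	write_attr(led, "blink_time", "100");		/* blink length, per the ABI doc */
	write_attr(led, "blink_on_read", "0");		/* don't blink on reads; writes keep their default */
	return 0;
}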
diff --git a/Documentation/leds/index.rst b/Documentation/leds/index.rst -index 3ade16c18328..3fd55a2cbfb5 100644 +index 3ade16c18328a..3fd55a2cbfb5f 100644 --- a/Documentation/leds/index.rst +++ b/Documentation/leds/index.rst @@ -10,6 +10,7 @@ LEDs @@ -12518,7 +15864,7 @@ index 3ade16c18328..3fd55a2cbfb5 100644 ledtrig-usbport diff --git a/Documentation/leds/ledtrig-blkdev.rst b/Documentation/leds/ledtrig-blkdev.rst new file mode 100644 -index 000000000000..9ff5b99de451 +index 0000000000000..9ff5b99de4514 --- /dev/null +++ b/Documentation/leds/ledtrig-blkdev.rst @@ -0,0 +1,158 @@ @@ -12680,26 +16026,11 @@ index 000000000000..9ff5b99de451 +* The ``blkdev`` LED trigger supports many-to-many device/LED associations. + A device can be associated with multiple LEDs, and an LED can be associated + with multiple devices. -diff --git a/block/mq-deadline.c b/block/mq-deadline.c -index 02a916ba62ee..f958e79277b8 100644 ---- a/block/mq-deadline.c -+++ b/block/mq-deadline.c -@@ -646,8 +646,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) - struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; - struct blk_mq_tags *tags = hctx->sched_tags; -+ unsigned int shift = tags->bitmap_tags.sb.shift; - -- dd->async_depth = max(1UL, 3 * q->nr_requests / 4); -+ dd->async_depth = max(1U, 3 * (1U << shift) / 4); - - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); - } diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c -index 764d176e9735..deb10b89fa51 100644 +index dfdfb72d350fe..daabd554ef37f 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c -@@ -945,7 +945,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) +@@ -960,7 +960,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) } gpiod_set_value_cansleep(reset_gpio, 0); @@ -12708,59 +16039,47 @@ index 764d176e9735..deb10b89fa51 100644 gpiod_set_value_cansleep(reset_gpio, 1); return; -diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c -index 9eb1a1859012..a5dbebb1acfc 100644 ---- a/drivers/char/tpm/tpm_crb.c -+++ b/drivers/char/tpm/tpm_crb.c -@@ -463,28 +463,6 @@ static bool crb_req_canceled(struct tpm_chip *chip, u8 status) - return (cancel & CRB_CANCEL_INVOKE) == CRB_CANCEL_INVOKE; - } +diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +index 8f1633c3fb935..73a4a4eb29e08 100644 +--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c ++++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c +@@ -100,6 +100,7 @@ int shmem_sg_alloc_table(struct drm_i915_private *i915, struct sg_table *st, + st->nents = 0; + for (i = 0; i < page_count; i++) { + struct folio *folio; ++ unsigned long nr_pages; + const unsigned int shrink[] = { + I915_SHRINK_BOUND | I915_SHRINK_UNBOUND, + 0, +@@ -150,6 +151,8 @@ int shmem_sg_alloc_table(struct drm_i915_private *i915, struct sg_table *st, + } + } while (1); --static int crb_check_flags(struct tpm_chip *chip) --{ -- u32 val; -- int ret; -- -- ret = crb_request_locality(chip, 0); -- if (ret) -- return ret; -- -- ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val, NULL); -- if (ret) -- goto release; -- -- if (val == 0x414D4400U /* AMD */) -- chip->flags |= TPM_CHIP_FLAG_HWRNG_DISABLED; -- --release: -- crb_relinquish_locality(chip, 0); -- -- return ret; --} -- - static const struct tpm_class_ops tpm_crb = { - .flags = TPM_OPS_AUTO_STARTUP, - .status = crb_status, -@@ -826,9 +804,14 @@ static int crb_acpi_add(struct acpi_device *device) - if (rc) - goto out; ++ 
nr_pages = min_t(unsigned long, ++ folio_nr_pages(folio), page_count - i); + if (!i || + sg->length >= max_segment || + folio_pfn(folio) != next_pfn) { +@@ -157,13 +160,13 @@ int shmem_sg_alloc_table(struct drm_i915_private *i915, struct sg_table *st, + sg = sg_next(sg); -- rc = crb_check_flags(chip); -- if (rc) -- goto out; -+#ifdef CONFIG_X86 -+ /* A quirk for https://www.amd.com/en/support/kb/faq/pa-410 */ -+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && -+ priv->sm != ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) { -+ dev_info(dev, "Disabling hwrng\n"); -+ chip->flags |= TPM_CHIP_FLAG_HWRNG_DISABLED; -+ } -+#endif /* CONFIG_X86 */ - - rc = tpm_chip_register(chip); + st->nents++; +- sg_set_folio(sg, folio, folio_size(folio), 0); ++ sg_set_folio(sg, folio, nr_pages * PAGE_SIZE, 0); + } else { + /* XXX: could overflow? */ +- sg->length += folio_size(folio); ++ sg->length += nr_pages * PAGE_SIZE; + } +- next_pfn = folio_pfn(folio) + folio_nr_pages(folio); +- i += folio_nr_pages(folio) - 1; ++ next_pfn = folio_pfn(folio) + nr_pages; ++ i += nr_pages - 1; + /* Check that the i965g/gm workaround works. */ + GEM_BUG_ON(gfp & __GFP_DMA32 && next_pfn >= 0x00100000UL); diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig -index 2a57328eca20..05e80cfd0ed8 100644 +index 2a57328eca207..05e80cfd0ed8d 100644 --- a/drivers/leds/trigger/Kconfig +++ b/drivers/leds/trigger/Kconfig @@ -155,4 +155,13 @@ config LEDS_TRIGGER_TTY @@ -12778,7 +16097,7 @@ index 2a57328eca20..05e80cfd0ed8 100644 + endif # LEDS_TRIGGERS diff --git a/drivers/leds/trigger/Makefile b/drivers/leds/trigger/Makefile -index 25c4db97cdd4..d53bab5d93f1 100644 +index 25c4db97cdd4c..d53bab5d93f1c 100644 --- a/drivers/leds/trigger/Makefile +++ b/drivers/leds/trigger/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_LEDS_TRIGGER_NETDEV) += ledtrig-netdev.o @@ -12788,7 +16107,7 @@ index 25c4db97cdd4..d53bab5d93f1 100644 +obj-$(CONFIG_LEDS_TRIGGER_BLKDEV) += ledtrig-blkdev.o diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c new file mode 100644 -index 000000000000..9e0c4b66ea27 +index 0000000000000..9e0c4b66ea27d --- /dev/null +++ b/drivers/leds/trigger/ledtrig-blkdev.c @@ -0,0 +1,1218 @@ @@ -14010,30 +17329,253 @@ index 000000000000..9e0c4b66ea27 +MODULE_DESCRIPTION("Block device LED trigger"); +MODULE_AUTHOR("Ian Pilcher "); +MODULE_LICENSE("GPL v2"); -diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c -index 4dff656af3ad..74241b2ff21e 100644 ---- a/drivers/pinctrl/pinctrl-amd.c -+++ b/drivers/pinctrl/pinctrl-amd.c -@@ -748,7 +748,7 @@ static int amd_pinconf_get(struct pinctrl_dev *pctldev, - break; - - default: -- dev_err(&gpio_dev->pdev->dev, "Invalid config param %04x\n", -+ dev_dbg(&gpio_dev->pdev->dev, "Invalid config param %04x\n", - param); - return -ENOTSUPP; +diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/init.c b/drivers/net/wireless/mediatek/mt76/mt7921/init.c +index f41975e37d06a..8d8f3dea3450e 100644 +--- a/drivers/net/wireless/mediatek/mt76/mt7921/init.c ++++ b/drivers/net/wireless/mediatek/mt76/mt7921/init.c +@@ -99,7 +99,8 @@ mt7921_init_wiphy(struct ieee80211_hw *hw) + wiphy->n_iface_combinations = ARRAY_SIZE(if_comb); } -@@ -798,7 +798,7 @@ static int amd_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, - break; + wiphy->flags &= ~(WIPHY_FLAG_IBSS_RSN | WIPHY_FLAG_4ADDR_AP | +- WIPHY_FLAG_4ADDR_STATION); ++ WIPHY_FLAG_4ADDR_STATION | ++ WIPHY_FLAG_PS_ON_BY_DEFAULT); + wiphy->interface_modes = BIT(NL80211_IFTYPE_STATION) | + 
BIT(NL80211_IFTYPE_AP) | + BIT(NL80211_IFTYPE_P2P_CLIENT) | +@@ -409,12 +410,6 @@ int mt7921_register_device(struct mt7921_dev *dev) + dev->pm.idle_timeout = MT7921_PM_TIMEOUT; + dev->pm.stats.last_wake_event = jiffies; + dev->pm.stats.last_doze_event = jiffies; +- if (!mt76_is_usb(&dev->mt76)) { +- dev->pm.enable_user = true; +- dev->pm.enable = true; +- dev->pm.ds_enable_user = true; +- dev->pm.ds_enable = true; +- } - default: -- dev_err(&gpio_dev->pdev->dev, -+ dev_dbg(&gpio_dev->pdev->dev, - "Invalid config param %04x\n", param); - ret = -ENOTSUPP; + if (!mt76_is_mmio(&dev->mt76)) + hw->extra_tx_headroom += MT_SDIO_TXD_SIZE + MT_SDIO_HDR_SIZE; +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 0917c5f39e3d0..c8a598c3e11bd 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3481,7 +3481,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, + * Helper function for find_free_extent(). + * + * Return -ENOENT to inform caller that we need fallback to unclustered mode. +- * Return -EAGAIN to inform caller that we need to re-search this block group + * Return >0 to inform caller that we find nothing + * Return 0 means we have found a location and set ffe_ctl->found_offset. + */ +@@ -3562,14 +3561,6 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, + trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); + return 0; } +- } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && +- !ffe_ctl->retry_clustered) { +- spin_unlock(&last_ptr->refill_lock); +- +- ffe_ctl->retry_clustered = true; +- btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + +- ffe_ctl->empty_cluster + ffe_ctl->empty_size); +- return -EAGAIN; + } + /* + * At this point we either didn't find a cluster or we weren't able to +@@ -3584,7 +3575,6 @@ static int find_free_extent_clustered(struct btrfs_block_group *bg, + /* + * Return >0 to inform caller that we find nothing + * Return 0 when we found an free extent and set ffe_ctrl->found_offset +- * Return -EAGAIN to inform caller that we need to re-search this block group + */ + static int find_free_extent_unclustered(struct btrfs_block_group *bg, + struct find_free_extent_ctl *ffe_ctl) +@@ -3622,25 +3612,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, + offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, + ffe_ctl->num_bytes, ffe_ctl->empty_size, + &ffe_ctl->max_extent_size); +- +- /* +- * If we didn't find a chunk, and we haven't failed on this block group +- * before, and this block group is in the middle of caching and we are +- * ok with waiting, then go ahead and wait for progress to be made, and +- * set @retry_unclustered to true. +- * +- * If @retry_unclustered is true then we've already waited on this +- * block group once and should move on to the next block group. 
+- */ +- if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && +- ffe_ctl->loop > LOOP_CACHING_NOWAIT) { +- btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + +- ffe_ctl->empty_size); +- ffe_ctl->retry_unclustered = true; +- return -EAGAIN; +- } else if (!offset) { ++ if (!offset) + return 1; +- } + ffe_ctl->found_offset = offset; + return 0; + } +@@ -3654,7 +3627,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, + /* We want to try and use the cluster allocator, so lets look there */ + if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { + ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); +- if (ret >= 0 || ret == -EAGAIN) ++ if (ret >= 0) + return ret; + /* ret == -ENOENT case falls through */ + } +@@ -3873,8 +3846,7 @@ static void release_block_group(struct btrfs_block_group *block_group, + { + switch (ffe_ctl->policy) { + case BTRFS_EXTENT_ALLOC_CLUSTERED: +- ffe_ctl->retry_clustered = false; +- ffe_ctl->retry_unclustered = false; ++ ffe_ctl->retry_uncached = false; + break; + case BTRFS_EXTENT_ALLOC_ZONED: + /* Nothing to do */ +@@ -4225,9 +4197,7 @@ static noinline int find_free_extent(struct btrfs_root *root, + ffe_ctl->orig_have_caching_bg = false; + ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); + ffe_ctl->loop = 0; +- /* For clustered allocation */ +- ffe_ctl->retry_clustered = false; +- ffe_ctl->retry_unclustered = false; ++ ffe_ctl->retry_uncached = false; + ffe_ctl->cached = 0; + ffe_ctl->max_extent_size = 0; + ffe_ctl->total_free_space = 0; +@@ -4378,16 +4348,12 @@ static noinline int find_free_extent(struct btrfs_root *root, + + bg_ret = NULL; + ret = do_allocation(block_group, ffe_ctl, &bg_ret); +- if (ret == 0) { +- if (bg_ret && bg_ret != block_group) { +- btrfs_release_block_group(block_group, +- ffe_ctl->delalloc); +- block_group = bg_ret; +- } +- } else if (ret == -EAGAIN) { +- goto have_block_group; +- } else if (ret > 0) { ++ if (ret > 0) + goto loop; ++ ++ if (bg_ret && bg_ret != block_group) { ++ btrfs_release_block_group(block_group, ffe_ctl->delalloc); ++ block_group = bg_ret; + } + + /* Checks */ +@@ -4428,6 +4394,15 @@ static noinline int find_free_extent(struct btrfs_root *root, + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + break; + loop: ++ if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && ++ !ffe_ctl->retry_uncached) { ++ ffe_ctl->retry_uncached = true; ++ btrfs_wait_block_group_cache_progress(block_group, ++ ffe_ctl->num_bytes + ++ ffe_ctl->empty_cluster + ++ ffe_ctl->empty_size); ++ goto have_block_group; ++ } + release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); + cond_resched(); + } +diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h +index 429d5c5700618..6bfba2f22fdd4 100644 +--- a/fs/btrfs/extent-tree.h ++++ b/fs/btrfs/extent-tree.h +@@ -48,16 +48,11 @@ struct find_free_extent_ctl { + int loop; + + /* +- * Whether we're refilling a cluster, if true we need to re-search +- * current block group but don't try to refill the cluster again. ++ * Set to true if we're retrying the allocation on this block group ++ * after waiting for caching progress, this is so that we retry only ++ * once before moving on to another block group. + */ +- bool retry_clustered; +- +- /* +- * Whether we're updating free space cache, if true we need to re-search +- * current block group but don't try updating free space cache again. 
+- */ +- bool retry_unclustered; ++ bool retry_uncached; + + /* If current block group is cached */ + int cached; +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index db2b33a822fcd..d5c112f6091b1 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -5931,20 +5931,24 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) + + static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) + { +- if (dir->index_cnt == (u64)-1) { +- int ret; ++ int ret = 0; + ++ btrfs_inode_lock(dir, 0); ++ if (dir->index_cnt == (u64)-1) { + ret = btrfs_inode_delayed_dir_index_count(dir); + if (ret) { + ret = btrfs_set_inode_index_count(dir); + if (ret) +- return ret; ++ goto out; + } + } + +- *index = dir->index_cnt; ++ /* index_cnt is the index number of next new entry, so decrement it. */ ++ *index = dir->index_cnt - 1; ++out: ++ btrfs_inode_unlock(dir, 0); + +- return 0; ++ return ret; + } + + /* +@@ -5979,6 +5983,19 @@ static int btrfs_opendir(struct inode *inode, struct file *file) + return 0; + } + ++static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence) ++{ ++ struct btrfs_file_private *private = file->private_data; ++ int ret; ++ ++ ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)), ++ &private->last_index); ++ if (ret) ++ return ret; ++ ++ return generic_file_llseek(file, offset, whence); ++} ++ + struct dir_entry { + u64 ino; + u64 offset; +@@ -11059,7 +11076,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { + }; + + static const struct file_operations btrfs_dir_file_operations = { +- .llseek = generic_file_llseek, ++ .llseek = btrfs_dir_llseek, + .read = generic_read_dir, + .iterate_shared = btrfs_real_readdir, + .open = btrfs_opendir, diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h -index e83c4c095041..21b8dfa5d828 100644 +index e83c4c0950417..21b8dfa5d8286 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -48,7 +48,7 @@ extern unsigned int pageblock_order; @@ -14046,7 +17588,7 @@ index e83c4c095041..21b8dfa5d828 100644 #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/kernel/padata.c b/kernel/padata.c -index 222d60195de6..b8e6b7c48746 100644 +index 222d60195de66..b8e6b7c48746e 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -45,7 +45,7 @@ struct padata_mt_job_state { @@ -14067,31 +17609,44 @@ index 222d60195de6..b8e6b7c48746 100644 { struct padata_work *pw = container_of(w, struct padata_work, pw_work); struct padata_mt_job_state *ps = pw->pw_data; -diff --git a/mm/readahead.c b/mm/readahead.c -index a9c999aa19af..797494cec490 100644 ---- a/mm/readahead.c -+++ b/mm/readahead.c -@@ -613,9 +613,17 @@ static void ondemand_readahead(struct readahead_control *ractl, - max_pages); - rcu_read_unlock(); +diff --git a/mm/slab_common.c b/mm/slab_common.c +index d1555ea2981ac..5658da50a2d07 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -479,7 +479,7 @@ void slab_kmem_cache_release(struct kmem_cache *s) -- if (!start || start - index > max_pages) -+ if (!start || start - index - 1 > max_pages) - return; + void kmem_cache_destroy(struct kmem_cache *s) + { +- int refcnt; ++ int err = -EBUSY; + bool rcu_set; -+ /* -+ * If no gaps in the range, page_cache_next_miss() returns -+ * index beyond range. Adjust it back to make sure -+ * ractl->_index is updated correctly later. 
-+ */ -+ if ((start - index - 1) == max_pages) -+ start--; -+ - ra->start = start; - ra->size = start - index; /* old async_size */ - ra->size += req_size; + if (unlikely(!s) || !kasan_check_byte(s)) +@@ -490,17 +490,17 @@ void kmem_cache_destroy(struct kmem_cache *s) + + rcu_set = s->flags & SLAB_TYPESAFE_BY_RCU; + +- refcnt = --s->refcount; +- if (refcnt) ++ s->refcount--; ++ if (s->refcount) + goto out_unlock; + +- WARN(shutdown_cache(s), +- "%s %s: Slab cache still has objects when called from %pS", ++ err = shutdown_cache(s); ++ WARN(err, "%s %s: Slab cache still has objects when called from %pS", + __func__, s->name, (void *)_RET_IP_); + out_unlock: + mutex_unlock(&slab_mutex); + cpus_read_unlock(); +- if (!refcnt && !rcu_set) ++ if (!err && !rcu_set) + kmem_cache_release(s); + } + EXPORT_SYMBOL(kmem_cache_destroy); diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o -index 0edfdb40364b..ae52d3b3f063 100644 +index 0edfdb40364b8..ae52d3b3f0637 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -19,7 +19,7 @@ quiet_cmd_gen_initcalls_lds = GEN $@ @@ -14104,7 +17659,7 @@ index 0edfdb40364b..ae52d3b3f063 100644 targets := .tmp_initcalls.lds diff --git a/sound/pci/hda/cs35l41_hda.c b/sound/pci/hda/cs35l41_hda.c -index ce5faa620517..1f0f2b8df300 100644 +index ce5faa6205170..1f0f2b8df3005 100644 --- a/sound/pci/hda/cs35l41_hda.c +++ b/sound/pci/hda/cs35l41_hda.c @@ -1235,7 +1235,7 @@ static int cs35l41_no_acpi_dsd(struct cs35l41_hda *cs35l41, struct device *physd @@ -14116,12 +17671,53 @@ index ce5faa620517..1f0f2b8df300 100644 hw_cfg->bst_type = CS35L41_EXT_BOOST; hw_cfg->gpio1.func = CS35l41_VSPK_SWITCH; hw_cfg->gpio1.valid = true; +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index dc7b7a407638a..26f5da23e3bdc 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -9658,6 +9658,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x1043, 0x17f3, "ROG Ally RC71L_RC71L", ALC294_FIXUP_ASUS_ALLY), + SND_PCI_QUIRK(0x1043, 0x1881, "ASUS Zephyrus S/M", ALC294_FIXUP_ASUS_GX502_PINS), + SND_PCI_QUIRK(0x1043, 0x18b1, "Asus MJ401TA", ALC256_FIXUP_ASUS_HEADSET_MIC), ++ SND_PCI_QUIRK(0x1043, 0x18d3, "Asus Zenbook", ALC287_FIXUP_CS35L41_I2C_2), + SND_PCI_QUIRK(0x1043, 0x18f1, "Asus FX505DT", ALC256_FIXUP_ASUS_HEADSET_MIC), + SND_PCI_QUIRK(0x1043, 0x194e, "ASUS UX563FD", ALC294_FIXUP_ASUS_HPE), + SND_PCI_QUIRK(0x1043, 0x1970, "ASUS UX550VE", ALC289_FIXUP_ASUS_GA401), +diff --git a/sound/soc/intel/common/soc-acpi-intel-adl-match.c b/sound/soc/intel/common/soc-acpi-intel-adl-match.c +index bcd66e0094b4b..c4b57cca6b228 100644 +--- a/sound/soc/intel/common/soc-acpi-intel-adl-match.c ++++ b/sound/soc/intel/common/soc-acpi-intel-adl-match.c +@@ -648,18 +648,18 @@ struct snd_soc_acpi_mach snd_soc_acpi_intel_adl_sdw_machines[] = { + .drv_name = "sof_sdw", + .sof_tplg_filename = "sof-adl-rt1316-l2-mono-rt714-l3.tplg", + }, +- { +- .link_mask = 0x3, /* rt1316 on link1 & rt714 on link0 */ +- .links = adl_sdw_rt1316_link1_rt714_link0, +- .drv_name = "sof_sdw", +- .sof_tplg_filename = "sof-adl-rt1316-l1-mono-rt714-l0.tplg", +- }, + { + .link_mask = 0x7, /* rt714 on link0 & two rt1316s on link1 and link2 */ + .links = adl_sdw_rt1316_link12_rt714_link0, + .drv_name = "sof_sdw", + .sof_tplg_filename = "sof-adl-rt1316-l12-rt714-l0.tplg", + }, ++ { ++ .link_mask = 0x3, /* rt1316 on link1 & rt714 on link0 */ ++ .links = adl_sdw_rt1316_link1_rt714_link0, ++ .drv_name = "sof_sdw", ++ 
.sof_tplg_filename = "sof-adl-rt1316-l1-mono-rt714-l0.tplg", ++ }, + { + .link_mask = 0x5, /* 2 active links required */ + .links = adl_sdw_rt1316_link2_rt714_link0, -- 2.42.0 -From e4895406f7f12e8bed1293c24931803abb1915c1 Mon Sep 17 00:00:00 2001 +From 0aeaaa0f640430f7144d2d10165b89750651cd3c Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 10 Jul 2023 17:10:25 +0200 +Date: Wed, 13 Sep 2023 14:33:16 +0200 Subject: [PATCH 6/7] ksm Signed-off-by: Peter Jung @@ -14159,7 +17755,7 @@ Signed-off-by: Peter Jung 30 files changed, 390 insertions(+), 18 deletions(-) diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst -index 7626392fe82c..5c5be7bd84b8 100644 +index 7626392fe82cb..5c5be7bd84b81 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -173,6 +173,13 @@ stable_node_chains @@ -14210,7 +17806,7 @@ index 7626392fe82c..5c5be7bd84b8 100644 From the perspective of application, a high ratio of ``ksm_rmap_items`` to ``ksm_merging_pages`` means a bad madvise-applied policy, so developers or diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl -index 1f13995d00d7..4a5bc2a91fa7 100644 +index 1f13995d00d7b..4a5bc2a91fa74 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -491,3 +491,6 @@ @@ -14221,7 +17817,7 @@ index 1f13995d00d7..4a5bc2a91fa7 100644 +563 common process_ksm_disable sys_process_ksm_disable +564 common process_ksm_status sys_process_ksm_status diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl -index 8ebed8a13874..d616dcc060df 100644 +index 8ebed8a138747..d616dcc060df3 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -465,3 +465,6 @@ @@ -14232,7 +17828,7 @@ index 8ebed8a13874..d616dcc060df 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h -index 64a514f90131..63a8a9c4abc1 100644 +index 64a514f90131b..63a8a9c4abc16 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -39,7 +39,7 @@ @@ -14245,7 +17841,7 @@ index 64a514f90131..63a8a9c4abc1 100644 #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h -index d952a28463e0..c99c8260489b 100644 +index d952a28463e01..c99c8260489b8 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -909,6 +909,12 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) @@ -14262,7 +17858,7 @@ index d952a28463e0..c99c8260489b 100644 /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl -index f8c74ffeeefb..735157909c6f 100644 +index f8c74ffeeefbe..735157909c6fb 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -372,3 +372,6 @@ @@ -14273,7 +17869,7 @@ index f8c74ffeeefb..735157909c6f 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl -index 4f504783371f..25b22d311f10 100644 +index 4f504783371fc..25b22d311f108 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -451,3 +451,6 @@ @@ -14284,7 +17880,7 @@ index 4f504783371f..25b22d311f10 100644 
+453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl -index 858d22bf275c..e548c182a33e 100644 +index 858d22bf275c2..e548c182a33ef 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -457,3 +457,6 @@ @@ -14295,7 +17891,7 @@ index 858d22bf275c..e548c182a33e 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl -index 1976317d4e8b..fed21167be44 100644 +index 1976317d4e8b0..fed21167be444 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -390,3 +390,6 @@ @@ -14306,7 +17902,7 @@ index 1976317d4e8b..fed21167be44 100644 +453 n32 process_ksm_disable sys_process_ksm_disable +454 n32 process_ksm_status sys_process_ksm_status diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl -index cfda2511badf..b27ae871f676 100644 +index cfda2511badf3..b27ae871f676f 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -366,3 +366,6 @@ @@ -14317,7 +17913,7 @@ index cfda2511badf..b27ae871f676 100644 +453 n64 process_ksm_disable sys_process_ksm_disable +454 n64 process_ksm_status sys_process_ksm_status diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl -index 7692234c3768..59f298413c29 100644 +index 7692234c37683..59f298413c292 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -439,3 +439,6 @@ @@ -14328,7 +17924,7 @@ index 7692234c3768..59f298413c29 100644 +453 o32 process_ksm_disable sys_process_ksm_disable +454 o32 process_ksm_status sys_process_ksm_status diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl -index a0a9145b6dd4..494b59d1185f 100644 +index a0a9145b6dd4f..494b59d1185fa 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -450,3 +450,6 @@ @@ -14339,7 +17935,7 @@ index a0a9145b6dd4..494b59d1185f 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl -index 8c0b08b7a80e..499d7b233a43 100644 +index 8c0b08b7a80ec..499d7b233a431 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -538,3 +538,6 @@ @@ -14350,7 +17946,7 @@ index 8c0b08b7a80e..499d7b233a43 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl -index a6935af2235c..97b36ce15155 100644 +index a6935af2235ca..97b36ce151556 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -454,3 +454,6 @@ @@ -14361,7 +17957,7 @@ index a6935af2235c..97b36ce15155 100644 +453 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status sys_process_ksm_status diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl -index 97377e8c5025..bd3827e1fc8d 
100644 +index 97377e8c50251..bd3827e1fc8d9 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -454,3 +454,6 @@ @@ -14372,7 +17968,7 @@ index 97377e8c5025..bd3827e1fc8d 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl -index faa835f3c54a..c05e62a0ca02 100644 +index faa835f3c54a5..c05e62a0ca026 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -497,3 +497,6 @@ @@ -14383,7 +17979,7 @@ index faa835f3c54a..c05e62a0ca02 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl -index bc0a3c941b35..c79bd2dd758d 100644 +index bc0a3c941b35c..c79bd2dd758da 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -456,3 +456,6 @@ @@ -14394,7 +17990,7 @@ index bc0a3c941b35..c79bd2dd758d 100644 +453 i386 process_ksm_disable sys_process_ksm_disable +454 i386 process_ksm_status sys_process_ksm_status diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl -index 227538b0ce80..e146a70cc299 100644 +index 227538b0ce801..e146a70cc299f 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -373,6 +373,9 @@ @@ -14408,7 +18004,7 @@ index 227538b0ce80..e146a70cc299 100644 # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl -index 2b69c3c035b6..b7bf81a3ba13 100644 +index 2b69c3c035b6a..b7bf81a3ba133 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -422,3 +422,6 @@ @@ -14419,7 +18015,7 @@ index 2b69c3c035b6..b7bf81a3ba13 100644 +453 common process_ksm_disable sys_process_ksm_disable +454 common process_ksm_status sys_process_ksm_status diff --git a/fs/proc/base.c b/fs/proc/base.c -index 9df3f4839662..0fedd0050577 100644 +index ee4b824658a0a..9b0beb26cbd48 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3207,6 +3207,7 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, @@ -14431,7 +18027,7 @@ index 9df3f4839662..0fedd0050577 100644 seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm)); mmput(mm); diff --git a/include/linux/ksm.h b/include/linux/ksm.h -index 899a314bc487..c2dd786a30e1 100644 +index 899a314bc4872..c2dd786a30e1f 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -26,6 +26,22 @@ int ksm_disable(struct mm_struct *mm); @@ -14469,7 +18065,7 @@ index 899a314bc487..c2dd786a30e1 100644 static inline void collect_procs_ksm(struct page *page, struct list_head *to_kill, int force_early) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h -index 7d30dc4ff0ff..d8d8cc1348d6 100644 +index 7d30dc4ff0ff1..d8d8cc1348d6c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -812,7 +812,7 @@ struct mm_struct { @@ -14496,7 +18092,7 @@ index 7d30dc4ff0ff..d8d8cc1348d6 100644 struct { /* this mm_struct is on lru_gen_mm_list */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h -index 03e3d0121d5e..16597dea90f4 100644 +index 03e3d0121d5e3..16597dea90f40 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -813,6 
+813,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); @@ -14510,7 +18106,7 @@ index 03e3d0121d5e..16597dea90f4 100644 unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h -index fd6c1cb585db..11d0fc82c437 100644 +index fd6c1cb585db4..11d0fc82c4378 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -820,8 +820,17 @@ __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) @@ -14533,7 +18129,7 @@ index fd6c1cb585db..11d0fc82c437 100644 /* * 32 bit systems traditionally used different diff --git a/kernel/sys.c b/kernel/sys.c -index 2410e3999ebe..b0841a2dd2b7 100644 +index 2410e3999ebe5..b0841a2dd2b7a 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2727,6 +2727,153 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -14691,7 +18287,7 @@ index 2410e3999ebe..b0841a2dd2b7 100644 struct getcpu_cache __user *, unused) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c -index 781de7cc6a4e..49a35d35d0f9 100644 +index 781de7cc6a4e1..49a35d35d0f97 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -184,6 +184,9 @@ COND_SYSCALL(mincore); @@ -14705,7 +18301,7 @@ index 781de7cc6a4e..49a35d35d0f9 100644 COND_SYSCALL(mbind); COND_SYSCALL(get_mempolicy); diff --git a/mm/khugepaged.c b/mm/khugepaged.c -index 78c8d5d8b628..4b8b8673d5d9 100644 +index 78c8d5d8b6284..4b8b8673d5d9f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -19,6 +19,7 @@ @@ -14725,7 +18321,7 @@ index 78c8d5d8b628..4b8b8673d5d9 100644 } else { src_page = pte_page(pteval); diff --git a/mm/ksm.c b/mm/ksm.c -index d7b5b95e936e..6b7b8928fb96 100644 +index d7b5b95e936e9..6b7b8928fb965 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -278,6 +278,9 @@ static unsigned int zero_checksum __read_mostly; @@ -14804,7 +18400,7 @@ index d7b5b95e936e..6b7b8928fb96 100644 #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, diff --git a/mm/memory.c b/mm/memory.c -index cdc4d4c1c858..428943ecda25 100644 +index cdc4d4c1c858a..428943ecda254 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1433,8 +1433,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, @@ -14828,7 +18424,7 @@ index cdc4d4c1c858..428943ecda25 100644 } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c -index 26853badae70..0de9d33cd565 100644 +index 26853badae705..0de9d33cd565d 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -29,6 +29,8 @@ @@ -14969,9 +18565,9 @@ index 26853badae70..0de9d33cd565 100644 -- 2.42.0 -From 49274c8196e04f14f8af83a59ff82e2ae00ac21b Mon Sep 17 00:00:00 2001 +From 0de12707e489714de0eb3d25f004f02c01436204 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 10 Jul 2023 17:11:55 +0200 +Date: Wed, 13 Sep 2023 14:33:28 +0200 Subject: [PATCH 7/7] zstd Signed-off-by: Peter Jung @@ -15039,7 +18635,7 @@ Signed-off-by: Peter Jung create mode 100644 lib/zstd/common/bits.h diff --git a/include/linux/zstd.h b/include/linux/zstd.h -index 113408eef6ec..f109d49f43f8 100644 +index 113408eef6ece..f109d49f43f80 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -1,6 +1,6 @@ @@ -15051,7 +18647,7 @@ index 113408eef6ec..f109d49f43f8 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/include/linux/zstd_errors.h 
b/include/linux/zstd_errors.h -index 58b6dd45a969..6d5cf55f0bf3 100644 +index 58b6dd45a969f..6d5cf55f0bf3e 100644 --- a/include/linux/zstd_errors.h +++ b/include/linux/zstd_errors.h @@ -1,5 +1,6 @@ @@ -15117,7 +18713,7 @@ index 58b6dd45a969..6d5cf55f0bf3 100644 } ZSTD_ErrorCode; diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h -index 79d55465d5c1..8b4ffe649df5 100644 +index 79d55465d5c1d..8b4ffe649df57 100644 --- a/include/linux/zstd_lib.h +++ b/include/linux/zstd_lib.h @@ -1,5 +1,6 @@ @@ -16309,7 +19905,7 @@ index 79d55465d5c1..8b4ffe649df5 100644 #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile -index 20f08c644b71..464c410b2768 100644 +index 20f08c644b71a..464c410b2768c 100644 --- a/lib/zstd/Makefile +++ b/lib/zstd/Makefile @@ -1,6 +1,6 @@ @@ -16322,7 +19918,7 @@ index 20f08c644b71..464c410b2768 100644 # This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h new file mode 100644 -index 000000000000..05adbbeccaa9 +index 0000000000000..05adbbeccaa9b --- /dev/null +++ b/lib/zstd/common/allocations.h @@ -0,0 +1,56 @@ @@ -16384,7 +19980,7 @@ index 000000000000..05adbbeccaa9 +#endif /* ZSTD_ALLOCATIONS_H */ diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h new file mode 100644 -index 000000000000..aa3487ec4b6a +index 0000000000000..aa3487ec4b6a7 --- /dev/null +++ b/lib/zstd/common/bits.h @@ -0,0 +1,149 @@ @@ -16538,7 +20134,7 @@ index 000000000000..aa3487ec4b6a + +#endif /* ZSTD_BITS_H */ diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h -index feef3a1b1d60..444dc4f85c64 100644 +index feef3a1b1d600..444dc4f85c649 100644 --- a/lib/zstd/common/bitstream.h +++ b/lib/zstd/common/bitstream.h @@ -1,7 +1,8 @@ @@ -16665,7 +20261,7 @@ index feef3a1b1d60..444dc4f85c64 100644 if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ return BIT_DStream_overflow; diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h -index c42d39faf9bd..c437e0975575 100644 +index c42d39faf9bd8..c437e09755750 100644 --- a/lib/zstd/common/compiler.h +++ b/lib/zstd/common/compiler.h @@ -1,5 +1,6 @@ @@ -16695,7 +20291,7 @@ index c42d39faf9bd..c437e0975575 100644 #endif /* ZSTD_COMPILER_H */ diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h -index 0db7b42407ee..d8319a2bef4c 100644 +index 0db7b42407eea..d8319a2bef4ce 100644 --- a/lib/zstd/common/cpu.h +++ b/lib/zstd/common/cpu.h @@ -1,5 +1,6 @@ @@ -16707,7 +20303,7 @@ index 0db7b42407ee..d8319a2bef4c 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c -index bb863c9ea616..e56ff6464e91 100644 +index bb863c9ea6164..e56ff6464e918 100644 --- a/lib/zstd/common/debug.c +++ b/lib/zstd/common/debug.c @@ -1,7 +1,8 @@ @@ -16721,7 +20317,7 @@ index bb863c9ea616..e56ff6464e91 100644 * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h -index 6dd88d1fbd02..da0dbfc614b8 100644 +index 6dd88d1fbd02c..da0dbfc614b88 100644 --- a/lib/zstd/common/debug.h +++ b/lib/zstd/common/debug.h @@ -1,7 +1,8 @@ @@ -16735,7 +20331,7 @@ index 6dd88d1fbd02..da0dbfc614b8 100644 * You can contact the author at : * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/common/entropy_common.c 
b/lib/zstd/common/entropy_common.c -index fef67056f052..6cdd82233fb5 100644 +index fef67056f0524..6cdd82233fb59 100644 --- a/lib/zstd/common/entropy_common.c +++ b/lib/zstd/common/entropy_common.c @@ -1,6 +1,7 @@ @@ -16853,7 +20449,7 @@ index fef67056f052..6cdd82233fb5 100644 return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); } diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c -index 6d1135f8c373..a4062d30d170 100644 +index 6d1135f8c3733..a4062d30d1703 100644 --- a/lib/zstd/common/error_private.c +++ b/lib/zstd/common/error_private.c @@ -1,5 +1,6 @@ @@ -16901,7 +20497,7 @@ index 6d1135f8c373..a4062d30d170 100644 default: return notErrorCode; } diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h -index ca5101e542fa..9a4699a38a88 100644 +index ca5101e542faa..9a4699a38a881 100644 --- a/lib/zstd/common/error_private.h +++ b/lib/zstd/common/error_private.h @@ -1,5 +1,6 @@ @@ -16913,7 +20509,7 @@ index ca5101e542fa..9a4699a38a88 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h -index 4507043b2287..c4e25a219142 100644 +index 4507043b2287c..c4e25a2191429 100644 --- a/lib/zstd/common/fse.h +++ b/lib/zstd/common/fse.h @@ -1,7 +1,8 @@ @@ -17065,7 +20661,7 @@ index 4507043b2287..c4e25a219142 100644 * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c -index a0d06095be83..45cf457f31ef 100644 +index a0d06095be83d..45cf457f31ef8 100644 --- a/lib/zstd/common/fse_decompress.c +++ b/lib/zstd/common/fse_decompress.c @@ -1,6 +1,7 @@ @@ -17225,7 +20821,7 @@ index a0d06095be83..45cf457f31ef 100644 - #endif /* FSE_COMMONDEFS_ONLY */ diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h -index 5042ff870308..8e7943092ed1 100644 +index 5042ff8703087..8e7943092ed1a 100644 --- a/lib/zstd/common/huf.h +++ b/lib/zstd/common/huf.h @@ -1,7 +1,8 @@ @@ -17552,7 +21148,7 @@ index 5042ff870308..8e7943092ed1 100644 +#endif /* HUF_H_298734234 */ diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h -index 1d9cc03924ca..a7231822b6e3 100644 +index 1d9cc03924ca9..a7231822b6e32 100644 --- a/lib/zstd/common/mem.h +++ b/lib/zstd/common/mem.h @@ -1,6 +1,6 @@ @@ -17564,7 +21160,7 @@ index 1d9cc03924ca..a7231822b6e3 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h -index 0e3b2c0a527d..7ede8cf1ffe5 100644 +index 0e3b2c0a527db..7ede8cf1ffe57 100644 --- a/lib/zstd/common/portability_macros.h +++ b/lib/zstd/common/portability_macros.h @@ -1,5 +1,6 @@ @@ -17618,7 +21214,7 @@ index 0e3b2c0a527d..7ede8cf1ffe5 100644 + #endif /* ZSTD_PORTABILITY_MACROS_H */ diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c -index 3d7e35b309b5..44b95b25344a 100644 +index 3d7e35b309b5d..44b95b25344a1 100644 --- a/lib/zstd/common/zstd_common.c +++ b/lib/zstd/common/zstd_common.c @@ -1,5 +1,6 @@ @@ -17676,7 +21272,7 @@ index 3d7e35b309b5..44b95b25344a 100644 - } -} diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h -index 2c34e8a33a1c..670c5fa2a952 100644 +index 2c34e8a33a1c1..670c5fa2a952d 100644 --- a/lib/zstd/common/zstd_deps.h +++ b/lib/zstd/common/zstd_deps.h @@ -1,6 +1,6 @@ @@ 
-17710,7 +21306,7 @@ index 2c34e8a33a1c..670c5fa2a952 100644 +#endif /* ZSTD_DEPS_STDINT */ +#endif /* ZSTD_DEPS_NEED_STDINT */ diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h -index 93305d9b41bb..7f023e4d4774 100644 +index 93305d9b41bba..7f023e4d47740 100644 --- a/lib/zstd/common/zstd_internal.h +++ b/lib/zstd/common/zstd_internal.h @@ -1,5 +1,6 @@ @@ -17896,7 +21492,7 @@ index 93305d9b41bb..7f023e4d4774 100644 /* ZSTD_invalidateRepCodes() : diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h -index d9a76112ec3a..6ab8be6532ef 100644 +index d9a76112ec3af..6ab8be6532efc 100644 --- a/lib/zstd/compress/clevels.h +++ b/lib/zstd/compress/clevels.h @@ -1,5 +1,6 @@ @@ -17908,7 +21504,7 @@ index d9a76112ec3a..6ab8be6532ef 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c -index ec5b1ca6d71a..e46ca6621b48 100644 +index ec5b1ca6d71af..e46ca6621b488 100644 --- a/lib/zstd/compress/fse_compress.c +++ b/lib/zstd/compress/fse_compress.c @@ -1,6 +1,7 @@ @@ -18027,7 +21623,7 @@ index ec5b1ca6d71a..e46ca6621b48 100644 - #endif /* FSE_COMMONDEFS_ONLY */ diff --git a/lib/zstd/compress/hist.c b/lib/zstd/compress/hist.c -index 3ddc6dfb6894..0b12587cc14b 100644 +index 3ddc6dfb68948..0b12587cc14b1 100644 --- a/lib/zstd/compress/hist.c +++ b/lib/zstd/compress/hist.c @@ -1,7 +1,8 @@ @@ -18041,7 +21637,7 @@ index 3ddc6dfb6894..0b12587cc14b 100644 * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/compress/hist.h b/lib/zstd/compress/hist.h -index fc1830abc9c6..f7687b0fc20a 100644 +index fc1830abc9c63..f7687b0fc20a0 100644 --- a/lib/zstd/compress/hist.h +++ b/lib/zstd/compress/hist.h @@ -1,7 +1,8 @@ @@ -18055,7 +21651,7 @@ index fc1830abc9c6..f7687b0fc20a 100644 * You can contact the author at : * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compress.c -index 74ef0db47621..83241abafe35 100644 +index 74ef0db476210..83241abafe35e 100644 --- a/lib/zstd/compress/huf_compress.c +++ b/lib/zstd/compress/huf_compress.c @@ -1,6 +1,7 @@ @@ -18812,7 +22408,7 @@ index 74ef0db47621..83241abafe35 100644 } - diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c -index f620cafca633..c1c316e9e289 100644 +index f620cafca633b..c1c316e9e289f 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -1,5 +1,6 @@ @@ -22194,7 +25790,7 @@ index f620cafca633..c1c316e9e289 100644 + } +} diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h -index 71697a11ae30..899f5e2de8e9 100644 +index 71697a11ae305..899f5e2de8e96 100644 --- a/lib/zstd/compress/zstd_compress_internal.h +++ b/lib/zstd/compress/zstd_compress_internal.h @@ -1,5 +1,6 @@ @@ -22748,7 +26344,7 @@ index 71697a11ae30..899f5e2de8e9 100644 + #endif /* ZSTD_COMPRESS_H */ diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c -index 52b0a8059aba..3e9ea46a670a 100644 +index 52b0a8059aba9..3e9ea46a670a6 100644 --- a/lib/zstd/compress/zstd_compress_literals.c +++ b/lib/zstd/compress/zstd_compress_literals.c @@ -1,5 +1,6 @@ @@ -22990,7 +26586,7 @@ index 52b0a8059aba..3e9ea46a670a 100644 MEM_writeLE32(ostart, lhc); ostart[4] = (BYTE)(cLitSize >> 10); diff --git 
a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h -index 9775fb97cb70..a2a85d6b69e5 100644 +index 9775fb97cb702..a2a85d6b69e53 100644 --- a/lib/zstd/compress/zstd_compress_literals.h +++ b/lib/zstd/compress/zstd_compress_literals.h @@ -1,5 +1,6 @@ @@ -23034,7 +26630,7 @@ index 9775fb97cb70..a2a85d6b69e5 100644 #endif /* ZSTD_COMPRESS_LITERALS_H */ diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c -index 21ddc1b37acf..5c028c78d889 100644 +index 21ddc1b37acf8..5c028c78d889b 100644 --- a/lib/zstd/compress/zstd_compress_sequences.c +++ b/lib/zstd/compress/zstd_compress_sequences.c @@ -1,5 +1,6 @@ @@ -23064,7 +26660,7 @@ index 21ddc1b37acf..5c028c78d889 100644 * If basic encoding isn't possible, always choose RLE. */ diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h -index 7991364c2f71..7fe6f4ff5cf2 100644 +index 7991364c2f71f..7fe6f4ff5cf25 100644 --- a/lib/zstd/compress/zstd_compress_sequences.h +++ b/lib/zstd/compress/zstd_compress_sequences.h @@ -1,5 +1,6 @@ @@ -23076,7 +26672,7 @@ index 7991364c2f71..7fe6f4ff5cf2 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c -index 17d836cc84e8..dbacbaf72733 100644 +index 17d836cc84e8f..dbacbaf727338 100644 --- a/lib/zstd/compress/zstd_compress_superblock.c +++ b/lib/zstd/compress/zstd_compress_superblock.c @@ -1,5 +1,6 @@ @@ -23175,7 +26771,7 @@ index 17d836cc84e8..dbacbaf72733 100644 ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); } diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h -index 224ece79546e..826bbc9e029b 100644 +index 224ece79546eb..826bbc9e029b1 100644 --- a/lib/zstd/compress/zstd_compress_superblock.h +++ b/lib/zstd/compress/zstd_compress_superblock.h @@ -1,5 +1,6 @@ @@ -23187,7 +26783,7 @@ index 224ece79546e..826bbc9e029b 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h -index 349fc923c355..65ea53b62844 100644 +index 349fc923c355a..65ea53b628447 100644 --- a/lib/zstd/compress/zstd_cwksp.h +++ b/lib/zstd/compress/zstd_cwksp.h @@ -1,5 +1,6 @@ @@ -23492,7 +27088,7 @@ index 349fc923c355..65ea53b62844 100644 diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c -index 76933dea2624..ab9440a99603 100644 +index 76933dea2624e..ab9440a996039 100644 --- a/lib/zstd/compress/zstd_double_fast.c +++ b/lib/zstd/compress/zstd_double_fast.c @@ -1,5 +1,6 @@ @@ -23805,7 +27401,7 @@ index 76933dea2624..ab9440a99603 100644 hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; ip += repLength2; diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h -index 6822bde65a1d..0204f12e4cf7 100644 +index 6822bde65a1d8..0204f12e4cf70 100644 --- a/lib/zstd/compress/zstd_double_fast.h +++ b/lib/zstd/compress/zstd_double_fast.h @@ -1,5 +1,6 @@ @@ -23827,7 +27423,7 @@ index 6822bde65a1d..0204f12e4cf7 100644 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c -index a752e6beab52..3399b39c5dbc 100644 +index a752e6beab52e..3399b39c5dbc5 100644 --- a/lib/zstd/compress/zstd_fast.c +++ b/lib/zstd/compress/zstd_fast.c @@ -1,5 +1,6 @@ @@ 
-24610,7 +28206,7 @@ index a752e6beab52..3399b39c5dbc 100644 { default: /* includes case 3 */ diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h -index fddc2f532d21..e64d9e1b2d39 100644 +index fddc2f532d21d..e64d9e1b2d393 100644 --- a/lib/zstd/compress/zstd_fast.h +++ b/lib/zstd/compress/zstd_fast.h @@ -1,5 +1,6 @@ @@ -24632,7 +28228,7 @@ index fddc2f532d21..e64d9e1b2d39 100644 ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c -index 0298a01a7504..f6b4978ceba7 100644 +index 0298a01a7504a..f6b4978ceba7f 100644 --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -1,5 +1,6 @@ @@ -25695,7 +29291,7 @@ index 0298a01a7504..f6b4978ceba7 100644 return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); } diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h -index e5bdf4df8dde..9505bed93c03 100644 +index e5bdf4df8dde0..9505bed93c031 100644 --- a/lib/zstd/compress/zstd_lazy.h +++ b/lib/zstd/compress/zstd_lazy.h @@ -1,5 +1,6 @@ @@ -25725,7 +29321,7 @@ index e5bdf4df8dde..9505bed93c03 100644 #endif /* ZSTD_LAZY_H */ diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c -index dd86fc83e7dd..b7da76b0db7c 100644 +index dd86fc83e7dde..b7da76b0db7c4 100644 --- a/lib/zstd/compress/zstd_ldm.c +++ b/lib/zstd/compress/zstd_ldm.c @@ -1,5 +1,6 @@ @@ -25769,7 +29365,7 @@ index dd86fc83e7dd..b7da76b0db7c 100644 ip += sequence.matchLength; } diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h -index fbc6a5e88fd7..c540731abde7 100644 +index fbc6a5e88fd7a..c540731abde72 100644 --- a/lib/zstd/compress/zstd_ldm.h +++ b/lib/zstd/compress/zstd_ldm.h @@ -1,5 +1,6 @@ @@ -25781,7 +29377,7 @@ index fbc6a5e88fd7..c540731abde7 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h -index 647f865be290..cfccfc46f6f7 100644 +index 647f865be2903..cfccfc46f6f7b 100644 --- a/lib/zstd/compress/zstd_ldm_geartab.h +++ b/lib/zstd/compress/zstd_ldm_geartab.h @@ -1,5 +1,6 @@ @@ -25793,7 +29389,7 @@ index 647f865be290..cfccfc46f6f7 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c -index fd82acfda62f..1e41cb04f482 100644 +index fd82acfda62f6..1e41cb04f4820 100644 --- a/lib/zstd/compress/zstd_opt.c +++ b/lib/zstd/compress/zstd_opt.c @@ -1,5 +1,6 @@ @@ -26275,7 +29871,7 @@ index fd82acfda62f..1e41cb04f482 100644 ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); } diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h -index 22b862858ba7..faa73ff4b03d 100644 +index 22b862858ba7a..faa73ff4b03dc 100644 --- a/lib/zstd/compress/zstd_opt.h +++ b/lib/zstd/compress/zstd_opt.h @@ -1,5 +1,6 @@ @@ -26287,7 +29883,7 @@ index 22b862858ba7..faa73ff4b03d 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c -index 60958afebc41..d172e35fbd9a 100644 +index 60958afebc415..d172e35fbd9a6 100644 --- a/lib/zstd/decompress/huf_decompress.c +++ b/lib/zstd/decompress/huf_decompress.c @@ -1,7 +1,8 @@ @@ -27478,7 +31074,7 @@ index 60958afebc41..d172e35fbd9a 100644 } - diff --git a/lib/zstd/decompress/zstd_ddict.c 
b/lib/zstd/decompress/zstd_ddict.c -index dbbc7919de53..30ef65e1ab5c 100644 +index dbbc7919de534..30ef65e1ab5ca 100644 --- a/lib/zstd/decompress/zstd_ddict.c +++ b/lib/zstd/decompress/zstd_ddict.c @@ -1,5 +1,6 @@ @@ -27520,7 +31116,7 @@ index dbbc7919de53..30ef65e1ab5c 100644 + return ddict->dictID; } diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h -index 8c1a79d666f8..de459a0dacd1 100644 +index 8c1a79d666f89..de459a0dacd19 100644 --- a/lib/zstd/decompress/zstd_ddict.h +++ b/lib/zstd/decompress/zstd_ddict.h @@ -1,5 +1,6 @@ @@ -27532,7 +31128,7 @@ index 8c1a79d666f8..de459a0dacd1 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c -index 6b3177c94711..03dbdf39109f 100644 +index 6b3177c947114..03dbdf39109f9 100644 --- a/lib/zstd/decompress/zstd_decompress.c +++ b/lib/zstd/decompress/zstd_decompress.c @@ -1,5 +1,6 @@ @@ -28089,7 +31685,7 @@ index 6b3177c94711..03dbdf39109f 100644 + } } diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c -index c1913b8e7c89..9f5577e5bc19 100644 +index c1913b8e7c897..9f5577e5bc19d 100644 --- a/lib/zstd/decompress/zstd_decompress_block.c +++ b/lib/zstd/decompress/zstd_decompress_block.c @@ -1,5 +1,6 @@ @@ -28626,7 +32222,7 @@ index c1913b8e7c89..9f5577e5bc19 100644 + return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); +} diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h -index 3d2d57a5d25a..5888e6cc788b 100644 +index 3d2d57a5d25a7..5888e6cc788b5 100644 --- a/lib/zstd/decompress/zstd_decompress_block.h +++ b/lib/zstd/decompress/zstd_decompress_block.h @@ -1,5 +1,6 @@ @@ -28649,7 +32245,7 @@ index 3d2d57a5d25a..5888e6cc788b 100644 #endif /* ZSTD_DEC_BLOCK_H */ diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h -index 98102edb6a83..32f79fb2873d 100644 +index 98102edb6a832..32f79fb2873df 100644 --- a/lib/zstd/decompress/zstd_decompress_internal.h +++ b/lib/zstd/decompress/zstd_decompress_internal.h @@ -1,5 +1,6 @@ @@ -28684,7 +32280,7 @@ index 98102edb6a83..32f79fb2873d 100644 /* streaming */ ZSTD_dStreamStage streamStage; diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h -index a06ca187aab5..8a47eb2a4514 100644 +index a06ca187aab5f..8a47eb2a45145 100644 --- a/lib/zstd/decompress_sources.h +++ b/lib/zstd/decompress_sources.h @@ -1,6 +1,6 @@ @@ -28696,7 +32292,7 @@ index a06ca187aab5..8a47eb2a4514 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c -index 22686e367e6f..466828e35752 100644 +index 22686e367e6f0..466828e357525 100644 --- a/lib/zstd/zstd_common_module.c +++ b/lib/zstd/zstd_common_module.c @@ -1,6 +1,6 @@ @@ -28718,7 +32314,7 @@ index 22686e367e6f..466828e35752 100644 MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("Zstd Common"); diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c -index 04e1b5c01d9b..8ecf43226af2 100644 +index 04e1b5c01d9b6..8ecf43226af2f 100644 --- a/lib/zstd/zstd_compress_module.c +++ b/lib/zstd/zstd_compress_module.c @@ -1,6 +1,6 @@ @@ -28730,7 +32326,7 @@ index 04e1b5c01d9b..8ecf43226af2 100644 * * This source code is licensed under both the BSD-style license (found in the diff --git a/lib/zstd/zstd_decompress_module.c 
b/lib/zstd/zstd_decompress_module.c -index f4ed952ed485..eb1c49e69722 100644 +index f4ed952ed4852..eb1c49e69722f 100644 --- a/lib/zstd/zstd_decompress_module.c +++ b/lib/zstd/zstd_decompress_module.c @@ -1,6 +1,6 @@ diff --git a/patches/0002-eevdf.patch b/patches/0002-eevdf.patch index 710de28..5991145 100644 --- a/patches/0002-eevdf.patch +++ b/patches/0002-eevdf.patch @@ -1,7 +1,7 @@ -From 9a3788351b1bc830a28d7a51740d2ee964ab8319 Mon Sep 17 00:00:00 2001 +From 410781f5dc233cd65c650a63eb9470aaa45daa7d Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 28 Aug 2023 14:04:00 +0200 -Subject: [PATCH] EEVDF +Date: Tue, 19 Sep 2023 14:32:19 +0200 +Subject: [PATCH] EEVDF-cachy Signed-off-by: Peter Jung --- @@ -476,7 +476,7 @@ index 066ff1c8ae4e..e7e83181fbb6 100644 P(dl.runtime); P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 2c335df30171..e0a4c13dab04 100644 +index ff3386691626..d49c0f979232 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -47,6 +47,7 @@ @@ -1709,7 +1709,7 @@ index 2c335df30171..e0a4c13dab04 100644 #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { -@@ -7816,18 +7870,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) +@@ -7813,18 +7867,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { struct sched_entity *se = &p->se; @@ -1728,7 +1728,7 @@ index 2c335df30171..e0a4c13dab04 100644 if (!task_on_rq_migrating(p)) { remove_entity_load_avg(se); -@@ -7865,66 +7907,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +@@ -7862,66 +7904,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ @@ -1795,7 +1795,7 @@ index 2c335df30171..e0a4c13dab04 100644 static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { -@@ -7936,12 +7918,6 @@ static void set_next_buddy(struct sched_entity *se) +@@ -7933,12 +7915,6 @@ static void set_next_buddy(struct sched_entity *se) } } @@ -1808,7 +1808,7 @@ index 2c335df30171..e0a4c13dab04 100644 /* * Preempt the current task with a newly woken task if needed: */ -@@ -7950,7 +7926,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7947,7 +7923,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); @@ -1816,7 +1816,7 @@ index 2c335df30171..e0a4c13dab04 100644 int next_buddy_marked = 0; int cse_is_idle, pse_is_idle; -@@ -7966,7 +7941,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7963,7 +7938,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; @@ -1825,7 +1825,7 @@ index 2c335df30171..e0a4c13dab04 100644 set_next_buddy(pse); next_buddy_marked = 1; } -@@ -8011,35 +7986,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -8008,35 +7983,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (cse_is_idle != pse_is_idle) return; @@ -1868,7 +1868,7 @@ index 2c335df30171..e0a4c13dab04 100644 } #ifdef CONFIG_SMP -@@ -8240,8 +8199,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +@@ -8237,8 +8196,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) /* * sched_yield() is very simple @@ -1877,7 +1877,7 @@ index 
2c335df30171..e0a4c13dab04 100644 */ static void yield_task_fair(struct rq *rq) { -@@ -8257,21 +8214,19 @@ static void yield_task_fair(struct rq *rq) +@@ -8254,21 +8211,19 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); @@ -1911,7 +1911,7 @@ index 2c335df30171..e0a4c13dab04 100644 } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) -@@ -8514,8 +8469,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) +@@ -8511,8 +8466,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) * Buddy candidates are cache hot: */ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && @@ -1921,7 +1921,7 @@ index 2c335df30171..e0a4c13dab04 100644 return 1; if (sysctl_sched_migration_cost == -1) -@@ -12025,8 +11979,8 @@ static void rq_offline_fair(struct rq *rq) +@@ -12022,8 +11976,8 @@ static void rq_offline_fair(struct rq *rq) static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) { @@ -1931,7 +1931,7 @@ index 2c335df30171..e0a4c13dab04 100644 return (rtime * min_nr_tasks > slice); } -@@ -12182,8 +12136,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) +@@ -12179,8 +12133,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { @@ -1941,7 +1941,7 @@ index 2c335df30171..e0a4c13dab04 100644 struct rq *rq = this_rq(); struct rq_flags rf; -@@ -12192,22 +12146,9 @@ static void task_fork_fair(struct task_struct *p) +@@ -12189,22 +12143,9 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; @@ -1966,7 +1966,7 @@ index 2c335df30171..e0a4c13dab04 100644 rq_unlock(rq, &rf); } -@@ -12236,34 +12177,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +@@ -12233,34 +12174,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } @@ -2001,7 +2001,7 @@ index 2c335df30171..e0a4c13dab04 100644 #ifdef CONFIG_FAIR_GROUP_SCHED /* * Propagate the changes of the sched_entity across the tg tree to make it -@@ -12334,16 +12247,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) +@@ -12331,16 +12244,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -2018,7 +2018,7 @@ index 2c335df30171..e0a4c13dab04 100644 detach_entity_cfs_rq(se); } -@@ -12351,12 +12254,8 @@ static void detach_task_cfs_rq(struct task_struct *p) +@@ -12348,12 +12251,8 @@ static void detach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -2031,7 +2031,7 @@ index 2c335df30171..e0a4c13dab04 100644 } static void switched_from_fair(struct rq *rq, struct task_struct *p) -@@ -12467,6 +12366,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +@@ -12464,6 +12363,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) goto err; tg->shares = NICE_0_LOAD; @@ -2039,7 +2039,7 @@ index 2c335df30171..e0a4c13dab04 100644 init_cfs_bandwidth(tg_cfs_bandwidth(tg)); -@@ -12565,6 +12465,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, +@@ -12562,6 +12462,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, } se->my_q = cfs_rq; @@ -2049,7 +2049,7 @@ index 2c335df30171..e0a4c13dab04 100644 /* guarantee group entities always have weight */ 
update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; -@@ -12695,6 +12598,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) +@@ -12692,6 +12595,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) return 0; } @@ -2079,7 +2079,7 @@ index 2c335df30171..e0a4c13dab04 100644 #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } -@@ -12721,7 +12647,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task +@@ -12718,7 +12644,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) diff --git a/patches/0002-eevdfbore.patch b/patches/0002-eevdfbore.patch index f1133a7..e9a20e8 100644 --- a/patches/0002-eevdfbore.patch +++ b/patches/0002-eevdfbore.patch @@ -1,20 +1,19 @@ -From f353b9eb23586e55b99a6bfe7da9563be5fcca29 Mon Sep 17 00:00:00 2001 +From c92de794ee60ef526ca33cfee59e96f0d95b0697 Mon Sep 17 00:00:00 2001 From: Piotr Gorski -Date: Sat, 12 Aug 2023 21:05:20 +0200 +Date: Sun, 17 Sep 2023 22:51:20 +0200 Subject: [PATCH] bore-eevdf Signed-off-by: Piotr Gorski --- - include/linux/sched.h | 29 ++++++ - init/Kconfig | 20 ++++ - kernel/sched/core.c | 122 +++++++++++++++++++++++ - kernel/sched/debug.c | 4 + - kernel/sched/fair.c | 219 +++++++++++++++++++++++++++++++++++++++--- - kernel/sched/sched.h | 1 + - 6 files changed, 384 insertions(+), 11 deletions(-) + include/linux/sched.h | 31 ++++++++ + init/Kconfig | 20 +++++ + kernel/sched/core.c | 160 +++++++++++++++++++++++++++++++++++++++ + kernel/sched/debug.c | 3 + + kernel/sched/fair.c | 169 +++++++++++++++++++++++++++++++++++++++++- + 5 files changed, 381 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h -index c940c4dc8..984931de0 100644 +index c940c4dc8..e88251927 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -545,6 +545,24 @@ struct sched_statistics { @@ -55,7 +54,7 @@ index c940c4dc8..984931de0 100644 s64 vlag; u64 slice; -@@ -990,6 +1014,11 @@ struct task_struct { +@@ -990,6 +1014,13 @@ struct task_struct { struct list_head children; struct list_head sibling; struct task_struct *group_leader; @@ -63,15 +62,17 @@ index c940c4dc8..984931de0 100644 + u16 child_burst_cache; + u16 child_burst_count_cache; + u64 child_burst_last_cached; ++ u16 group_burst_cache; ++ u64 group_burst_last_cached; +#endif // CONFIG_SCHED_BORE /* * 'ptraced' is the list of tasks this task is using ptrace() on. diff --git a/init/Kconfig b/init/Kconfig -index 71755cc8e..c697be79e 100644 +index 25193a9d5..acc02af18 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -1277,6 +1277,26 @@ config CHECKPOINT_RESTORE +@@ -1278,6 +1278,26 @@ config CHECKPOINT_RESTORE If unsure, say N here. 
@@ -99,10 +100,10 @@ index 71755cc8e..c697be79e 100644 bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index aff81e124..a4eba9e47 100644 +index aff81e124..2cc47b723 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4491,6 +4491,117 @@ int wake_up_state(struct task_struct *p, unsigned int state) +@@ -4491,6 +4491,155 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } @@ -115,6 +116,8 @@ index aff81e124..a4eba9e47 100644 + init_task.child_burst_cache = 0; + init_task.child_burst_count_cache = 0; + init_task.child_burst_last_cached = 0; ++ init_task.group_burst_cache = 0; ++ init_task.group_burst_last_cached = 0; + init_task.se.burst_time = 0; + init_task.se.prev_burst_penalty = 0; + init_task.se.curr_burst_penalty = 0; @@ -125,6 +128,8 @@ index aff81e124..a4eba9e47 100644 + p->child_burst_cache = 0; + p->child_burst_count_cache = 0; + p->child_burst_last_cached = 0; ++ p->group_burst_cache = 0; ++ p->group_burst_last_cached = 0; + p->se.burst_time = 0; + p->se.curr_burst_penalty = 0; +} @@ -140,6 +145,10 @@ index aff81e124..a4eba9e47 100644 + return (p->child_burst_last_cached + sched_burst_cache_lifetime < now); +} + ++static inline bool group_burst_cache_expired(struct task_struct *p, u64 now) { ++ return (p->group_burst_last_cached + sched_burst_cache_lifetime < now); ++} ++ +static void __update_child_burst_cache( + struct task_struct *p, u32 cnt, u32 sum, u64 now) { + u16 avg = 0; @@ -191,36 +200,66 @@ index aff81e124..a4eba9e47 100644 + *asum += sum; +} + ++static void update_group_burst_cache(struct task_struct *p, u64 now) { ++ struct task_struct *member; ++ u32 cnt = 0, sum = 0; ++ u16 avg = 0; ++ ++ for_each_thread(p, member) { ++ cnt++; ++ sum += member->se.burst_penalty; ++ } ++ ++ if (cnt) avg = DIV_ROUND_CLOSEST(sum, cnt); ++ p->group_burst_cache = max(avg, p->se.burst_penalty); ++ p->group_burst_last_cached = now; ++} ++ ++#define forked_task_is_process(p) (p->pid == p->tgid) ++ +static void fork_burst_penalty(struct task_struct *p) { + struct sched_entity *se = &p->se; -+ struct task_struct *anc = p->real_parent; ++ struct task_struct *anc; + u64 now = ktime_get_ns(); -+ u32 cnt = 0; -+ u32 sum = 0; ++ u32 cnt = 0, sum = 0; ++ u16 burst_cache; + -+ read_lock(&tasklist_lock); -+ -+ if (likely(sched_bore) && likely(sched_burst_fork_atavistic)) { -+ while ((anc->real_parent != anc) && (count_child_tasks(anc) == 1)) -+ anc = anc->real_parent; -+ if (child_burst_cache_expired(anc, now)) -+ update_child_burst_cache_atavistic( -+ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); -+ } else -+ if (child_burst_cache_expired(anc, now)) -+ update_child_burst_cache(anc, now); ++ if (likely(sched_bore)) { ++ read_lock(&tasklist_lock); + -+ read_unlock(&tasklist_lock); ++ if (forked_task_is_process(p)) { ++ anc = p->real_parent; ++ if (likely(sched_burst_fork_atavistic)) { ++ while ((anc->real_parent != anc) && ++ (count_child_tasks(anc) == 1)) ++ anc = anc->real_parent; ++ if (child_burst_cache_expired(anc, now)) ++ update_child_burst_cache_atavistic( ++ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); ++ } else ++ if (child_burst_cache_expired(anc, now)) ++ update_child_burst_cache(anc, now); + -+ se->burst_penalty = se->prev_burst_penalty = -+ max(se->prev_burst_penalty, anc->child_burst_cache); ++ burst_cache = anc->child_burst_cache; ++ } else { ++ anc = p->group_leader; ++ if (group_burst_cache_expired(anc, now)) ++ update_group_burst_cache(anc, 
now); ++ ++ burst_cache = anc->group_burst_cache; ++ } ++ ++ read_unlock(&tasklist_lock); ++ se->prev_burst_penalty = max(se->prev_burst_penalty, burst_cache); ++ } ++ se->burst_penalty = se->prev_burst_penalty; +} +#endif // CONFIG_SCHED_BORE + /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. -@@ -4507,6 +4618,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4507,6 +4656,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; @@ -230,7 +269,7 @@ index aff81e124..a4eba9e47 100644 p->se.vlag = 0; INIT_LIST_HEAD(&p->se.group_node); -@@ -4828,6 +4942,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +@@ -4828,6 +4980,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) void sched_post_fork(struct task_struct *p) { @@ -240,31 +279,23 @@ index aff81e124..a4eba9e47 100644 uclamp_post_fork(p); } -@@ -9954,6 +10071,11 @@ void __init sched_init(void) +@@ -9954,6 +10109,11 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 3.1.2 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 3.1.4 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index e7e83181f..ff41a524c 100644 +index e7e83181f..6ebd52247 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c -@@ -348,6 +348,7 @@ static __init int sched_init_debug(void) - #endif - - debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); -+ debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity); - - debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); - debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -@@ -594,6 +595,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -594,6 +594,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); @@ -275,7 +306,7 @@ index e7e83181f..ff41a524c 100644 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 461409c0e..1293fe037 100644 +index d49c0f979..dffee0766 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -288,49 +319,13 @@ index 461409c0e..1293fe037 100644 */ #include #include -@@ -66,17 +69,17 @@ - * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) - * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus - * -- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) -+ * (default SCHED_TUNABLESCALING_NONE = *1) - */ --unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; -+unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +@@ -86,6 +89,67 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; - /* - * Minimal preemption granularity for CPU-bound tasks: - * -- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) -+ * (default: 3 msec * 1, units: 
nanoseconds) - */ --unsigned int sysctl_sched_base_slice = 750000ULL; --static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; -+unsigned int sysctl_sched_base_slice = 3000000ULL; -+static unsigned int normalized_sysctl_sched_base_slice = 3000000ULL; - - /* - * After fork, child runs first. If set to 0 (default) then -@@ -84,8 +87,85 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; - */ - unsigned int sysctl_sched_child_runs_first __read_mostly; - -+/* -+ * SCHED_OTHER wake-up granularity. -+ * -+ * This option delays the preemption effects of decoupled workloads -+ * and reduces their over-scheduling. Synchronous workloads will still -+ * have immediate wakeup/sleep latencies. -+ * -+ * (default: 1.6 msec * 1, units: nanoseconds) -+ */ -+unsigned int sysctl_sched_wakeup_granularity = 1600000UL; -+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1600000UL; -+ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SCHED_BORE +unsigned int __read_mostly sched_bore = 1; ++unsigned int __read_mostly sched_bore_extra_flags = 0; +unsigned int __read_mostly sched_burst_cache_lifetime = 60000000; +unsigned int __read_mostly sched_burst_penalty_offset = 22; +unsigned int __read_mostly sched_burst_penalty_scale = 1366; @@ -387,17 +382,12 @@ index 461409c0e..1293fe037 100644 + se->curr_burst_penalty = 0; + se->burst_time = 0; +} -+ -+static inline void vruntime_backstep(s64 *vdiff, struct sched_entity *se) { -+ u64 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime; -+ *vdiff += delta_exec - penalty_scale(delta_exec, se, false); -+} +#endif // CONFIG_SCHED_BORE + int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { -@@ -145,6 +225,69 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +@@ -145,6 +209,78 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { @@ -412,6 +402,15 @@ index 461409c0e..1293fe037 100644 + .extra2 = SYSCTL_ONE, + }, + { ++ .procname = "sched_bore_extra_flags", ++ .data = &sched_bore_extra_flags, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { + .procname = "sched_burst_cache_lifetime", + .data = &sched_burst_cache_lifetime, + .maxlen = sizeof(unsigned int), @@ -467,20 +466,14 @@ index 461409c0e..1293fe037 100644 { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, -@@ -238,6 +381,7 @@ static void update_sysctl(void) - #define SET_SYSCTL(name) \ - (sysctl_##name = (factor) * normalized_sysctl_##name) - SET_SYSCTL(sched_base_slice); -+ SET_SYSCTL(sched_wakeup_granularity); - #undef SET_SYSCTL - } - -@@ -308,11 +452,20 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight +@@ -308,11 +444,22 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight /* * delta /= w */ +#ifdef CONFIG_SCHED_BORE -+#define calc_delta_fair_half(delta, se) __calc_delta_fair(delta, se, true) ++#define bore_start_debit_full_penalty (sched_bore_extra_flags) ++#define calc_delta_fair_debit(delta, se) \ ++ __calc_delta_fair(delta, se, !bore_start_debit_full_penalty) +#define calc_delta_fair(delta, se) __calc_delta_fair(delta, se, false) +static inline u64 __calc_delta_fair(u64 delta, struct sched_entity *se, bool half) +#else // CONFIG_SCHED_BORE @@ -496,15 +489,7 @@ index 
461409c0e..1293fe037 100644 return delta; } -@@ -944,6 +1097,7 @@ int sched_update_scaling(void) - #define WRT_SYSCTL(name) \ - (normalized_sysctl_##name = sysctl_##name / (factor)) - WRT_SYSCTL(sched_base_slice); -+ WRT_SYSCTL(sched_wakeup_granularity); - #undef WRT_SYSCTL - - return 0; -@@ -1121,7 +1275,11 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -1128,7 +1275,11 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); @@ -517,76 +502,19 @@ index 461409c0e..1293fe037 100644 update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); -@@ -4919,7 +5077,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} +@@ -4926,7 +5077,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { -- u64 vslice = calc_delta_fair(se->slice, se); -+ u64 vslice = calc_delta_fair_half(se->slice, se); ++#ifdef CONFIG_SCHED_BORE ++ u64 vslice = calc_delta_fair_debit(se->slice, se); ++#else // CONFIG_SCHED_BORE + u64 vslice = calc_delta_fair(se->slice, se); ++#endif // CONFIG_SCHED_BORE u64 vruntime = avg_vruntime(cfs_rq); s64 lag = 0; -@@ -5187,6 +5345,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) - se->prev_sum_exec_runtime = se->sum_exec_runtime; - } - -+static int -+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); -+ - /* - * Pick the next process, keeping these things in mind, in this order: - * 1) keep things fair between processes/task groups -@@ -5197,14 +5358,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) - static struct sched_entity * - pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) - { -+ struct sched_entity *candidate = pick_eevdf(cfs_rq); - /* - * Enabling NEXT_BUDDY will affect latency but not fairness. - */ - if (sched_feat(NEXT_BUDDY) && -- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) -+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next) && -+ wakeup_preempt_entity(cfs_rq->next, candidate) < 1) - return cfs_rq->next; - -- return pick_eevdf(cfs_rq); -+ return candidate; - } - - static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -6452,6 +6615,30 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) - hrtick_update(rq); - } - -+static unsigned long wakeup_gran(struct sched_entity *se) -+{ -+ unsigned long gran = sysctl_sched_wakeup_granularity; -+ return calc_delta_fair(gran, se); -+} -+ -+static int -+wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) -+{ -+ s64 gran, vdiff = curr->vruntime - se->vruntime; -+#ifdef CONFIG_SCHED_BORE -+ if (likely(sched_bore)) vruntime_backstep(&vdiff, curr); -+#endif // CONFIG_SCHED_BORE -+ -+ if (vdiff <= 0) -+ return -1; -+ -+ gran = wakeup_gran(se); -+ if (vdiff > gran) -+ return 1; -+ -+ return 0; -+} -+ - static void set_next_buddy(struct sched_entity *se); - - /* -@@ -6470,6 +6657,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6482,6 +6637,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); for_each_sched_entity(se) { @@ -596,16 +524,7 @@ index 461409c0e..1293fe037 100644 cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); -@@ -7980,7 +8170,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ - /* - * XXX pick_eevdf(cfs_rq) != se ? 
- */ -- if (pick_eevdf(cfs_rq) == pse) -+ if ((pick_eevdf(cfs_rq) == pse) && (wakeup_preempt_entity(se, pse) == 1)) - goto preempt; - - return; -@@ -8197,8 +8387,12 @@ static void yield_task_fair(struct rq *rq) +@@ -8206,8 +8364,12 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ @@ -619,7 +538,7 @@ index 461409c0e..1293fe037 100644 clear_buddies(cfs_rq, se); -@@ -8207,6 +8401,9 @@ static void yield_task_fair(struct rq *rq) +@@ -8216,6 +8378,9 @@ static void yield_task_fair(struct rq *rq) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); @@ -629,17 +548,5 @@ index 461409c0e..1293fe037 100644 /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 67cd7e1fd..04d065015 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2506,6 +2506,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; - extern const_debug unsigned int sysctl_sched_migration_cost; - - extern unsigned int sysctl_sched_base_slice; -+extern unsigned int sysctl_sched_wakeup_granularity; - - #ifdef CONFIG_SCHED_DEBUG - extern int sysctl_resched_latency_warn_ms; -- -2.42.0.rc0.25.ga82fb66fed +2.42.0 diff --git a/patches/0003-bcachefs.patch b/patches/0003-bcachefs.patch deleted file mode 100644 index 5bee813..0000000 --- a/patches/0003-bcachefs.patch +++ /dev/null @@ -1,98945 +0,0 @@ -From 31f38fa87a86e086ffcc063e7e24702064eda50f Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Tue, 29 Aug 2023 12:14:18 +0200 -Subject: [PATCH] bcachefs - -Signed-off-by: Piotr Gorski ---- - MAINTAINERS | 32 + - arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- - block/bio.c | 18 +- - block/blk-core.c | 1 + - block/blk.h | 1 - - drivers/accel/ivpu/ivpu_gem.c | 8 +- - drivers/accel/ivpu/ivpu_gem.h | 2 +- - drivers/block/virtio_blk.c | 4 +- - drivers/gpu/drm/gud/gud_drv.c | 2 +- - drivers/md/bcache/Kconfig | 10 +- - drivers/md/bcache/Makefile | 4 +- - drivers/md/bcache/bcache.h | 2 +- - drivers/md/bcache/super.c | 1 - - drivers/md/bcache/util.h | 3 +- - drivers/mmc/core/block.c | 4 +- - drivers/mtd/spi-nor/debugfs.c | 6 +- - .../ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 4 +- - drivers/scsi/sd.c | 8 +- - fs/Kconfig | 1 + - fs/Makefile | 1 + - fs/aio.c | 66 +- - fs/bcachefs/Kconfig | 76 + - fs/bcachefs/Makefile | 83 + - fs/bcachefs/acl.c | 412 +++ - fs/bcachefs/acl.h | 58 + - fs/bcachefs/alloc_background.c | 2157 +++++++++++ - fs/bcachefs/alloc_background.h | 257 ++ - fs/bcachefs/alloc_foreground.c | 1571 ++++++++ - fs/bcachefs/alloc_foreground.h | 224 ++ - fs/bcachefs/alloc_types.h | 126 + - fs/bcachefs/backpointers.c | 873 +++++ - fs/bcachefs/backpointers.h | 131 + - fs/bcachefs/bbpos.h | 48 + - fs/bcachefs/bcachefs.h | 1146 ++++++ - fs/bcachefs/bcachefs_format.h | 2368 ++++++++++++ - fs/bcachefs/bcachefs_ioctl.h | 368 ++ - fs/bcachefs/bkey.c | 1107 ++++++ - fs/bcachefs/bkey.h | 782 ++++ - fs/bcachefs/bkey_buf.h | 61 + - fs/bcachefs/bkey_cmp.h | 129 + - fs/bcachefs/bkey_methods.c | 456 +++ - fs/bcachefs/bkey_methods.h | 188 + - fs/bcachefs/bkey_sort.c | 201 ++ - fs/bcachefs/bkey_sort.h | 44 + - fs/bcachefs/bset.c | 1587 ++++++++ - fs/bcachefs/bset.h | 541 +++ - fs/bcachefs/btree_cache.c | 1274 +++++++ - fs/bcachefs/btree_cache.h | 130 + - fs/bcachefs/btree_gc.c | 2127 +++++++++++ - fs/bcachefs/btree_gc.h | 114 + - fs/bcachefs/btree_io.c | 2245 ++++++++++++ - fs/bcachefs/btree_io.h | 228 ++ - fs/bcachefs/btree_iter.c | 3194 +++++++++++++++++ - 
fs/bcachefs/btree_iter.h | 940 +++++ - fs/bcachefs/btree_journal_iter.c | 531 +++ - fs/bcachefs/btree_journal_iter.h | 57 + - fs/bcachefs/btree_key_cache.c | 1088 ++++++ - fs/bcachefs/btree_key_cache.h | 48 + - fs/bcachefs/btree_locking.c | 797 ++++ - fs/bcachefs/btree_locking.h | 423 +++ - fs/bcachefs/btree_trans_commit.c | 1156 ++++++ - fs/bcachefs/btree_types.h | 746 ++++ - fs/bcachefs/btree_update.c | 898 +++++ - fs/bcachefs/btree_update.h | 353 ++ - fs/bcachefs/btree_update_interior.c | 2488 +++++++++++++ - fs/bcachefs/btree_update_interior.h | 337 ++ - fs/bcachefs/btree_write_buffer.c | 375 ++ - fs/bcachefs/btree_write_buffer.h | 14 + - fs/bcachefs/btree_write_buffer_types.h | 44 + - fs/bcachefs/buckets.c | 2107 +++++++++++ - fs/bcachefs/buckets.h | 413 +++ - fs/bcachefs/buckets_types.h | 92 + - fs/bcachefs/buckets_waiting_for_journal.c | 166 + - fs/bcachefs/buckets_waiting_for_journal.h | 15 + - .../buckets_waiting_for_journal_types.h | 23 + - fs/bcachefs/chardev.c | 769 ++++ - fs/bcachefs/chardev.h | 31 + - fs/bcachefs/checksum.c | 753 ++++ - fs/bcachefs/checksum.h | 211 ++ - fs/bcachefs/clock.c | 193 + - fs/bcachefs/clock.h | 38 + - fs/bcachefs/clock_types.h | 37 + - fs/bcachefs/compress.c | 714 ++++ - fs/bcachefs/compress.h | 55 + - fs/bcachefs/counters.c | 107 + - fs/bcachefs/counters.h | 17 + - fs/bcachefs/darray.h | 87 + - fs/bcachefs/data_update.c | 562 +++ - fs/bcachefs/data_update.h | 43 + - fs/bcachefs/debug.c | 957 +++++ - fs/bcachefs/debug.h | 32 + - fs/bcachefs/dirent.c | 590 +++ - fs/bcachefs/dirent.h | 70 + - fs/bcachefs/disk_groups.c | 556 +++ - fs/bcachefs/disk_groups.h | 106 + - fs/bcachefs/ec.c | 1972 ++++++++++ - fs/bcachefs/ec.h | 260 ++ - fs/bcachefs/ec_types.h | 41 + - fs/bcachefs/errcode.c | 63 + - fs/bcachefs/errcode.h | 252 ++ - fs/bcachefs/error.c | 294 ++ - fs/bcachefs/error.h | 206 ++ - fs/bcachefs/extent_update.c | 173 + - fs/bcachefs/extent_update.h | 12 + - fs/bcachefs/extents.c | 1403 ++++++++ - fs/bcachefs/extents.h | 757 ++++ - fs/bcachefs/extents_types.h | 40 + - fs/bcachefs/eytzinger.h | 281 ++ - fs/bcachefs/fifo.h | 127 + - fs/bcachefs/fs-common.c | 501 +++ - fs/bcachefs/fs-common.h | 43 + - fs/bcachefs/fs-io-buffered.c | 1099 ++++++ - fs/bcachefs/fs-io-buffered.h | 27 + - fs/bcachefs/fs-io-direct.c | 679 ++++ - fs/bcachefs/fs-io-direct.h | 16 + - fs/bcachefs/fs-io-pagecache.c | 788 ++++ - fs/bcachefs/fs-io-pagecache.h | 176 + - fs/bcachefs/fs-io.c | 1250 +++++++ - fs/bcachefs/fs-io.h | 184 + - fs/bcachefs/fs-ioctl.c | 559 +++ - fs/bcachefs/fs-ioctl.h | 81 + - fs/bcachefs/fs.c | 1961 ++++++++++ - fs/bcachefs/fs.h | 209 ++ - fs/bcachefs/fsck.c | 2483 +++++++++++++ - fs/bcachefs/fsck.h | 14 + - fs/bcachefs/inode.c | 1111 ++++++ - fs/bcachefs/inode.h | 204 ++ - fs/bcachefs/io.c | 3051 ++++++++++++++++ - fs/bcachefs/io.h | 202 ++ - fs/bcachefs/io_types.h | 165 + - fs/bcachefs/journal.c | 1438 ++++++++ - fs/bcachefs/journal.h | 526 +++ - fs/bcachefs/journal_io.c | 1888 ++++++++++ - fs/bcachefs/journal_io.h | 65 + - fs/bcachefs/journal_reclaim.c | 874 +++++ - fs/bcachefs/journal_reclaim.h | 86 + - fs/bcachefs/journal_sb.c | 219 ++ - fs/bcachefs/journal_sb.h | 24 + - fs/bcachefs/journal_seq_blacklist.c | 322 ++ - fs/bcachefs/journal_seq_blacklist.h | 22 + - fs/bcachefs/journal_types.h | 345 ++ - fs/bcachefs/keylist.c | 52 + - fs/bcachefs/keylist.h | 74 + - fs/bcachefs/keylist_types.h | 16 + - fs/bcachefs/lru.c | 162 + - fs/bcachefs/lru.h | 69 + - fs/bcachefs/migrate.c | 182 + - fs/bcachefs/migrate.h | 7 + - fs/bcachefs/move.c | 1162 ++++++ - 
fs/bcachefs/move.h | 95 + - fs/bcachefs/move_types.h | 36 + - fs/bcachefs/movinggc.c | 423 +++ - fs/bcachefs/movinggc.h | 12 + - fs/bcachefs/nocow_locking.c | 123 + - fs/bcachefs/nocow_locking.h | 49 + - fs/bcachefs/nocow_locking_types.h | 20 + - fs/bcachefs/opts.c | 599 ++++ - fs/bcachefs/opts.h | 563 +++ - fs/bcachefs/printbuf.c | 415 +++ - fs/bcachefs/printbuf.h | 284 ++ - fs/bcachefs/quota.c | 981 +++++ - fs/bcachefs/quota.h | 74 + - fs/bcachefs/quota_types.h | 43 + - fs/bcachefs/rebalance.c | 368 ++ - fs/bcachefs/rebalance.h | 28 + - fs/bcachefs/rebalance_types.h | 26 + - fs/bcachefs/recovery.c | 1057 ++++++ - fs/bcachefs/recovery.h | 33 + - fs/bcachefs/recovery_types.h | 48 + - fs/bcachefs/reflink.c | 399 ++ - fs/bcachefs/reflink.h | 81 + - fs/bcachefs/replicas.c | 1059 ++++++ - fs/bcachefs/replicas.h | 91 + - fs/bcachefs/replicas_types.h | 27 + - fs/bcachefs/sb-clean.c | 395 ++ - fs/bcachefs/sb-clean.h | 16 + - fs/bcachefs/sb-members.c | 173 + - fs/bcachefs/sb-members.h | 176 + - fs/bcachefs/seqmutex.h | 48 + - fs/bcachefs/siphash.c | 173 + - fs/bcachefs/siphash.h | 87 + - fs/bcachefs/six.c | 918 +++++ - fs/bcachefs/six.h | 388 ++ - fs/bcachefs/snapshot.c | 1687 +++++++++ - fs/bcachefs/snapshot.h | 272 ++ - fs/bcachefs/str_hash.h | 370 ++ - fs/bcachefs/subvolume.c | 451 +++ - fs/bcachefs/subvolume.h | 35 + - fs/bcachefs/subvolume_types.h | 31 + - fs/bcachefs/super-io.c | 1265 +++++++ - fs/bcachefs/super-io.h | 133 + - fs/bcachefs/super.c | 2015 +++++++++++ - fs/bcachefs/super.h | 52 + - fs/bcachefs/super_types.h | 52 + - fs/bcachefs/sysfs.c | 1059 ++++++ - fs/bcachefs/sysfs.h | 48 + - fs/bcachefs/tests.c | 970 +++++ - fs/bcachefs/tests.h | 15 + - fs/bcachefs/trace.c | 16 + - fs/bcachefs/trace.h | 1265 +++++++ - fs/bcachefs/two_state_shared_lock.c | 8 + - fs/bcachefs/two_state_shared_lock.h | 59 + - fs/bcachefs/util.c | 1144 ++++++ - fs/bcachefs/util.h | 851 +++++ - fs/bcachefs/varint.c | 123 + - fs/bcachefs/varint.h | 11 + - fs/bcachefs/vstructs.h | 63 + - fs/bcachefs/xattr.c | 649 ++++ - fs/bcachefs/xattr.h | 50 + - fs/dcache.c | 12 +- - fs/inode.c | 218 +- - fs/iomap/buffered-io.c | 45 +- - fs/xfs/xfs_iomap.c | 3 + - fs/xfs/xfs_mount.h | 2 + - fs/xfs/xfs_super.c | 6 +- - include/linux/bio.h | 7 +- - include/linux/blkdev.h | 1 + - .../md/bcache => include/linux}/closure.h | 46 +- - include/linux/dcache.h | 1 + - include/linux/exportfs.h | 6 + - include/linux/fs.h | 15 +- - include/linux/generic-radix-tree.h | 68 +- - include/linux/gfp_types.h | 90 +- - include/linux/iomap.h | 1 + - include/linux/list_bl.h | 22 + - include/linux/lockdep.h | 10 + - include/linux/lockdep_types.h | 2 +- - include/linux/mean_and_variance.h | 198 + - include/linux/nodemask.h | 2 +- - include/linux/nodemask_types.h | 9 + - include/linux/prandom.h | 1 - - include/linux/sched.h | 5 +- - include/linux/seq_buf.h | 2 + - include/linux/shrinker.h | 9 +- - include/linux/string_helpers.h | 13 +- - init/init_task.c | 1 + - kernel/locking/lockdep.c | 46 + - kernel/locking/mutex.c | 3 + - kernel/locking/osq_lock.c | 2 + - kernel/stacktrace.c | 2 + - lib/Kconfig | 3 + - lib/Kconfig.debug | 18 + - lib/Makefile | 2 + - {drivers/md/bcache => lib}/closure.c | 41 +- - lib/errname.c | 1 + - lib/generic-radix-tree.c | 76 +- - lib/iov_iter.c | 43 +- - lib/math/Kconfig | 3 + - lib/math/Makefile | 2 + - lib/math/mean_and_variance.c | 158 + - lib/math/mean_and_variance_test.c | 239 ++ - lib/rhashtable.c | 9 +- - lib/seq_buf.c | 10 + - lib/string_helpers.c | 26 +- - lib/test-string_helpers.c | 4 +- - mm/hugetlb.c | 8 +- - 
mm/madvise.c | 61 + - mm/oom_kill.c | 23 - - mm/show_mem.c | 22 + - mm/slab.h | 6 +- - mm/slab_common.c | 52 +- - mm/vmscan.c | 99 +- - scripts/Kbuild.include | 10 + - scripts/Makefile.lib | 2 +- - scripts/kallsyms.c | 13 + - 265 files changed, 95211 insertions(+), 312 deletions(-) - create mode 100644 fs/bcachefs/Kconfig - create mode 100644 fs/bcachefs/Makefile - create mode 100644 fs/bcachefs/acl.c - create mode 100644 fs/bcachefs/acl.h - create mode 100644 fs/bcachefs/alloc_background.c - create mode 100644 fs/bcachefs/alloc_background.h - create mode 100644 fs/bcachefs/alloc_foreground.c - create mode 100644 fs/bcachefs/alloc_foreground.h - create mode 100644 fs/bcachefs/alloc_types.h - create mode 100644 fs/bcachefs/backpointers.c - create mode 100644 fs/bcachefs/backpointers.h - create mode 100644 fs/bcachefs/bbpos.h - create mode 100644 fs/bcachefs/bcachefs.h - create mode 100644 fs/bcachefs/bcachefs_format.h - create mode 100644 fs/bcachefs/bcachefs_ioctl.h - create mode 100644 fs/bcachefs/bkey.c - create mode 100644 fs/bcachefs/bkey.h - create mode 100644 fs/bcachefs/bkey_buf.h - create mode 100644 fs/bcachefs/bkey_cmp.h - create mode 100644 fs/bcachefs/bkey_methods.c - create mode 100644 fs/bcachefs/bkey_methods.h - create mode 100644 fs/bcachefs/bkey_sort.c - create mode 100644 fs/bcachefs/bkey_sort.h - create mode 100644 fs/bcachefs/bset.c - create mode 100644 fs/bcachefs/bset.h - create mode 100644 fs/bcachefs/btree_cache.c - create mode 100644 fs/bcachefs/btree_cache.h - create mode 100644 fs/bcachefs/btree_gc.c - create mode 100644 fs/bcachefs/btree_gc.h - create mode 100644 fs/bcachefs/btree_io.c - create mode 100644 fs/bcachefs/btree_io.h - create mode 100644 fs/bcachefs/btree_iter.c - create mode 100644 fs/bcachefs/btree_iter.h - create mode 100644 fs/bcachefs/btree_journal_iter.c - create mode 100644 fs/bcachefs/btree_journal_iter.h - create mode 100644 fs/bcachefs/btree_key_cache.c - create mode 100644 fs/bcachefs/btree_key_cache.h - create mode 100644 fs/bcachefs/btree_locking.c - create mode 100644 fs/bcachefs/btree_locking.h - create mode 100644 fs/bcachefs/btree_trans_commit.c - create mode 100644 fs/bcachefs/btree_types.h - create mode 100644 fs/bcachefs/btree_update.c - create mode 100644 fs/bcachefs/btree_update.h - create mode 100644 fs/bcachefs/btree_update_interior.c - create mode 100644 fs/bcachefs/btree_update_interior.h - create mode 100644 fs/bcachefs/btree_write_buffer.c - create mode 100644 fs/bcachefs/btree_write_buffer.h - create mode 100644 fs/bcachefs/btree_write_buffer_types.h - create mode 100644 fs/bcachefs/buckets.c - create mode 100644 fs/bcachefs/buckets.h - create mode 100644 fs/bcachefs/buckets_types.h - create mode 100644 fs/bcachefs/buckets_waiting_for_journal.c - create mode 100644 fs/bcachefs/buckets_waiting_for_journal.h - create mode 100644 fs/bcachefs/buckets_waiting_for_journal_types.h - create mode 100644 fs/bcachefs/chardev.c - create mode 100644 fs/bcachefs/chardev.h - create mode 100644 fs/bcachefs/checksum.c - create mode 100644 fs/bcachefs/checksum.h - create mode 100644 fs/bcachefs/clock.c - create mode 100644 fs/bcachefs/clock.h - create mode 100644 fs/bcachefs/clock_types.h - create mode 100644 fs/bcachefs/compress.c - create mode 100644 fs/bcachefs/compress.h - create mode 100644 fs/bcachefs/counters.c - create mode 100644 fs/bcachefs/counters.h - create mode 100644 fs/bcachefs/darray.h - create mode 100644 fs/bcachefs/data_update.c - create mode 100644 fs/bcachefs/data_update.h - create mode 100644 fs/bcachefs/debug.c - 
create mode 100644 fs/bcachefs/debug.h - create mode 100644 fs/bcachefs/dirent.c - create mode 100644 fs/bcachefs/dirent.h - create mode 100644 fs/bcachefs/disk_groups.c - create mode 100644 fs/bcachefs/disk_groups.h - create mode 100644 fs/bcachefs/ec.c - create mode 100644 fs/bcachefs/ec.h - create mode 100644 fs/bcachefs/ec_types.h - create mode 100644 fs/bcachefs/errcode.c - create mode 100644 fs/bcachefs/errcode.h - create mode 100644 fs/bcachefs/error.c - create mode 100644 fs/bcachefs/error.h - create mode 100644 fs/bcachefs/extent_update.c - create mode 100644 fs/bcachefs/extent_update.h - create mode 100644 fs/bcachefs/extents.c - create mode 100644 fs/bcachefs/extents.h - create mode 100644 fs/bcachefs/extents_types.h - create mode 100644 fs/bcachefs/eytzinger.h - create mode 100644 fs/bcachefs/fifo.h - create mode 100644 fs/bcachefs/fs-common.c - create mode 100644 fs/bcachefs/fs-common.h - create mode 100644 fs/bcachefs/fs-io-buffered.c - create mode 100644 fs/bcachefs/fs-io-buffered.h - create mode 100644 fs/bcachefs/fs-io-direct.c - create mode 100644 fs/bcachefs/fs-io-direct.h - create mode 100644 fs/bcachefs/fs-io-pagecache.c - create mode 100644 fs/bcachefs/fs-io-pagecache.h - create mode 100644 fs/bcachefs/fs-io.c - create mode 100644 fs/bcachefs/fs-io.h - create mode 100644 fs/bcachefs/fs-ioctl.c - create mode 100644 fs/bcachefs/fs-ioctl.h - create mode 100644 fs/bcachefs/fs.c - create mode 100644 fs/bcachefs/fs.h - create mode 100644 fs/bcachefs/fsck.c - create mode 100644 fs/bcachefs/fsck.h - create mode 100644 fs/bcachefs/inode.c - create mode 100644 fs/bcachefs/inode.h - create mode 100644 fs/bcachefs/io.c - create mode 100644 fs/bcachefs/io.h - create mode 100644 fs/bcachefs/io_types.h - create mode 100644 fs/bcachefs/journal.c - create mode 100644 fs/bcachefs/journal.h - create mode 100644 fs/bcachefs/journal_io.c - create mode 100644 fs/bcachefs/journal_io.h - create mode 100644 fs/bcachefs/journal_reclaim.c - create mode 100644 fs/bcachefs/journal_reclaim.h - create mode 100644 fs/bcachefs/journal_sb.c - create mode 100644 fs/bcachefs/journal_sb.h - create mode 100644 fs/bcachefs/journal_seq_blacklist.c - create mode 100644 fs/bcachefs/journal_seq_blacklist.h - create mode 100644 fs/bcachefs/journal_types.h - create mode 100644 fs/bcachefs/keylist.c - create mode 100644 fs/bcachefs/keylist.h - create mode 100644 fs/bcachefs/keylist_types.h - create mode 100644 fs/bcachefs/lru.c - create mode 100644 fs/bcachefs/lru.h - create mode 100644 fs/bcachefs/migrate.c - create mode 100644 fs/bcachefs/migrate.h - create mode 100644 fs/bcachefs/move.c - create mode 100644 fs/bcachefs/move.h - create mode 100644 fs/bcachefs/move_types.h - create mode 100644 fs/bcachefs/movinggc.c - create mode 100644 fs/bcachefs/movinggc.h - create mode 100644 fs/bcachefs/nocow_locking.c - create mode 100644 fs/bcachefs/nocow_locking.h - create mode 100644 fs/bcachefs/nocow_locking_types.h - create mode 100644 fs/bcachefs/opts.c - create mode 100644 fs/bcachefs/opts.h - create mode 100644 fs/bcachefs/printbuf.c - create mode 100644 fs/bcachefs/printbuf.h - create mode 100644 fs/bcachefs/quota.c - create mode 100644 fs/bcachefs/quota.h - create mode 100644 fs/bcachefs/quota_types.h - create mode 100644 fs/bcachefs/rebalance.c - create mode 100644 fs/bcachefs/rebalance.h - create mode 100644 fs/bcachefs/rebalance_types.h - create mode 100644 fs/bcachefs/recovery.c - create mode 100644 fs/bcachefs/recovery.h - create mode 100644 fs/bcachefs/recovery_types.h - create mode 100644 
fs/bcachefs/reflink.c - create mode 100644 fs/bcachefs/reflink.h - create mode 100644 fs/bcachefs/replicas.c - create mode 100644 fs/bcachefs/replicas.h - create mode 100644 fs/bcachefs/replicas_types.h - create mode 100644 fs/bcachefs/sb-clean.c - create mode 100644 fs/bcachefs/sb-clean.h - create mode 100644 fs/bcachefs/sb-members.c - create mode 100644 fs/bcachefs/sb-members.h - create mode 100644 fs/bcachefs/seqmutex.h - create mode 100644 fs/bcachefs/siphash.c - create mode 100644 fs/bcachefs/siphash.h - create mode 100644 fs/bcachefs/six.c - create mode 100644 fs/bcachefs/six.h - create mode 100644 fs/bcachefs/snapshot.c - create mode 100644 fs/bcachefs/snapshot.h - create mode 100644 fs/bcachefs/str_hash.h - create mode 100644 fs/bcachefs/subvolume.c - create mode 100644 fs/bcachefs/subvolume.h - create mode 100644 fs/bcachefs/subvolume_types.h - create mode 100644 fs/bcachefs/super-io.c - create mode 100644 fs/bcachefs/super-io.h - create mode 100644 fs/bcachefs/super.c - create mode 100644 fs/bcachefs/super.h - create mode 100644 fs/bcachefs/super_types.h - create mode 100644 fs/bcachefs/sysfs.c - create mode 100644 fs/bcachefs/sysfs.h - create mode 100644 fs/bcachefs/tests.c - create mode 100644 fs/bcachefs/tests.h - create mode 100644 fs/bcachefs/trace.c - create mode 100644 fs/bcachefs/trace.h - create mode 100644 fs/bcachefs/two_state_shared_lock.c - create mode 100644 fs/bcachefs/two_state_shared_lock.h - create mode 100644 fs/bcachefs/util.c - create mode 100644 fs/bcachefs/util.h - create mode 100644 fs/bcachefs/varint.c - create mode 100644 fs/bcachefs/varint.h - create mode 100644 fs/bcachefs/vstructs.h - create mode 100644 fs/bcachefs/xattr.c - create mode 100644 fs/bcachefs/xattr.h - rename {drivers/md/bcache => include/linux}/closure.h (93%) - create mode 100644 include/linux/mean_and_variance.h - create mode 100644 include/linux/nodemask_types.h - rename {drivers/md/bcache => lib}/closure.c (85%) - create mode 100644 lib/math/mean_and_variance.c - create mode 100644 lib/math/mean_and_variance_test.c - -diff --git a/MAINTAINERS b/MAINTAINERS -index 4cc6bf79f..9c7fa5956 100644 ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -3458,6 +3458,14 @@ W: http://bcache.evilpiepirate.org - C: irc://irc.oftc.net/bcache - F: drivers/md/bcache/ - -+BCACHEFS -+M: Kent Overstreet -+R: Brian Foster -+L: linux-bcachefs@vger.kernel.org -+S: Supported -+C: irc://irc.oftc.net/bcache -+F: fs/bcachefs/ -+ - BDISP ST MEDIA DRIVER - M: Fabien Dessenne - L: linux-media@vger.kernel.org -@@ -5027,6 +5035,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core - F: Documentation/devicetree/bindings/timer/ - F: drivers/clocksource/ - -+CLOSURES -+M: Kent Overstreet -+L: linux-bcachefs@vger.kernel.org -+S: Supported -+C: irc://irc.oftc.net/bcache -+F: include/linux/closure.h -+F: lib/closure.c -+ - CMPC ACPI DRIVER - M: Thadeu Lima de Souza Cascardo - M: Daniel Oliveira Nascimento -@@ -8673,6 +8689,13 @@ F: Documentation/devicetree/bindings/power/power?domain* - F: drivers/base/power/domain*.c - F: include/linux/pm_domain.h - -+GENERIC RADIX TREE -+M: Kent Overstreet -+S: Supported -+C: irc://irc.oftc.net/bcache -+F: include/linux/generic-radix-tree.h -+F: lib/generic-radix-tree.c -+ - GENERIC RESISTIVE TOUCHSCREEN ADC DRIVER - M: Eugen Hristev - L: linux-input@vger.kernel.org -@@ -12925,6 +12948,15 @@ S: Maintained - F: drivers/net/mdio/mdio-regmap.c - F: include/linux/mdio/mdio-regmap.h - -+MEAN AND VARIANCE LIBRARY -+M: Daniel B. 
Hill -+M: Kent Overstreet -+S: Maintained -+T: git https://github.com/YellowOnion/linux/ -+F: include/linux/mean_and_variance.h -+F: lib/math/mean_and_variance.c -+F: lib/math/mean_and_variance_test.c -+ - MEASUREMENT COMPUTING CIO-DAC IIO DRIVER - M: William Breathitt Gray - L: linux-iio@vger.kernel.org -diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c -index e7ea492ac..5936205bf 100644 ---- a/arch/powerpc/mm/book3s64/radix_pgtable.c -+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c -@@ -261,7 +261,7 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e - if (end <= start) - return; - -- string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); -+ string_get_size(size, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); - - pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, - exec ? " (exec)" : ""); -diff --git a/block/bio.c b/block/bio.c -index 867217921..425b3da39 100644 ---- a/block/bio.c -+++ b/block/bio.c -@@ -606,15 +606,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) - } - EXPORT_SYMBOL(bio_kmalloc); - --void zero_fill_bio(struct bio *bio) -+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) - { - struct bio_vec bv; - struct bvec_iter iter; - -- bio_for_each_segment(bv, bio, iter) -+ __bio_for_each_segment(bv, bio, iter, start) - memzero_bvec(&bv); - } --EXPORT_SYMBOL(zero_fill_bio); -+EXPORT_SYMBOL(zero_fill_bio_iter); - - /** - * bio_truncate - truncate the bio to small size of @new_size -@@ -1252,7 +1252,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) - struct page **pages = (struct page **)bv; - ssize_t size, left; - unsigned len, i = 0; -- size_t offset, trim; -+ size_t offset; - int ret = 0; - - /* -@@ -1281,10 +1281,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) - - nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - -- trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); -- iov_iter_revert(iter, trim); -+ if (bio->bi_bdev) { -+ size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); -+ iov_iter_revert(iter, trim); -+ size -= trim; -+ } - -- size -= trim; - if (unlikely(!size)) { - ret = -EFAULT; - goto out; -@@ -1490,6 +1492,7 @@ void bio_set_pages_dirty(struct bio *bio) - set_page_dirty_lock(bvec->bv_page); - } - } -+EXPORT_SYMBOL_GPL(bio_set_pages_dirty); - - /* - * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. 
-@@ -1549,6 +1552,7 @@ void bio_check_pages_dirty(struct bio *bio) - spin_unlock_irqrestore(&bio_dirty_lock, flags); - schedule_work(&bio_dirty_work); - } -+EXPORT_SYMBOL_GPL(bio_check_pages_dirty); - - static inline bool bio_remaining_done(struct bio *bio) - { -diff --git a/block/blk-core.c b/block/blk-core.c -index 9866468c7..9d51e9894 100644 ---- a/block/blk-core.c -+++ b/block/blk-core.c -@@ -208,6 +208,7 @@ const char *blk_status_to_str(blk_status_t status) - return ""; - return blk_errors[idx].name; - } -+EXPORT_SYMBOL_GPL(blk_status_to_str); - - /** - * blk_sync_queue - cancel any pending callbacks on a queue -diff --git a/block/blk.h b/block/blk.h -index 608c5dcc5..47e03fc44 100644 ---- a/block/blk.h -+++ b/block/blk.h -@@ -251,7 +251,6 @@ static inline void bio_integrity_free(struct bio *bio) - - unsigned long blk_rq_timeout(unsigned long timeout); - void blk_add_timer(struct request *req); --const char *blk_status_to_str(blk_status_t status); - - bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs); -diff --git a/drivers/accel/ivpu/ivpu_gem.c b/drivers/accel/ivpu/ivpu_gem.c -index 9967fcfa2..4e8122fb6 100644 ---- a/drivers/accel/ivpu/ivpu_gem.c -+++ b/drivers/accel/ivpu/ivpu_gem.c -@@ -61,7 +61,7 @@ static void prime_unmap_pages_locked(struct ivpu_bo *bo) - static const struct ivpu_bo_ops prime_ops = { - .type = IVPU_BO_TYPE_PRIME, - .name = "prime", -- .alloc_pages = prime_alloc_pages_locked, -+ .alloc_pages_op = prime_alloc_pages_locked, - .free_pages = prime_free_pages_locked, - .map_pages = prime_map_pages_locked, - .unmap_pages = prime_unmap_pages_locked, -@@ -134,7 +134,7 @@ static void ivpu_bo_unmap_pages_locked(struct ivpu_bo *bo) - static const struct ivpu_bo_ops shmem_ops = { - .type = IVPU_BO_TYPE_SHMEM, - .name = "shmem", -- .alloc_pages = shmem_alloc_pages_locked, -+ .alloc_pages_op = shmem_alloc_pages_locked, - .free_pages = shmem_free_pages_locked, - .map_pages = ivpu_bo_map_pages_locked, - .unmap_pages = ivpu_bo_unmap_pages_locked, -@@ -186,7 +186,7 @@ static void internal_free_pages_locked(struct ivpu_bo *bo) - static const struct ivpu_bo_ops internal_ops = { - .type = IVPU_BO_TYPE_INTERNAL, - .name = "internal", -- .alloc_pages = internal_alloc_pages_locked, -+ .alloc_pages_op = internal_alloc_pages_locked, - .free_pages = internal_free_pages_locked, - .map_pages = ivpu_bo_map_pages_locked, - .unmap_pages = ivpu_bo_unmap_pages_locked, -@@ -200,7 +200,7 @@ static int __must_check ivpu_bo_alloc_and_map_pages_locked(struct ivpu_bo *bo) - lockdep_assert_held(&bo->lock); - drm_WARN_ON(&vdev->drm, bo->sgt); - -- ret = bo->ops->alloc_pages(bo); -+ ret = bo->ops->alloc_pages_op(bo); - if (ret) { - ivpu_err(vdev, "Failed to allocate pages for BO: %d", ret); - return ret; -diff --git a/drivers/accel/ivpu/ivpu_gem.h b/drivers/accel/ivpu/ivpu_gem.h -index 6b0ceda5f..b81cf2af0 100644 ---- a/drivers/accel/ivpu/ivpu_gem.h -+++ b/drivers/accel/ivpu/ivpu_gem.h -@@ -42,7 +42,7 @@ enum ivpu_bo_type { - struct ivpu_bo_ops { - enum ivpu_bo_type type; - const char *name; -- int (*alloc_pages)(struct ivpu_bo *bo); -+ int (*alloc_pages_op)(struct ivpu_bo *bo); - void (*free_pages)(struct ivpu_bo *bo); - int (*map_pages)(struct ivpu_bo *bo); - void (*unmap_pages)(struct ivpu_bo *bo); -diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c -index 1fe011676..59140424d 100644 ---- a/drivers/block/virtio_blk.c -+++ b/drivers/block/virtio_blk.c -@@ -986,9 +986,9 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) 
- nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9); - - string_get_size(nblocks, queue_logical_block_size(q), -- STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); -+ STRING_SIZE_BASE2, cap_str_2, sizeof(cap_str_2)); - string_get_size(nblocks, queue_logical_block_size(q), -- STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); -+ 0, cap_str_10, sizeof(cap_str_10)); - - dev_notice(&vdev->dev, - "[%s] %s%llu %d-byte logical blocks (%s/%s)\n", -diff --git a/drivers/gpu/drm/gud/gud_drv.c b/drivers/gpu/drm/gud/gud_drv.c -index 9d7bf8ee4..6b1748e1f 100644 ---- a/drivers/gpu/drm/gud/gud_drv.c -+++ b/drivers/gpu/drm/gud/gud_drv.c -@@ -329,7 +329,7 @@ static int gud_stats_debugfs(struct seq_file *m, void *data) - struct gud_device *gdrm = to_gud_device(entry->dev); - char buf[10]; - -- string_get_size(gdrm->bulk_len, 1, STRING_UNITS_2, buf, sizeof(buf)); -+ string_get_size(gdrm->bulk_len, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); - seq_printf(m, "Max buffer size: %s\n", buf); - seq_printf(m, "Number of errors: %u\n", gdrm->stats_num_errors); - -diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig -index 529c9d04e..b2d10063d 100644 ---- a/drivers/md/bcache/Kconfig -+++ b/drivers/md/bcache/Kconfig -@@ -4,6 +4,7 @@ config BCACHE - tristate "Block device as cache" - select BLOCK_HOLDER_DEPRECATED if SYSFS - select CRC64 -+ select CLOSURES - help - Allows a block device to be used as cache for other devices; uses - a btree for indexing and the layout is optimized for SSDs. -@@ -19,15 +20,6 @@ config BCACHE_DEBUG - Enables extra debugging tools, allows expensive runtime checks to be - turned on. - --config BCACHE_CLOSURES_DEBUG -- bool "Debug closures" -- depends on BCACHE -- select DEBUG_FS -- help -- Keeps all active closures in a linked list and provides a debugfs -- interface to list them, which makes it possible to see asynchronous -- operations that get stuck. 
-- - config BCACHE_ASYNC_REGISTRATION - bool "Asynchronous device registration" - depends on BCACHE -diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile -index 5b87e5967..054e8a33a 100644 ---- a/drivers/md/bcache/Makefile -+++ b/drivers/md/bcache/Makefile -@@ -2,6 +2,6 @@ - - obj-$(CONFIG_BCACHE) += bcache.o - --bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ -- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ -+bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ -+ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ - util.o writeback.o features.o -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 5a79bb3c2..7c0d00432 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -179,6 +179,7 @@ - #define pr_fmt(fmt) "bcache: %s() " fmt, __func__ - - #include -+#include - #include - #include - #include -@@ -192,7 +193,6 @@ - #include "bcache_ondisk.h" - #include "bset.h" - #include "util.h" --#include "closure.h" - - struct bucket { - atomic_t pin; -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 0ae2b3676..4affe5875 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -2905,7 +2905,6 @@ static int __init bcache_init(void) - goto err; - - bch_debug_init(); -- closure_debug_init(); - - bcache_is_reboot = false; - -diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index 6f3cb7c92..f61ab1bad 100644 ---- a/drivers/md/bcache/util.h -+++ b/drivers/md/bcache/util.h -@@ -4,6 +4,7 @@ - #define _BCACHE_UTIL_H - - #include -+#include - #include - #include - #include -@@ -13,8 +14,6 @@ - #include - #include - --#include "closure.h" -- - struct closure; - - #ifdef CONFIG_BCACHE_DEBUG -diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c -index b6f4be25b..a09ce965c 100644 ---- a/drivers/mmc/core/block.c -+++ b/drivers/mmc/core/block.c -@@ -2510,7 +2510,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, - - blk_queue_write_cache(md->queue.queue, cache_enabled, fua_enabled); - -- string_get_size((u64)size, 512, STRING_UNITS_2, -+ string_get_size((u64)size, 512, STRING_SIZE_BASE2, - cap_str, sizeof(cap_str)); - pr_info("%s: %s %s %s%s\n", - md->disk->disk_name, mmc_card_id(card), mmc_card_name(card), -@@ -2706,7 +2706,7 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, - - list_add(&rpmb->node, &md->rpmbs); - -- string_get_size((u64)size, 512, STRING_UNITS_2, -+ string_get_size((u64)size, 512, STRING_SIZE_BASE2, - cap_str, sizeof(cap_str)); - - pr_info("%s: %s %s %s, chardev (%d:%d)\n", -diff --git a/drivers/mtd/spi-nor/debugfs.c b/drivers/mtd/spi-nor/debugfs.c -index e11536fff..9f1ea83e2 100644 ---- a/drivers/mtd/spi-nor/debugfs.c -+++ b/drivers/mtd/spi-nor/debugfs.c -@@ -84,7 +84,7 @@ static int spi_nor_params_show(struct seq_file *s, void *data) - - seq_printf(s, "name\t\t%s\n", info->name); - seq_printf(s, "id\t\t%*ph\n", SPI_NOR_MAX_ID_LEN, nor->id); -- string_get_size(params->size, 1, STRING_UNITS_2, buf, sizeof(buf)); -+ string_get_size(params->size, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); - seq_printf(s, "size\t\t%s\n", buf); - seq_printf(s, "write size\t%u\n", params->writesize); - seq_printf(s, "page size\t%u\n", params->page_size); -@@ -129,14 +129,14 @@ static int spi_nor_params_show(struct seq_file *s, void *data) - struct spi_nor_erase_type *et = &erase_map->erase_type[i]; - - if (et->size) { -- string_get_size(et->size, 1, STRING_UNITS_2, buf, -+ 
string_get_size(et->size, 1, STRING_SIZE_BASE2, buf, - sizeof(buf)); - seq_printf(s, " %02x (%s) [%d]\n", et->opcode, buf, i); - } - } - - if (!(nor->flags & SNOR_F_NO_OP_CHIP_ERASE)) { -- string_get_size(params->size, 1, STRING_UNITS_2, buf, sizeof(buf)); -+ string_get_size(params->size, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); - seq_printf(s, " %02x (%s)\n", SPINOR_OP_CHIP_ERASE, buf); - } - -diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c -index 14e0d989c..7d5fbebd3 100644 ---- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c -+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c -@@ -3457,8 +3457,8 @@ static void mem_region_show(struct seq_file *seq, const char *name, - { - char buf[40]; - -- string_get_size((u64)to - from + 1, 1, STRING_UNITS_2, buf, -- sizeof(buf)); -+ string_get_size((u64)to - from + 1, 1, STRING_SIZE_BASE2, -+ buf, sizeof(buf)); - seq_printf(seq, "%-15s %#x-%#x [%s]\n", name, from, to, buf); - } - -diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c -index 3c668cfb1..c9abe8f9a 100644 ---- a/drivers/scsi/sd.c -+++ b/drivers/scsi/sd.c -@@ -2681,10 +2681,10 @@ sd_print_capacity(struct scsi_disk *sdkp, - if (!sdkp->first_scan && old_capacity == sdkp->capacity) - return; - -- string_get_size(sdkp->capacity, sector_size, -- STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); -- string_get_size(sdkp->capacity, sector_size, -- STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); -+ string_get_size(sdkp->capacity, sector_size, STRING_SIZE_BASE2, -+ cap_str_2, sizeof(cap_str_2)); -+ string_get_size(sdkp->capacity, sector_size, 0, -+ cap_str_10, sizeof(cap_str_10)); - - sd_printk(KERN_NOTICE, sdkp, - "%llu %d-byte logical blocks: (%s/%s)\n", -diff --git a/fs/Kconfig b/fs/Kconfig -index 18d034ec7..b05c45f63 100644 ---- a/fs/Kconfig -+++ b/fs/Kconfig -@@ -44,6 +44,7 @@ source "fs/ocfs2/Kconfig" - source "fs/btrfs/Kconfig" - source "fs/nilfs2/Kconfig" - source "fs/f2fs/Kconfig" -+source "fs/bcachefs/Kconfig" - source "fs/zonefs/Kconfig" - - endif # BLOCK -diff --git a/fs/Makefile b/fs/Makefile -index e513aaee0..cd357ea45 100644 ---- a/fs/Makefile -+++ b/fs/Makefile -@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ - obj-$(CONFIG_BTRFS_FS) += btrfs/ - obj-$(CONFIG_GFS2_FS) += gfs2/ - obj-$(CONFIG_F2FS_FS) += f2fs/ -+obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ - obj-$(CONFIG_CEPH_FS) += ceph/ - obj-$(CONFIG_PSTORE) += pstore/ - obj-$(CONFIG_EFIVAR_FS) += efivarfs/ -diff --git a/fs/aio.c b/fs/aio.c -index 77e33619d..5db996acc 100644 ---- a/fs/aio.c -+++ b/fs/aio.c -@@ -1106,6 +1106,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb) - kmem_cache_free(kiocb_cachep, iocb); - } - -+struct aio_waiter { -+ struct wait_queue_entry w; -+ size_t min_nr; -+}; -+ - /* aio_complete - * Called when the io request on the given iocb is complete. - */ -@@ -1114,7 +1119,7 @@ static void aio_complete(struct aio_kiocb *iocb) - struct kioctx *ctx = iocb->ki_ctx; - struct aio_ring *ring; - struct io_event *ev_page, *event; -- unsigned tail, pos, head; -+ unsigned tail, pos, head, avail; - unsigned long flags; - - /* -@@ -1156,6 +1161,10 @@ static void aio_complete(struct aio_kiocb *iocb) - ctx->completed_events++; - if (ctx->completed_events > 1) - refill_reqs_available(ctx, head, tail); -+ -+ avail = tail > head -+ ? 
tail - head -+ : tail + ctx->nr_events - head; - spin_unlock_irqrestore(&ctx->completion_lock, flags); - - pr_debug("added to ring %p at [%u]\n", iocb, tail); -@@ -1176,8 +1185,18 @@ static void aio_complete(struct aio_kiocb *iocb) - */ - smp_mb(); - -- if (waitqueue_active(&ctx->wait)) -- wake_up(&ctx->wait); -+ if (waitqueue_active(&ctx->wait)) { -+ struct aio_waiter *curr, *next; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&ctx->wait.lock, flags); -+ list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry) -+ if (avail >= curr->min_nr) { -+ list_del_init_careful(&curr->w.entry); -+ wake_up_process(curr->w.private); -+ } -+ spin_unlock_irqrestore(&ctx->wait.lock, flags); -+ } - } - - static inline void iocb_put(struct aio_kiocb *iocb) -@@ -1290,7 +1309,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, - struct io_event __user *event, - ktime_t until) - { -- long ret = 0; -+ struct hrtimer_sleeper t; -+ struct aio_waiter w; -+ long ret = 0, ret2 = 0; - - /* - * Note that aio_read_events() is being called as the conditional - i.e. -@@ -1306,12 +1327,37 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, - * the ringbuffer empty. So in practice we should be ok, but it's - * something to be aware of when touching this code. - */ -- if (until == 0) -- aio_read_events(ctx, min_nr, nr, event, &ret); -- else -- wait_event_interruptible_hrtimeout(ctx->wait, -- aio_read_events(ctx, min_nr, nr, event, &ret), -- until); -+ aio_read_events(ctx, min_nr, nr, event, &ret); -+ if (until == 0 || ret < 0 || ret >= min_nr) -+ return ret; -+ -+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ if (until != KTIME_MAX) { -+ hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns); -+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); -+ } -+ -+ init_wait(&w.w); -+ -+ while (1) { -+ unsigned long nr_got = ret; -+ -+ w.min_nr = min_nr - ret; -+ -+ ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE) ?: -+ !t.task ? -ETIME : 0; -+ -+ if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2) -+ break; -+ -+ if (nr_got == ret) -+ schedule(); -+ } -+ -+ finish_wait(&ctx->wait, &w.w); -+ hrtimer_cancel(&t.timer); -+ destroy_hrtimer_on_stack(&t.timer); -+ - return ret; - } - -diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig -new file mode 100644 -index 000000000..fb5b24f20 ---- /dev/null -+++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,76 @@ -+ -+config BCACHEFS_FS -+ tristate "bcachefs filesystem support (EXPERIMENTAL)" -+ depends on BLOCK -+ select EXPORTFS -+ select CLOSURES -+ select LIBCRC32C -+ select CRC64 -+ select FS_POSIX_ACL -+ select LZ4_COMPRESS -+ select LZ4_DECOMPRESS -+ select LZ4HC_COMPRESS -+ select LZ4HC_DECOMPRESS -+ select ZLIB_DEFLATE -+ select ZLIB_INFLATE -+ select ZSTD_COMPRESS -+ select ZSTD_DECOMPRESS -+ select CRYPTO_SHA256 -+ select CRYPTO_CHACHA20 -+ select CRYPTO_POLY1305 -+ select KEYS -+ select RAID6_PQ -+ select XOR_BLOCKS -+ select XXHASH -+ select SRCU -+ select SYMBOLIC_ERRNAME -+ select MEAN_AND_VARIANCE -+ help -+ The bcachefs filesystem - a modern, copy on write filesystem, with -+ support for multiple devices, compression, checksumming, etc. 
-+ -+config BCACHEFS_QUOTA -+ bool "bcachefs quota support" -+ depends on BCACHEFS_FS -+ select QUOTACTL -+ -+config BCACHEFS_POSIX_ACL -+ bool "bcachefs POSIX ACL support" -+ depends on BCACHEFS_FS -+ select FS_POSIX_ACL -+ -+config BCACHEFS_DEBUG_TRANSACTIONS -+ bool "bcachefs runtime info" -+ depends on BCACHEFS_FS -+ default y -+ help -+ This makes the list of running btree transactions available in debugfs. -+ -+ This is a highly useful debugging feature but does add a small amount of overhead. -+ -+config BCACHEFS_DEBUG -+ bool "bcachefs debugging" -+ depends on BCACHEFS_FS -+ help -+ Enables many extra debugging checks and assertions. -+ -+ The resulting code will be significantly slower than normal; you -+ probably shouldn't select this option unless you're a developer. -+ -+config BCACHEFS_TESTS -+ bool "bcachefs unit and performance tests" -+ depends on BCACHEFS_FS -+ help -+ Include some unit and performance tests for the core btree code -+ -+config BCACHEFS_LOCK_TIME_STATS -+ bool "bcachefs lock time statistics" -+ depends on BCACHEFS_FS -+ help -+ Expose statistics for how long we held a lock in debugfs -+ -+config BCACHEFS_NO_LATENCY_ACCT -+ bool "disable latency accounting and time stats" -+ depends on BCACHEFS_FS -+ help -+ This disables device latency tracking and time stats, only for performance testing -diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile -new file mode 100644 -index 000000000..c87be5fb7 ---- /dev/null -+++ b/fs/bcachefs/Makefile -@@ -0,0 +1,83 @@ -+ -+obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o -+ -+bcachefs-y := \ -+ alloc_background.o \ -+ alloc_foreground.o \ -+ backpointers.o \ -+ bkey.o \ -+ bkey_methods.o \ -+ bkey_sort.o \ -+ bset.o \ -+ btree_cache.o \ -+ btree_gc.o \ -+ btree_io.o \ -+ btree_iter.o \ -+ btree_journal_iter.o \ -+ btree_key_cache.o \ -+ btree_locking.o \ -+ btree_trans_commit.o \ -+ btree_update.o \ -+ btree_update_interior.o \ -+ btree_write_buffer.o \ -+ buckets.o \ -+ buckets_waiting_for_journal.o \ -+ chardev.o \ -+ checksum.o \ -+ clock.o \ -+ compress.o \ -+ counters.o \ -+ debug.o \ -+ dirent.o \ -+ disk_groups.o \ -+ data_update.o \ -+ ec.o \ -+ errcode.o \ -+ error.o \ -+ extents.o \ -+ extent_update.o \ -+ fs.o \ -+ fs-common.o \ -+ fs-ioctl.o \ -+ fs-io.o \ -+ fs-io-buffered.o \ -+ fs-io-direct.o \ -+ fs-io-pagecache.o \ -+ fsck.o \ -+ inode.o \ -+ io.o \ -+ journal.o \ -+ journal_io.o \ -+ journal_reclaim.o \ -+ journal_sb.o \ -+ journal_seq_blacklist.o \ -+ keylist.o \ -+ lru.o \ -+ migrate.o \ -+ move.o \ -+ movinggc.o \ -+ nocow_locking.o \ -+ opts.o \ -+ printbuf.o \ -+ quota.o \ -+ rebalance.o \ -+ recovery.o \ -+ reflink.o \ -+ replicas.o \ -+ sb-clean.o \ -+ sb-members.o \ -+ siphash.o \ -+ six.o \ -+ snapshot.o \ -+ subvolume.o \ -+ super.o \ -+ super-io.o \ -+ sysfs.o \ -+ tests.o \ -+ trace.o \ -+ two_state_shared_lock.o \ -+ util.o \ -+ varint.o \ -+ xattr.o -+ -+bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o -diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c -new file mode 100644 -index 000000000..b1a488860 ---- /dev/null -+++ b/fs/bcachefs/acl.c -@@ -0,0 +1,412 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ -+#include "bcachefs.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include "acl.h" -+#include "fs.h" -+#include "xattr.h" -+ -+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) -+{ -+ return sizeof(bch_acl_header) + -+ sizeof(bch_acl_entry_short) * nr_short + -+ sizeof(bch_acl_entry) * nr_long; -+} -+ -+static inline int 
acl_to_xattr_type(int type) -+{ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; -+ case ACL_TYPE_DEFAULT: -+ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * Convert from filesystem to in-memory representation. -+ */ -+static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, -+ const void *value, size_t size) -+{ -+ const void *p, *end = value + size; -+ struct posix_acl *acl; -+ struct posix_acl_entry *out; -+ unsigned count = 0; -+ int ret; -+ -+ if (!value) -+ return NULL; -+ if (size < sizeof(bch_acl_header)) -+ goto invalid; -+ if (((bch_acl_header *)value)->a_version != -+ cpu_to_le32(BCH_ACL_VERSION)) -+ goto invalid; -+ -+ p = value + sizeof(bch_acl_header); -+ while (p < end) { -+ const bch_acl_entry *entry = p; -+ -+ if (p + sizeof(bch_acl_entry_short) > end) -+ goto invalid; -+ -+ switch (le16_to_cpu(entry->e_tag)) { -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ p += sizeof(bch_acl_entry_short); -+ break; -+ case ACL_USER: -+ case ACL_GROUP: -+ p += sizeof(bch_acl_entry); -+ break; -+ default: -+ goto invalid; -+ } -+ -+ count++; -+ } -+ -+ if (p > end) -+ goto invalid; -+ -+ if (!count) -+ return NULL; -+ -+ acl = allocate_dropping_locks(trans, ret, -+ posix_acl_alloc(count, _gfp)); -+ if (!acl) -+ return ERR_PTR(-ENOMEM); -+ if (ret) { -+ kfree(acl); -+ return ERR_PTR(ret); -+ } -+ -+ out = acl->a_entries; -+ -+ p = value + sizeof(bch_acl_header); -+ while (p < end) { -+ const bch_acl_entry *in = p; -+ -+ out->e_tag = le16_to_cpu(in->e_tag); -+ out->e_perm = le16_to_cpu(in->e_perm); -+ -+ switch (out->e_tag) { -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ p += sizeof(bch_acl_entry_short); -+ break; -+ case ACL_USER: -+ out->e_uid = make_kuid(&init_user_ns, -+ le32_to_cpu(in->e_id)); -+ p += sizeof(bch_acl_entry); -+ break; -+ case ACL_GROUP: -+ out->e_gid = make_kgid(&init_user_ns, -+ le32_to_cpu(in->e_id)); -+ p += sizeof(bch_acl_entry); -+ break; -+ } -+ -+ out++; -+ } -+ -+ BUG_ON(out != acl->a_entries + acl->a_count); -+ -+ return acl; -+invalid: -+ pr_err("invalid acl entry"); -+ return ERR_PTR(-EINVAL); -+} -+ -+#define acl_for_each_entry(acl, acl_e) \ -+ for (acl_e = acl->a_entries; \ -+ acl_e < acl->a_entries + acl->a_count; \ -+ acl_e++) -+ -+/* -+ * Convert from in-memory to filesystem representation. 
-+ */ -+static struct bkey_i_xattr * -+bch2_acl_to_xattr(struct btree_trans *trans, -+ const struct posix_acl *acl, -+ int type) -+{ -+ struct bkey_i_xattr *xattr; -+ bch_acl_header *acl_header; -+ const struct posix_acl_entry *acl_e; -+ void *outptr; -+ unsigned nr_short = 0, nr_long = 0, acl_len, u64s; -+ -+ acl_for_each_entry(acl, acl_e) { -+ switch (acl_e->e_tag) { -+ case ACL_USER: -+ case ACL_GROUP: -+ nr_long++; -+ break; -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ nr_short++; -+ break; -+ default: -+ return ERR_PTR(-EINVAL); -+ } -+ } -+ -+ acl_len = bch2_acl_size(nr_short, nr_long); -+ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); -+ -+ if (u64s > U8_MAX) -+ return ERR_PTR(-E2BIG); -+ -+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(xattr)) -+ return xattr; -+ -+ bkey_xattr_init(&xattr->k_i); -+ xattr->k.u64s = u64s; -+ xattr->v.x_type = acl_to_xattr_type(type); -+ xattr->v.x_name_len = 0; -+ xattr->v.x_val_len = cpu_to_le16(acl_len); -+ -+ acl_header = xattr_val(&xattr->v); -+ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); -+ -+ outptr = (void *) acl_header + sizeof(*acl_header); -+ -+ acl_for_each_entry(acl, acl_e) { -+ bch_acl_entry *entry = outptr; -+ -+ entry->e_tag = cpu_to_le16(acl_e->e_tag); -+ entry->e_perm = cpu_to_le16(acl_e->e_perm); -+ switch (acl_e->e_tag) { -+ case ACL_USER: -+ entry->e_id = cpu_to_le32( -+ from_kuid(&init_user_ns, acl_e->e_uid)); -+ outptr += sizeof(bch_acl_entry); -+ break; -+ case ACL_GROUP: -+ entry->e_id = cpu_to_le32( -+ from_kgid(&init_user_ns, acl_e->e_gid)); -+ outptr += sizeof(bch_acl_entry); -+ break; -+ -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ outptr += sizeof(bch_acl_entry_short); -+ break; -+ } -+ } -+ -+ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); -+ -+ return xattr; -+} -+ -+struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, -+ struct dentry *dentry, int type) -+{ -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); -+ struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); -+ struct btree_trans trans; -+ struct btree_iter iter = { NULL }; -+ struct bkey_s_c_xattr xattr; -+ struct posix_acl *acl = NULL; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, -+ &hash, inode_inum(inode), &search, 0); -+ if (ret) { -+ if (!bch2_err_matches(ret, ENOENT)) -+ acl = ERR_PTR(ret); -+ goto out; -+ } -+ -+ k = bch2_btree_iter_peek_slot(&iter); -+ ret = bkey_err(k); -+ if (ret) { -+ acl = ERR_PTR(ret); -+ goto out; -+ } -+ -+ xattr = bkey_s_c_to_xattr(k); -+ acl = bch2_acl_from_disk(&trans, xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); -+ -+ if (!IS_ERR(acl)) -+ set_cached_acl(&inode->v, type, acl); -+out: -+ if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return acl; -+} -+ -+int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, -+ struct bch_inode_unpacked *inode_u, -+ struct posix_acl *acl, int type) -+{ -+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); -+ int ret; -+ -+ if (type == ACL_TYPE_DEFAULT && -+ !S_ISDIR(inode_u->bi_mode)) -+ return acl ? 
-EACCES : 0; -+ -+ if (acl) { -+ struct bkey_i_xattr *xattr = -+ bch2_acl_to_xattr(trans, acl, type); -+ if (IS_ERR(xattr)) -+ return PTR_ERR(xattr); -+ -+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, -+ inum, &xattr->k_i, 0); -+ } else { -+ struct xattr_search_key search = -+ X_SEARCH(acl_to_xattr_type(type), "", 0); -+ -+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, -+ inum, &search); -+ } -+ -+ return bch2_err_matches(ret, ENOENT) ? 0 : ret; -+} -+ -+int bch2_set_acl(struct mnt_idmap *idmap, -+ struct dentry *dentry, -+ struct posix_acl *_acl, int type) -+{ -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter inode_iter = { NULL }; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *acl; -+ umode_t mode; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ acl = _acl; -+ -+ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), -+ BTREE_ITER_INTENT); -+ if (ret) -+ goto btree_err; -+ -+ mode = inode_u.bi_mode; -+ -+ if (type == ACL_TYPE_ACCESS) { -+ ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); -+ if (ret) -+ goto btree_err; -+ } -+ -+ ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); -+ if (ret) -+ goto btree_err; -+ -+ inode_u.bi_ctime = bch2_current_time(c); -+ inode_u.bi_mode = mode; -+ -+ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, NULL, 0); -+btree_err: -+ bch2_trans_iter_exit(&trans, &inode_iter); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ if (unlikely(ret)) -+ goto err; -+ -+ bch2_inode_update_after_write(&trans, inode, &inode_u, -+ ATTR_CTIME|ATTR_MODE); -+ -+ set_cached_acl(&inode->v, type, acl); -+err: -+ bch2_trans_exit(&trans); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, -+ struct bch_inode_unpacked *inode, -+ umode_t mode, -+ struct posix_acl **new_acl) -+{ -+ struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); -+ struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); -+ struct btree_iter iter; -+ struct bkey_s_c_xattr xattr; -+ struct bkey_i_xattr *new; -+ struct posix_acl *acl; -+ struct bkey_s_c k; -+ int ret; -+ -+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, -+ &hash_info, inum, &search, BTREE_ITER_INTENT); -+ if (ret) -+ return bch2_err_matches(ret, ENOENT) ? 
0 : ret; -+ -+ k = bch2_btree_iter_peek_slot(&iter); -+ xattr = bkey_s_c_to_xattr(k); -+ if (ret) -+ goto err; -+ -+ acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); -+ ret = PTR_ERR_OR_ZERO(acl); -+ if (IS_ERR_OR_NULL(acl)) -+ goto err; -+ -+ ret = allocate_dropping_locks_errcode(trans, -+ __posix_acl_chmod(&acl, _gfp, mode)); -+ if (ret) -+ goto err; -+ -+ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); -+ if (IS_ERR(new)) { -+ ret = PTR_ERR(new); -+ goto err; -+ } -+ -+ new->k.p = iter.pos; -+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0); -+ *new_acl = acl; -+ acl = NULL; -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ if (!IS_ERR_OR_NULL(acl)) -+ kfree(acl); -+ return ret; -+} -+ -+#endif /* CONFIG_BCACHEFS_POSIX_ACL */ -diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h -new file mode 100644 -index 000000000..bb21d8d69 ---- /dev/null -+++ b/fs/bcachefs/acl.h -@@ -0,0 +1,58 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ACL_H -+#define _BCACHEFS_ACL_H -+ -+struct bch_inode_unpacked; -+struct bch_hash_info; -+struct bch_inode_info; -+struct posix_acl; -+ -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ -+#define BCH_ACL_VERSION 0x0001 -+ -+typedef struct { -+ __le16 e_tag; -+ __le16 e_perm; -+ __le32 e_id; -+} bch_acl_entry; -+ -+typedef struct { -+ __le16 e_tag; -+ __le16 e_perm; -+} bch_acl_entry_short; -+ -+typedef struct { -+ __le32 a_version; -+} bch_acl_header; -+ -+struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); -+ -+int bch2_set_acl_trans(struct btree_trans *, subvol_inum, -+ struct bch_inode_unpacked *, -+ struct posix_acl *, int); -+int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); -+int bch2_acl_chmod(struct btree_trans *, subvol_inum, -+ struct bch_inode_unpacked *, -+ umode_t, struct posix_acl **); -+ -+#else -+ -+static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, -+ struct bch_inode_unpacked *inode_u, -+ struct posix_acl *acl, int type) -+{ -+ return 0; -+} -+ -+static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, -+ struct bch_inode_unpacked *inode, -+ umode_t mode, -+ struct posix_acl **new_acl) -+{ -+ return 0; -+} -+ -+#endif /* CONFIG_BCACHEFS_POSIX_ACL */ -+ -+#endif /* _BCACHEFS_ACL_H */ -diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c -new file mode 100644 -index 000000000..540d94c0c ---- /dev/null -+++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,2157 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "backpointers.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_key_cache.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "btree_write_buffer.h" -+#include "buckets.h" -+#include "buckets_waiting_for_journal.h" -+#include "clock.h" -+#include "debug.h" -+#include "ec.h" -+#include "error.h" -+#include "lru.h" -+#include "recovery.h" -+#include "trace.h" -+#include "varint.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* Persistent alloc info: */ -+ -+static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { -+#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, -+ BCH_ALLOC_FIELDS_V1() -+#undef x -+}; -+ -+struct bkey_alloc_unpacked { -+ u64 journal_seq; -+ u8 gen; -+ u8 oldest_gen; -+ u8 data_type; -+ bool need_discard:1; -+ bool need_inc_gen:1; -+#define 
x(_name, _bits) u##_bits _name; -+ BCH_ALLOC_FIELDS_V2() -+#undef x -+}; -+ -+static inline u64 alloc_field_v1_get(const struct bch_alloc *a, -+ const void **p, unsigned field) -+{ -+ unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; -+ u64 v; -+ -+ if (!(a->fields & (1 << field))) -+ return 0; -+ -+ switch (bytes) { -+ case 1: -+ v = *((const u8 *) *p); -+ break; -+ case 2: -+ v = le16_to_cpup(*p); -+ break; -+ case 4: -+ v = le32_to_cpup(*p); -+ break; -+ case 8: -+ v = le64_to_cpup(*p); -+ break; -+ default: -+ BUG(); -+ } -+ -+ *p += bytes; -+ return v; -+} -+ -+static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, -+ struct bkey_s_c k) -+{ -+ const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; -+ const void *d = in->data; -+ unsigned idx = 0; -+ -+ out->gen = in->gen; -+ -+#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); -+ BCH_ALLOC_FIELDS_V1() -+#undef x -+} -+ -+static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); -+ const u8 *in = a.v->data; -+ const u8 *end = bkey_val_end(a); -+ unsigned fieldnr = 0; -+ int ret; -+ u64 v; -+ -+ out->gen = a.v->gen; -+ out->oldest_gen = a.v->oldest_gen; -+ out->data_type = a.v->data_type; -+ -+#define x(_name, _bits) \ -+ if (fieldnr < a.v->nr_fields) { \ -+ ret = bch2_varint_decode_fast(in, end, &v); \ -+ if (ret < 0) \ -+ return ret; \ -+ in += ret; \ -+ } else { \ -+ v = 0; \ -+ } \ -+ out->_name = v; \ -+ if (v != out->_name) \ -+ return -1; \ -+ fieldnr++; -+ -+ BCH_ALLOC_FIELDS_V2() -+#undef x -+ return 0; -+} -+ -+static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); -+ const u8 *in = a.v->data; -+ const u8 *end = bkey_val_end(a); -+ unsigned fieldnr = 0; -+ int ret; -+ u64 v; -+ -+ out->gen = a.v->gen; -+ out->oldest_gen = a.v->oldest_gen; -+ out->data_type = a.v->data_type; -+ out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); -+ out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); -+ out->journal_seq = le64_to_cpu(a.v->journal_seq); -+ -+#define x(_name, _bits) \ -+ if (fieldnr < a.v->nr_fields) { \ -+ ret = bch2_varint_decode_fast(in, end, &v); \ -+ if (ret < 0) \ -+ return ret; \ -+ in += ret; \ -+ } else { \ -+ v = 0; \ -+ } \ -+ out->_name = v; \ -+ if (v != out->_name) \ -+ return -1; \ -+ fieldnr++; -+ -+ BCH_ALLOC_FIELDS_V2() -+#undef x -+ return 0; -+} -+ -+static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) -+{ -+ struct bkey_alloc_unpacked ret = { .gen = 0 }; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_alloc: -+ bch2_alloc_unpack_v1(&ret, k); -+ break; -+ case KEY_TYPE_alloc_v2: -+ bch2_alloc_unpack_v2(&ret, k); -+ break; -+ case KEY_TYPE_alloc_v3: -+ bch2_alloc_unpack_v3(&ret, k); -+ break; -+ } -+ -+ return ret; -+} -+ -+static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) -+{ -+ unsigned i, bytes = offsetof(struct bch_alloc, data); -+ -+ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) -+ if (a->fields & (1 << i)) -+ bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; -+ -+ return DIV_ROUND_UP(bytes, sizeof(u64)); -+} -+ -+int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); -+ -+ /* allow for unknown fields */ -+ if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { -+ prt_printf(err, "incorrect value size (%zu < %u)", -+ bkey_val_u64s(a.k), 
bch_alloc_v1_val_u64s(a.v)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_alloc_unpacked u; -+ -+ if (bch2_alloc_unpack_v2(&u, k)) { -+ prt_printf(err, "unpack error"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_alloc_unpacked u; -+ -+ if (bch2_alloc_unpack_v3(&u, k)) { -+ prt_printf(err, "unpack error"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, struct printbuf *err) -+{ -+ struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); -+ -+ if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { -+ prt_printf(err, "bad val size (%u > %lu)", -+ alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && -+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) { -+ prt_printf(err, "invalid backpointers_start"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { -+ prt_printf(err, "invalid data type (got %u should be %u)", -+ a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ switch (a.v->data_type) { -+ case BCH_DATA_free: -+ case BCH_DATA_need_gc_gens: -+ case BCH_DATA_need_discard: -+ if (a.v->dirty_sectors || -+ a.v->cached_sectors || -+ a.v->stripe) { -+ prt_printf(err, "empty data type free but have data"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ break; -+ case BCH_DATA_sb: -+ case BCH_DATA_journal: -+ case BCH_DATA_btree: -+ case BCH_DATA_user: -+ case BCH_DATA_parity: -+ if (!a.v->dirty_sectors) { -+ prt_printf(err, "data_type %s but dirty_sectors==0", -+ bch2_data_types[a.v->data_type]); -+ return -BCH_ERR_invalid_bkey; -+ } -+ break; -+ case BCH_DATA_cached: -+ if (!a.v->cached_sectors || -+ a.v->dirty_sectors || -+ a.v->stripe) { -+ prt_printf(err, "data type inconsistency"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (!a.v->io_time[READ] && -+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { -+ prt_printf(err, "cached bucket with read_time == 0"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ break; -+ case BCH_DATA_stripe: -+ break; -+ } -+ -+ return 0; -+} -+ -+static inline u64 swab40(u64 x) -+{ -+ return (((x & 0x00000000ffULL) << 32)| -+ ((x & 0x000000ff00ULL) << 16)| -+ ((x & 0x0000ff0000ULL) >> 0)| -+ ((x & 0x00ff000000ULL) >> 16)| -+ ((x & 0xff00000000ULL) >> 32)); -+} -+ -+void bch2_alloc_v4_swab(struct bkey_s k) -+{ -+ struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; -+ struct bch_backpointer *bp, *bps; -+ -+ a->journal_seq = swab64(a->journal_seq); -+ a->flags = swab32(a->flags); -+ a->dirty_sectors = swab32(a->dirty_sectors); -+ a->cached_sectors = swab32(a->cached_sectors); -+ a->io_time[0] = swab64(a->io_time[0]); -+ a->io_time[1] = swab64(a->io_time[1]); -+ a->stripe = swab32(a->stripe); -+ a->nr_external_backpointers = swab32(a->nr_external_backpointers); -+ -+ bps = alloc_v4_backpointers(a); -+ for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { -+ bp->bucket_offset = swab40(bp->bucket_offset); -+ bp->bucket_len = swab32(bp->bucket_len); -+ bch2_bpos_swab(&bp->pos); -+ } -+} -+ -+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, 
struct bkey_s_c k) -+{ -+ struct bch_alloc_v4 _a; -+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); -+ unsigned i; -+ -+ prt_newline(out); -+ printbuf_indent_add(out, 2); -+ -+ prt_printf(out, "gen %u oldest_gen %u data_type %s", -+ a->gen, a->oldest_gen, -+ a->data_type < BCH_DATA_NR -+ ? bch2_data_types[a->data_type] -+ : "(invalid data type)"); -+ prt_newline(out); -+ prt_printf(out, "journal_seq %llu", a->journal_seq); -+ prt_newline(out); -+ prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); -+ prt_newline(out); -+ prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); -+ prt_newline(out); -+ prt_printf(out, "dirty_sectors %u", a->dirty_sectors); -+ prt_newline(out); -+ prt_printf(out, "cached_sectors %u", a->cached_sectors); -+ prt_newline(out); -+ prt_printf(out, "stripe %u", a->stripe); -+ prt_newline(out); -+ prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); -+ prt_newline(out); -+ prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); -+ prt_newline(out); -+ prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); -+ prt_newline(out); -+ prt_printf(out, "fragmentation %llu", a->fragmentation_lru); -+ prt_newline(out); -+ prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); -+ prt_newline(out); -+ -+ if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) { -+ struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k); -+ const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v); -+ -+ prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v)); -+ printbuf_indent_add(out, 2); -+ -+ for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) { -+ prt_newline(out); -+ bch2_backpointer_to_text(out, &bps[i]); -+ } -+ -+ printbuf_indent_sub(out, 2); -+ } -+ -+ printbuf_indent_sub(out, 2); -+} -+ -+void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) -+{ -+ if (k.k->type == KEY_TYPE_alloc_v4) { -+ void *src, *dst; -+ -+ *out = *bkey_s_c_to_alloc_v4(k).v; -+ -+ src = alloc_v4_backpointers(out); -+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); -+ dst = alloc_v4_backpointers(out); -+ -+ if (src < dst) -+ memset(src, 0, dst - src); -+ -+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); -+ } else { -+ struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); -+ -+ *out = (struct bch_alloc_v4) { -+ .journal_seq = u.journal_seq, -+ .flags = u.need_discard, -+ .gen = u.gen, -+ .oldest_gen = u.oldest_gen, -+ .data_type = u.data_type, -+ .stripe_redundancy = u.stripe_redundancy, -+ .dirty_sectors = u.dirty_sectors, -+ .cached_sectors = u.cached_sectors, -+ .io_time[READ] = u.read_time, -+ .io_time[WRITE] = u.write_time, -+ .stripe = u.stripe, -+ }; -+ -+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); -+ } -+} -+ -+static noinline struct bkey_i_alloc_v4 * -+__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) -+{ -+ struct bkey_i_alloc_v4 *ret; -+ -+ ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); -+ if (IS_ERR(ret)) -+ return ret; -+ -+ if (k.k->type == KEY_TYPE_alloc_v4) { -+ void *src, *dst; -+ -+ bkey_reassemble(&ret->k_i, k); -+ -+ src = alloc_v4_backpointers(&ret->v); -+ SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); -+ dst = alloc_v4_backpointers(&ret->v); -+ -+ if (src < dst) -+ memset(src, 0, dst - src); -+ -+ SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); -+ set_alloc_v4_u64s(ret); -+ } else { -+ bkey_alloc_v4_init(&ret->k_i); -+ ret->k.p = k.k->p; -+ bch2_alloc_to_v4(k, &ret->v); -+ } -+ return ret; -+} 
-+ -+static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) -+{ -+ struct bkey_s_c_alloc_v4 a; -+ -+ if (likely(k.k->type == KEY_TYPE_alloc_v4) && -+ ((a = bkey_s_c_to_alloc_v4(k), true) && -+ BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) -+ return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); -+ -+ return __bch2_alloc_to_v4_mut(trans, k); -+} -+ -+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) -+{ -+ return bch2_alloc_to_v4_mut_inlined(trans, k); -+} -+ -+struct bkey_i_alloc_v4 * -+bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos pos) -+{ -+ struct bkey_s_c k; -+ struct bkey_i_alloc_v4 *a; -+ int ret; -+ -+ k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, -+ BTREE_ITER_WITH_UPDATES| -+ BTREE_ITER_CACHED| -+ BTREE_ITER_INTENT); -+ ret = bkey_err(k); -+ if (unlikely(ret)) -+ return ERR_PTR(ret); -+ -+ a = bch2_alloc_to_v4_mut_inlined(trans, k); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (unlikely(ret)) -+ goto err; -+ return a; -+err: -+ bch2_trans_iter_exit(trans, iter); -+ return ERR_PTR(ret); -+} -+ -+static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) -+{ -+ *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; -+ -+ pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; -+ return pos; -+} -+ -+static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) -+{ -+ pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; -+ pos.offset += offset; -+ return pos; -+} -+ -+static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) -+{ -+ return k.k->type == KEY_TYPE_bucket_gens -+ ? bkey_s_c_to_bucket_gens(k).v->gens[offset] -+ : 0; -+} -+ -+int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { -+ prt_printf(err, "bad val size (%lu != %zu)", -+ bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { -+ if (i) -+ prt_char(out, ' '); -+ prt_printf(out, "%u", g.v->gens[i]); -+ } -+} -+ -+int bch2_bucket_gens_init(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_alloc_v4 a; -+ struct bkey_i_bucket_gens g; -+ bool have_bucket_gens_key = false; -+ unsigned offset; -+ struct bpos pos; -+ u8 gen; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ /* -+ * Not a fsck error because this is checked/repaired by -+ * bch2_check_alloc_key() which runs later: -+ */ -+ if (!bch2_dev_bucket_exists(c, k.k->p)) -+ continue; -+ -+ gen = bch2_alloc_to_v4(k, &a)->gen; -+ pos = alloc_gens_pos(iter.pos, &offset); -+ -+ if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { -+ ret = commit_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); -+ if (ret) -+ break; -+ have_bucket_gens_key = false; -+ } -+ -+ if (!have_bucket_gens_key) { -+ bkey_bucket_gens_init(&g.k_i); -+ g.k.p = pos; -+ have_bucket_gens_key = true; -+ } -+ -+ g.v.gens[offset] = gen; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ 
-+ if (have_bucket_gens_key && !ret) -+ ret = commit_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); -+ -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+int bch2_alloc_read(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_dev *ca; -+ int ret; -+ -+ down_read(&c->gc_lock); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { -+ const struct bch_bucket_gens *g; -+ u64 b; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; -+ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; -+ -+ if (k.k->type != KEY_TYPE_bucket_gens) -+ continue; -+ -+ g = bkey_s_c_to_bucket_gens(k).v; -+ -+ /* -+ * Not a fsck error because this is checked/repaired by -+ * bch2_check_alloc_key() which runs later: -+ */ -+ if (!bch2_dev_exists2(c, k.k->p.inode)) -+ continue; -+ -+ ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ -+ for (b = max_t(u64, ca->mi.first_bucket, start); -+ b < min_t(u64, ca->mi.nbuckets, end); -+ b++) -+ *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ } else { -+ struct bch_alloc_v4 a; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ /* -+ * Not a fsck error because this is checked/repaired by -+ * bch2_check_alloc_key() which runs later: -+ */ -+ if (!bch2_dev_bucket_exists(c, k.k->p)) -+ continue; -+ -+ ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ -+ *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ } -+ -+ bch2_trans_exit(&trans); -+ up_read(&c->gc_lock); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ -+ return ret; -+} -+ -+/* Free space/discard btree: */ -+ -+static int bch2_bucket_do_index(struct btree_trans *trans, -+ struct bkey_s_c alloc_k, -+ const struct bch_alloc_v4 *a, -+ bool set) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); -+ struct btree_iter iter; -+ struct bkey_s_c old; -+ struct bkey_i *k; -+ enum btree_id btree; -+ enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; -+ enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; -+ struct printbuf buf = PRINTBUF; -+ int ret; -+ -+ if (a->data_type != BCH_DATA_free && -+ a->data_type != BCH_DATA_need_discard) -+ return 0; -+ -+ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); -+ if (IS_ERR(k)) -+ return PTR_ERR(k); -+ -+ bkey_init(&k->k); -+ k->k.type = new_type; -+ -+ switch (a->data_type) { -+ case BCH_DATA_free: -+ btree = BTREE_ID_freespace; -+ k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); -+ bch2_key_resize(&k->k, 1); -+ break; -+ case BCH_DATA_need_discard: -+ btree = BTREE_ID_need_discard; -+ k->k.p = alloc_k.k->p; -+ break; -+ default: -+ return 0; -+ } -+ -+ old = bch2_bkey_get_iter(trans, &iter, btree, -+ bkey_start_pos(&k->k), -+ BTREE_ITER_INTENT); -+ ret = bkey_err(old); -+ if (ret) -+ return ret; -+ -+ if (ca->mi.freespace_initialized && -+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && -+ bch2_trans_inconsistent_on(old.k->type != old_type, trans, -+ "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" -+ " for %s", -+ set ? 
"setting" : "clearing", -+ bch2_btree_ids[btree], -+ iter.pos.inode, -+ iter.pos.offset, -+ bch2_bkey_types[old.k->type], -+ bch2_bkey_types[old_type], -+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { -+ ret = -EIO; -+ goto err; -+ } -+ -+ ret = bch2_trans_update(trans, &iter, k, 0); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static noinline int bch2_bucket_gen_update(struct btree_trans *trans, -+ struct bpos bucket, u8 gen) -+{ -+ struct btree_iter iter; -+ unsigned offset; -+ struct bpos pos = alloc_gens_pos(bucket, &offset); -+ struct bkey_i_bucket_gens *g; -+ struct bkey_s_c k; -+ int ret; -+ -+ g = bch2_trans_kmalloc(trans, sizeof(*g)); -+ ret = PTR_ERR_OR_ZERO(g); -+ if (ret) -+ return ret; -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_WITH_UPDATES); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_bucket_gens) { -+ bkey_bucket_gens_init(&g->k_i); -+ g->k.p = iter.pos; -+ } else { -+ bkey_reassemble(&g->k_i, k); -+ } -+ -+ g->v.gens[offset] = gen; -+ -+ ret = bch2_trans_update(trans, &iter, &g->k_i, 0); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_trans_mark_alloc(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_i *new, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_alloc_v4 old_a_convert, *new_a; -+ const struct bch_alloc_v4 *old_a; -+ u64 old_lru, new_lru; -+ int ret = 0; -+ -+ /* -+ * Deletion only happens in the device removal path, with -+ * BTREE_TRIGGER_NORUN: -+ */ -+ BUG_ON(new->k.type != KEY_TYPE_alloc_v4); -+ -+ old_a = bch2_alloc_to_v4(old, &old_a_convert); -+ new_a = &bkey_i_to_alloc_v4(new)->v; -+ -+ new_a->data_type = alloc_data_type(*new_a, new_a->data_type); -+ -+ if (new_a->dirty_sectors > old_a->dirty_sectors || -+ new_a->cached_sectors > old_a->cached_sectors) { -+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); -+ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); -+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); -+ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); -+ } -+ -+ if (data_type_is_empty(new_a->data_type) && -+ BCH_ALLOC_V4_NEED_INC_GEN(new_a) && -+ !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { -+ new_a->gen++; -+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); -+ } -+ -+ if (old_a->data_type != new_a->data_type || -+ (new_a->data_type == BCH_DATA_free && -+ alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { -+ ret = bch2_bucket_do_index(trans, old, old_a, false) ?: -+ bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); -+ if (ret) -+ return ret; -+ } -+ -+ if (new_a->data_type == BCH_DATA_cached && -+ !new_a->io_time[READ]) -+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); -+ -+ old_lru = alloc_lru_idx_read(*old_a); -+ new_lru = alloc_lru_idx_read(*new_a); -+ -+ if (old_lru != new_lru) { -+ ret = bch2_lru_change(trans, new->k.p.inode, -+ bucket_to_u64(new->k.p), -+ old_lru, new_lru); -+ if (ret) -+ return ret; -+ } -+ -+ new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, -+ bch_dev_bkey_exists(c, new->k.p.inode)); -+ -+ if (old_a->fragmentation_lru != new_a->fragmentation_lru) { -+ ret = bch2_lru_change(trans, -+ BCH_LRU_FRAGMENTATION_START, -+ bucket_to_u64(new->k.p), -+ old_a->fragmentation_lru, new_a->fragmentation_lru); -+ if (ret) -+ return ret; -+ } -+ -+ if 
(old_a->gen != new_a->gen) { -+ ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* -+ * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for -+ * extents style btrees, but works on non-extents btrees: -+ */ -+static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) -+{ -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -+ -+ if (bkey_err(k)) -+ return k; -+ -+ if (k.k->type) { -+ return k; -+ } else { -+ struct btree_iter iter2; -+ struct bpos next; -+ -+ bch2_trans_copy_iter(&iter2, iter); -+ -+ if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX)) -+ end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p)); -+ -+ end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); -+ -+ /* -+ * btree node min/max is a closed interval, upto takes a half -+ * open interval: -+ */ -+ k = bch2_btree_iter_peek_upto(&iter2, end); -+ next = iter2.pos; -+ bch2_trans_iter_exit(iter->trans, &iter2); -+ -+ BUG_ON(next.offset >= iter->pos.offset + U32_MAX); -+ -+ if (bkey_err(k)) -+ return k; -+ -+ bkey_init(hole); -+ hole->p = iter->pos; -+ -+ bch2_key_resize(hole, next.offset - iter->pos.offset); -+ return (struct bkey_s_c) { hole, NULL }; -+ } -+} -+ -+static bool next_bucket(struct bch_fs *c, struct bpos *bucket) -+{ -+ struct bch_dev *ca; -+ unsigned iter; -+ -+ if (bch2_dev_bucket_exists(c, *bucket)) -+ return true; -+ -+ if (bch2_dev_exists2(c, bucket->inode)) { -+ ca = bch_dev_bkey_exists(c, bucket->inode); -+ -+ if (bucket->offset < ca->mi.first_bucket) { -+ bucket->offset = ca->mi.first_bucket; -+ return true; -+ } -+ -+ bucket->inode++; -+ bucket->offset = 0; -+ } -+ -+ rcu_read_lock(); -+ iter = bucket->inode; -+ ca = __bch2_next_dev(c, &iter, NULL); -+ if (ca) -+ *bucket = POS(ca->dev_idx, ca->mi.first_bucket); -+ rcu_read_unlock(); -+ -+ return ca != NULL; -+} -+ -+static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) -+{ -+ struct bch_fs *c = iter->trans->c; -+ struct bkey_s_c k; -+again: -+ k = bch2_get_key_or_hole(iter, POS_MAX, hole); -+ if (bkey_err(k)) -+ return k; -+ -+ if (!k.k->type) { -+ struct bpos bucket = bkey_start_pos(k.k); -+ -+ if (!bch2_dev_bucket_exists(c, bucket)) { -+ if (!next_bucket(c, &bucket)) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, bucket); -+ goto again; -+ } -+ -+ if (!bch2_dev_bucket_exists(c, k.k->p)) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); -+ -+ bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset); -+ } -+ } -+ -+ return k; -+} -+ -+static noinline_for_stack -+int bch2_check_alloc_key(struct btree_trans *trans, -+ struct bkey_s_c alloc_k, -+ struct btree_iter *alloc_iter, -+ struct btree_iter *discard_iter, -+ struct btree_iter *freespace_iter, -+ struct btree_iter *bucket_gens_iter) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca; -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a; -+ unsigned discard_key_type, freespace_key_type; -+ unsigned gens_offset; -+ struct bkey_s_c k; -+ struct printbuf buf = PRINTBUF; -+ int ret; -+ -+ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, -+ "alloc key for invalid device:bucket %llu:%llu", -+ alloc_k.k->p.inode, alloc_k.k->p.offset)) -+ return bch2_btree_delete_at(trans, alloc_iter, 0); -+ -+ ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); -+ if (!ca->mi.freespace_initialized) -+ return 0; -+ -+ a = bch2_alloc_to_v4(alloc_k, 
&a_convert); -+ -+ discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; -+ bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); -+ k = bch2_btree_iter_peek_slot(discard_iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (k.k->type != discard_key_type && -+ (c->opts.reconstruct_alloc || -+ fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n" -+ " %s", -+ bch2_bkey_types[k.k->type], -+ bch2_bkey_types[discard_key_type], -+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { -+ struct bkey_i *update = -+ bch2_trans_kmalloc(trans, sizeof(*update)); -+ -+ ret = PTR_ERR_OR_ZERO(update); -+ if (ret) -+ goto err; -+ -+ bkey_init(&update->k); -+ update->k.type = discard_key_type; -+ update->k.p = discard_iter->pos; -+ -+ ret = bch2_trans_update(trans, discard_iter, update, 0); -+ if (ret) -+ goto err; -+ } -+ -+ freespace_key_type = a->data_type == BCH_DATA_free ? KEY_TYPE_set : 0; -+ bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); -+ k = bch2_btree_iter_peek_slot(freespace_iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (k.k->type != freespace_key_type && -+ (c->opts.reconstruct_alloc || -+ fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n" -+ " %s", -+ bch2_bkey_types[k.k->type], -+ bch2_bkey_types[freespace_key_type], -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { -+ struct bkey_i *update = -+ bch2_trans_kmalloc(trans, sizeof(*update)); -+ -+ ret = PTR_ERR_OR_ZERO(update); -+ if (ret) -+ goto err; -+ -+ bkey_init(&update->k); -+ update->k.type = freespace_key_type; -+ update->k.p = freespace_iter->pos; -+ bch2_key_resize(&update->k, 1); -+ -+ ret = bch2_trans_update(trans, freespace_iter, update, 0); -+ if (ret) -+ goto err; -+ } -+ -+ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); -+ k = bch2_btree_iter_peek_slot(bucket_gens_iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (a->gen != alloc_gen(k, gens_offset) && -+ (c->opts.reconstruct_alloc || -+ fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n" -+ " %s", -+ alloc_gen(k, gens_offset), a->gen, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { -+ struct bkey_i_bucket_gens *g = -+ bch2_trans_kmalloc(trans, sizeof(*g)); -+ -+ ret = PTR_ERR_OR_ZERO(g); -+ if (ret) -+ goto err; -+ -+ if (k.k->type == KEY_TYPE_bucket_gens) { -+ bkey_reassemble(&g->k_i, k); -+ } else { -+ bkey_bucket_gens_init(&g->k_i); -+ g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); -+ } -+ -+ g->v.gens[gens_offset] = a->gen; -+ -+ ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); -+ if (ret) -+ goto err; -+ } -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static noinline_for_stack -+int bch2_check_alloc_hole_freespace(struct btree_trans *trans, -+ struct bpos start, -+ struct bpos *end, -+ struct btree_iter *freespace_iter) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca; -+ struct bkey_s_c k; -+ struct printbuf buf = PRINTBUF; -+ int ret; -+ -+ ca = bch_dev_bkey_exists(c, start.inode); -+ if (!ca->mi.freespace_initialized) -+ return 0; -+ -+ bch2_btree_iter_set_pos(freespace_iter, start); -+ -+ k = bch2_btree_iter_peek_slot(freespace_iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ *end = bkey_min(k.k->p, *end); -+ -+ if (k.k->type != KEY_TYPE_set && -+ (c->opts.reconstruct_alloc || -+ fsck_err(c, "hole in alloc btree missing in freespace btree\n" -+ 
" device %llu buckets %llu-%llu", -+ freespace_iter->pos.inode, -+ freespace_iter->pos.offset, -+ end->offset))) { -+ struct bkey_i *update = -+ bch2_trans_kmalloc(trans, sizeof(*update)); -+ -+ ret = PTR_ERR_OR_ZERO(update); -+ if (ret) -+ goto err; -+ -+ bkey_init(&update->k); -+ update->k.type = KEY_TYPE_set; -+ update->k.p = freespace_iter->pos; -+ bch2_key_resize(&update->k, -+ min_t(u64, U32_MAX, end->offset - -+ freespace_iter->pos.offset)); -+ -+ ret = bch2_trans_update(trans, freespace_iter, update, 0); -+ if (ret) -+ goto err; -+ } -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static noinline_for_stack -+int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, -+ struct bpos start, -+ struct bpos *end, -+ struct btree_iter *bucket_gens_iter) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k; -+ struct printbuf buf = PRINTBUF; -+ unsigned i, gens_offset, gens_end_offset; -+ int ret; -+ -+ if (c->sb.version < bcachefs_metadata_version_bucket_gens) -+ return 0; -+ -+ bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); -+ -+ k = bch2_btree_iter_peek_slot(bucket_gens_iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (bkey_cmp(alloc_gens_pos(start, &gens_offset), -+ alloc_gens_pos(*end, &gens_end_offset))) -+ gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; -+ -+ if (k.k->type == KEY_TYPE_bucket_gens) { -+ struct bkey_i_bucket_gens g; -+ bool need_update = false; -+ -+ bkey_reassemble(&g.k_i, k); -+ -+ for (i = gens_offset; i < gens_end_offset; i++) { -+ if (fsck_err_on(g.v.gens[i], c, -+ "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", -+ bucket_gens_pos_to_alloc(k.k->p, i).inode, -+ bucket_gens_pos_to_alloc(k.k->p, i).offset, -+ g.v.gens[i])) { -+ g.v.gens[i] = 0; -+ need_update = true; -+ } -+ } -+ -+ if (need_update) { -+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g)); -+ -+ ret = PTR_ERR_OR_ZERO(k); -+ if (ret) -+ goto err; -+ -+ memcpy(k, &g, sizeof(g)); -+ -+ ret = bch2_trans_update(trans, bucket_gens_iter, k, 0); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans, -+ struct btree_iter *iter) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter alloc_iter; -+ struct bkey_s_c alloc_k; -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a; -+ u64 genbits; -+ struct bpos pos; -+ enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard -+ ? 
BCH_DATA_need_discard -+ : BCH_DATA_free; -+ struct printbuf buf = PRINTBUF; -+ int ret; -+ -+ pos = iter->pos; -+ pos.offset &= ~(~0ULL << 56); -+ genbits = iter->pos.offset & (~0ULL << 56); -+ -+ alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); -+ ret = bkey_err(alloc_k); -+ if (ret) -+ return ret; -+ -+ if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, -+ "entry in %s btree for nonexistant dev:bucket %llu:%llu", -+ bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) -+ goto delete; -+ -+ a = bch2_alloc_to_v4(alloc_k, &a_convert); -+ -+ if (fsck_err_on(a->data_type != state || -+ (state == BCH_DATA_free && -+ genbits != alloc_freespace_genbits(*a)), c, -+ "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", -+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), -+ bch2_btree_ids[iter->btree_id], -+ iter->pos.inode, -+ iter->pos.offset, -+ a->data_type == state, -+ genbits >> 56, alloc_freespace_genbits(*a) >> 56)) -+ goto delete; -+out: -+fsck_err: -+ set_btree_iter_dontneed(&alloc_iter); -+ bch2_trans_iter_exit(trans, &alloc_iter); -+ printbuf_exit(&buf); -+ return ret; -+delete: -+ ret = bch2_btree_delete_extent_at(trans, iter, -+ iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW); -+ goto out; -+} -+ -+static int bch2_check_discard_freespace_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos end) -+{ -+ if (!btree_id_is_extents(iter->btree_id)) { -+ return __bch2_check_discard_freespace_key(trans, iter); -+ } else { -+ int ret; -+ -+ while (!bkey_eq(iter->pos, end) && -+ !(ret = btree_trans_too_many_iters(trans) ?: -+ __bch2_check_discard_freespace_key(trans, iter))) -+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); -+ -+ return ret; -+ } -+} -+ -+/* -+ * We've already checked that generation numbers in the bucket_gens btree are -+ * valid for buckets that exist; this just checks for keys for nonexistent -+ * buckets. 
-+ */ -+static noinline_for_stack -+int bch2_check_bucket_gens_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i_bucket_gens g; -+ struct bch_dev *ca; -+ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; -+ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; -+ u64 b; -+ bool need_update = false, dev_exists; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ BUG_ON(k.k->type != KEY_TYPE_bucket_gens); -+ bkey_reassemble(&g.k_i, k); -+ -+ /* if no bch_dev, skip out whether we repair or not */ -+ dev_exists = bch2_dev_exists2(c, k.k->p.inode); -+ if (!dev_exists) { -+ if (fsck_err_on(!dev_exists, c, -+ "bucket_gens key for invalid device:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ ret = bch2_btree_delete_at(trans, iter, 0); -+ } -+ goto out; -+ } -+ -+ ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ if (fsck_err_on(end <= ca->mi.first_bucket || -+ start >= ca->mi.nbuckets, c, -+ "bucket_gens key for invalid buckets:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ ret = bch2_btree_delete_at(trans, iter, 0); -+ goto out; -+ } -+ -+ for (b = start; b < ca->mi.first_bucket; b++) -+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, -+ "bucket_gens key has nonzero gen for invalid bucket")) { -+ g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; -+ need_update = true; -+ } -+ -+ for (b = ca->mi.nbuckets; b < end; b++) -+ if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, -+ "bucket_gens key has nonzero gen for invalid bucket")) { -+ g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; -+ need_update = true; -+ } -+ -+ if (need_update) { -+ struct bkey_i *k; -+ -+ k = bch2_trans_kmalloc(trans, sizeof(g)); -+ ret = PTR_ERR_OR_ZERO(k); -+ if (ret) -+ goto out; -+ -+ memcpy(k, &g, sizeof(g)); -+ ret = bch2_trans_update(trans, iter, k, 0); -+ } -+out: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+int bch2_check_alloc_info(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; -+ struct bkey hole; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ -+ while (1) { -+ struct bpos next; -+ -+ bch2_trans_begin(&trans); -+ -+ k = bch2_get_key_or_real_bucket_hole(&iter, &hole); -+ ret = bkey_err(k); -+ if (ret) -+ goto bkey_err; -+ -+ if (!k.k) -+ break; -+ -+ if (k.k->type) { -+ next = bpos_nosnap_successor(k.k->p); -+ -+ ret = bch2_check_alloc_key(&trans, -+ k, &iter, -+ &discard_iter, -+ &freespace_iter, -+ &bucket_gens_iter); -+ if (ret) -+ goto bkey_err; -+ } else { -+ next = k.k->p; -+ -+ ret = bch2_check_alloc_hole_freespace(&trans, -+ bkey_start_pos(k.k), -+ &next, -+ &freespace_iter) ?: -+ bch2_check_alloc_hole_bucket_gens(&trans, -+ bkey_start_pos(k.k), -+ &next, -+ &bucket_gens_iter); -+ if (ret) -+ goto bkey_err; -+ } -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW); -+ if (ret) -+ goto bkey_err; -+ -+ bch2_btree_iter_set_pos(&iter, next); -+bkey_err: -+ if (bch2_err_matches(ret, 
BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &bucket_gens_iter); -+ bch2_trans_iter_exit(&trans, &freespace_iter); -+ bch2_trans_iter_exit(&trans, &discard_iter); -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret < 0) -+ goto err; -+ -+ ret = for_each_btree_key2(&trans, iter, -+ BTREE_ID_need_discard, POS_MIN, -+ BTREE_ITER_PREFETCH, k, -+ bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: -+ for_each_btree_key2(&trans, iter, -+ BTREE_ID_freespace, POS_MIN, -+ BTREE_ITER_PREFETCH, k, -+ bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: -+ for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_bucket_gens, POS_MIN, -+ BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, -+ bch2_check_bucket_gens_key(&trans, &iter, k)); -+err: -+ bch2_trans_exit(&trans); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, -+ struct btree_iter *alloc_iter) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter lru_iter; -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a; -+ struct bkey_s_c alloc_k, lru_k; -+ struct printbuf buf = PRINTBUF; -+ int ret; -+ -+ alloc_k = bch2_btree_iter_peek(alloc_iter); -+ if (!alloc_k.k) -+ return 0; -+ -+ ret = bkey_err(alloc_k); -+ if (ret) -+ return ret; -+ -+ a = bch2_alloc_to_v4(alloc_k, &a_convert); -+ -+ if (a->data_type != BCH_DATA_cached) -+ return 0; -+ -+ lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, -+ lru_pos(alloc_k.k->p.inode, -+ bucket_to_u64(alloc_k.k->p), -+ a->io_time[READ]), 0); -+ ret = bkey_err(lru_k); -+ if (ret) -+ return ret; -+ -+ if (fsck_err_on(!a->io_time[READ], c, -+ "cached bucket with read_time 0\n" -+ " %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || -+ fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, -+ "missing lru entry\n" -+ " %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { -+ u64 read_time = a->io_time[READ] ?: -+ atomic64_read(&c->io_clock[READ].now); -+ -+ ret = bch2_lru_set(trans, -+ alloc_k.k->p.inode, -+ bucket_to_u64(alloc_k.k->p), -+ read_time); -+ if (ret) -+ goto err; -+ -+ if (a->io_time[READ] != read_time) { -+ struct bkey_i_alloc_v4 *a_mut = -+ bch2_alloc_to_v4_mut(trans, alloc_k); -+ ret = PTR_ERR_OR_ZERO(a_mut); -+ if (ret) -+ goto err; -+ -+ a_mut->v.io_time[READ] = read_time; -+ ret = bch2_trans_update(trans, alloc_iter, -+ &a_mut->k_i, BTREE_TRIGGER_NORUN); -+ if (ret) -+ goto err; -+ } -+ } -+err: -+fsck_err: -+ bch2_trans_iter_exit(trans, &lru_iter); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+int bch2_check_alloc_to_lru_refs(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ ret = bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, -+ POS_MIN, BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, -+ bch2_check_alloc_to_lru_ref(&trans, &iter))); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int bch2_discard_one_bucket(struct btree_trans *trans, -+ struct btree_iter *need_discard_iter, -+ struct bpos *discard_pos_done, -+ u64 *seen, -+ u64 *open, -+ u64 *need_journal_commit, -+ u64 *discarded) -+{ -+ struct bch_fs *c = trans->c; -+ struct bpos pos = need_discard_iter->pos; -+ struct btree_iter iter = { NULL }; -+ struct bkey_s_c k; -+ struct bch_dev *ca; -+ struct bkey_i_alloc_v4 *a; -+ struct printbuf buf = PRINTBUF; -+ int 
ret = 0; -+ -+ ca = bch_dev_bkey_exists(c, pos.inode); -+ if (!percpu_ref_tryget(&ca->io_ref)) { -+ bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); -+ return 0; -+ } -+ -+ if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { -+ (*open)++; -+ goto out; -+ } -+ -+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, -+ c->journal.flushed_seq_ondisk, -+ pos.inode, pos.offset)) { -+ (*need_journal_commit)++; -+ goto out; -+ } -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, -+ need_discard_iter->pos, -+ BTREE_ITER_CACHED); -+ ret = bkey_err(k); -+ if (ret) -+ goto out; -+ -+ a = bch2_alloc_to_v4_mut(trans, k); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ goto out; -+ -+ if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { -+ a->v.gen++; -+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); -+ goto write; -+ } -+ -+ if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { -+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { -+ bch2_trans_inconsistent(trans, -+ "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" -+ "%s", -+ a->v.journal_seq, -+ c->journal.flushed_seq_ondisk, -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ ret = -EIO; -+ } -+ goto out; -+ } -+ -+ if (a->v.data_type != BCH_DATA_need_discard) { -+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { -+ bch2_trans_inconsistent(trans, -+ "bucket incorrectly set in need_discard btree\n" -+ "%s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ ret = -EIO; -+ } -+ -+ goto out; -+ } -+ -+ if (!bkey_eq(*discard_pos_done, iter.pos) && -+ ca->mi.discard && !c->opts.nochanges) { -+ /* -+ * This works without any other locks because this is the only -+ * thread that removes items from the need_discard tree -+ */ -+ bch2_trans_unlock(trans); -+ blkdev_issue_discard(ca->disk_sb.bdev, -+ k.k->p.offset * ca->mi.bucket_size, -+ ca->mi.bucket_size, -+ GFP_KERNEL); -+ *discard_pos_done = iter.pos; -+ -+ ret = bch2_trans_relock_notrace(trans); -+ if (ret) -+ goto out; -+ } -+ -+ SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); -+ a->v.data_type = alloc_data_type(a->v, a->v.data_type); -+write: -+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BCH_WATERMARK_btree| -+ BTREE_INSERT_NOFAIL); -+ if (ret) -+ goto out; -+ -+ this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); -+ (*discarded)++; -+out: -+ (*seen)++; -+ bch2_trans_iter_exit(trans, &iter); -+ percpu_ref_put(&ca->io_ref); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static void bch2_do_discards_work(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, discard_work); -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; -+ struct bpos discard_pos_done = POS_MAX; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ /* -+ * We're doing the commit in bch2_discard_one_bucket instead of using -+ * for_each_btree_key_commit() so that we can increment counters after -+ * successful commit: -+ */ -+ ret = for_each_btree_key2(&trans, iter, -+ BTREE_ID_need_discard, POS_MIN, 0, k, -+ bch2_discard_one_bucket(&trans, &iter, &discard_pos_done, -+ &seen, -+ &open, -+ &need_journal_commit, -+ &discarded)); -+ -+ bch2_trans_exit(&trans); -+ -+ if (need_journal_commit * 2 > seen) -+ bch2_journal_flush_async(&c->journal, NULL); -+ -+ bch2_write_ref_put(c, BCH_WRITE_REF_discard); -+ -+ trace_discard_buckets(c, seen, open, need_journal_commit, discarded, -+ 
bch2_err_str(ret)); -+} -+ -+void bch2_do_discards(struct bch_fs *c) -+{ -+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && -+ !queue_work(c->write_ref_wq, &c->discard_work)) -+ bch2_write_ref_put(c, BCH_WRITE_REF_discard); -+} -+ -+static int invalidate_one_bucket(struct btree_trans *trans, -+ struct btree_iter *lru_iter, -+ struct bkey_s_c lru_k, -+ s64 *nr_to_invalidate) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter alloc_iter = { NULL }; -+ struct bkey_i_alloc_v4 *a = NULL; -+ struct printbuf buf = PRINTBUF; -+ struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); -+ unsigned cached_sectors; -+ int ret = 0; -+ -+ if (*nr_to_invalidate <= 0) -+ return 1; -+ -+ if (!bch2_dev_bucket_exists(c, bucket)) { -+ prt_str(&buf, "lru entry points to invalid bucket"); -+ goto err; -+ } -+ -+ if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) -+ return 0; -+ -+ a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ goto out; -+ -+ /* We expect harmless races here due to the btree write buffer: */ -+ if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) -+ goto out; -+ -+ BUG_ON(a->v.data_type != BCH_DATA_cached); -+ -+ if (!a->v.cached_sectors) -+ bch_err(c, "invalidating empty bucket, confused"); -+ -+ cached_sectors = a->v.cached_sectors; -+ -+ SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); -+ a->v.gen++; -+ a->v.data_type = 0; -+ a->v.dirty_sectors = 0; -+ a->v.cached_sectors = 0; -+ a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); -+ a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); -+ -+ ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, -+ BTREE_TRIGGER_BUCKET_INVALIDATE) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BCH_WATERMARK_btree| -+ BTREE_INSERT_NOFAIL); -+ if (ret) -+ goto out; -+ -+ trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); -+ --*nr_to_invalidate; -+out: -+ bch2_trans_iter_exit(trans, &alloc_iter); -+ printbuf_exit(&buf); -+ return ret; -+err: -+ prt_str(&buf, "\n lru key: "); -+ bch2_bkey_val_to_text(&buf, c, lru_k); -+ -+ prt_str(&buf, "\n lru entry: "); -+ bch2_lru_pos_to_text(&buf, lru_iter->pos); -+ -+ prt_str(&buf, "\n alloc key: "); -+ if (!a) -+ bch2_bpos_to_text(&buf, bucket); -+ else -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); -+ -+ bch_err(c, "%s", buf.buf); -+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { -+ bch2_inconsistent_error(c); -+ ret = -EINVAL; -+ } -+ -+ goto out; -+} -+ -+static void bch2_do_invalidates_work(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); -+ struct bch_dev *ca; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ unsigned i; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ ret = bch2_btree_write_buffer_flush(&trans); -+ if (ret) -+ goto err; -+ -+ for_each_member_device(ca, c, i) { -+ s64 nr_to_invalidate = -+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); -+ -+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru, -+ lru_pos(ca->dev_idx, 0, 0), -+ lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), -+ BTREE_ITER_INTENT, k, -+ invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate)); -+ -+ if (ret < 0) { -+ percpu_ref_put(&ca->ref); -+ break; -+ } -+ } -+err: -+ bch2_trans_exit(&trans); -+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); -+} -+ -+void bch2_do_invalidates(struct bch_fs *c) -+{ -+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && -+ 
!queue_work(c->write_ref_wq, &c->invalidate_work)) -+ bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); -+} -+ -+static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, -+ unsigned long *last_updated) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey hole; -+ struct bpos end = POS(ca->dev_idx, ca->mi.nbuckets); -+ struct bch_member *m; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, -+ POS(ca->dev_idx, ca->mi.first_bucket), -+ BTREE_ITER_PREFETCH); -+ /* -+ * Scan the alloc btree for every bucket on @ca, and add buckets to the -+ * freespace/need_discard/need_gc_gens btrees as needed: -+ */ -+ while (1) { -+ if (*last_updated + HZ * 10 < jiffies) { -+ bch_info(ca, "%s: currently at %llu/%llu", -+ __func__, iter.pos.offset, ca->mi.nbuckets); -+ *last_updated = jiffies; -+ } -+ -+ bch2_trans_begin(&trans); -+ -+ if (bkey_ge(iter.pos, end)) { -+ ret = 0; -+ break; -+ } -+ -+ k = bch2_get_key_or_hole(&iter, end, &hole); -+ ret = bkey_err(k); -+ if (ret) -+ goto bkey_err; -+ -+ if (k.k->type) { -+ /* -+ * We process live keys in the alloc btree one at a -+ * time: -+ */ -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); -+ -+ ret = bch2_bucket_do_index(&trans, k, a, true) ?: -+ bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL); -+ if (ret) -+ goto bkey_err; -+ -+ bch2_btree_iter_advance(&iter); -+ } else { -+ struct bkey_i *freespace; -+ -+ freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace)); -+ ret = PTR_ERR_OR_ZERO(freespace); -+ if (ret) -+ goto bkey_err; -+ -+ bkey_init(&freespace->k); -+ freespace->k.type = KEY_TYPE_set; -+ freespace->k.p = k.k->p; -+ freespace->k.size = k.k->size; -+ -+ ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?: -+ bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL); -+ if (ret) -+ goto bkey_err; -+ -+ bch2_btree_iter_set_pos(&iter, k.k->p); -+ } -+bkey_err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ -+ if (ret < 0) { -+ bch_err(ca, "error initializing free space: %s", bch2_err_str(ret)); -+ return ret; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; -+ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+int bch2_fs_freespace_init(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; -+ bool doing_init = false; -+ unsigned long last_updated = jiffies; -+ -+ /* -+ * We can crash during the device add path, so we need to check this on -+ * every mount: -+ */ -+ -+ for_each_member_device(ca, c, i) { -+ if (ca->mi.freespace_initialized) -+ continue; -+ -+ if (!doing_init) { -+ bch_info(c, "initializing freespace"); -+ doing_init = true; -+ } -+ -+ ret = bch2_dev_freespace_init(c, ca, &last_updated); -+ if (ret) { -+ percpu_ref_put(&ca->ref); -+ bch_err_fn(c, ret); -+ return ret; -+ } -+ } -+ -+ if (doing_init) { -+ mutex_lock(&c->sb_lock); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ bch_verbose(c, "done initializing freespace"); -+ } -+ -+ return 0; -+} -+ -+/* Bucket IO clocks: */ -+ -+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, -+ size_t bucket_nr, int rw) -+{ -+ struct bch_fs *c = trans->c; -+ struct 
btree_iter iter; -+ struct bkey_i_alloc_v4 *a; -+ u64 now; -+ int ret = 0; -+ -+ a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ return ret; -+ -+ now = atomic64_read(&c->io_clock[rw].now); -+ if (a->v.io_time[rw] == now) -+ goto out; -+ -+ a->v.io_time[rw] = now; -+ -+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: -+ bch2_trans_commit(trans, NULL, NULL, 0); -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/* Startup/shutdown (ro/rw): */ -+ -+void bch2_recalc_capacity(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ u64 capacity = 0, reserved_sectors = 0, gc_reserve; -+ unsigned bucket_size_max = 0; -+ unsigned long ra_pages = 0; -+ unsigned i; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ for_each_online_member(ca, c, i) { -+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; -+ -+ ra_pages += bdi->ra_pages; -+ } -+ -+ bch2_set_ra_pages(c, ra_pages); -+ -+ for_each_rw_member(ca, c, i) { -+ u64 dev_reserve = 0; -+ -+ /* -+ * We need to reserve buckets (from the number -+ * of currently available buckets) against -+ * foreground writes so that mainly copygc can -+ * make forward progress. -+ * -+ * We need enough to refill the various reserves -+ * from scratch - copygc will use its entire -+ * reserve all at once, then run against when -+ * its reserve is refilled (from the formerly -+ * available buckets). -+ * -+ * This reserve is just used when considering if -+ * allocations for foreground writes must wait - -+ * not -ENOSPC calculations. -+ */ -+ -+ dev_reserve += ca->nr_btree_reserve * 2; -+ dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ -+ -+ dev_reserve += 1; /* btree write point */ -+ dev_reserve += 1; /* copygc write point */ -+ dev_reserve += 1; /* rebalance write point */ -+ -+ dev_reserve *= ca->mi.bucket_size; -+ -+ capacity += bucket_to_sector(ca, ca->mi.nbuckets - -+ ca->mi.first_bucket); -+ -+ reserved_sectors += dev_reserve * 2; -+ -+ bucket_size_max = max_t(unsigned, bucket_size_max, -+ ca->mi.bucket_size); -+ } -+ -+ gc_reserve = c->opts.gc_reserve_bytes -+ ? 
c->opts.gc_reserve_bytes >> 9 -+ : div64_u64(capacity * c->opts.gc_reserve_percent, 100); -+ -+ reserved_sectors = max(gc_reserve, reserved_sectors); -+ -+ reserved_sectors = min(reserved_sectors, capacity); -+ -+ c->capacity = capacity - reserved_sectors; -+ -+ c->bucket_size_max = bucket_size_max; -+ -+ /* Wake up case someone was waiting for buckets */ -+ closure_wake_up(&c->freelist_wait); -+} -+ -+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct open_bucket *ob; -+ bool ret = false; -+ -+ for (ob = c->open_buckets; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); -+ ob++) { -+ spin_lock(&ob->lock); -+ if (ob->valid && !ob->on_partial_list && -+ ob->dev == ca->dev_idx) -+ ret = true; -+ spin_unlock(&ob->lock); -+ } -+ -+ return ret; -+} -+ -+/* device goes ro: */ -+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ /* First, remove device from allocation groups: */ -+ -+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) -+ clear_bit(ca->dev_idx, c->rw_devs[i].d); -+ -+ /* -+ * Capacity is calculated based off of devices in allocation groups: -+ */ -+ bch2_recalc_capacity(c); -+ -+ bch2_open_buckets_stop(c, ca, false); -+ -+ /* -+ * Wake up threads that were blocked on allocation, so they can notice -+ * the device can no longer be removed and the capacity has changed: -+ */ -+ closure_wake_up(&c->freelist_wait); -+ -+ /* -+ * journal_res_get() can block waiting for free space in the journal - -+ * it needs to notice there may not be devices to allocate from anymore: -+ */ -+ wake_up(&c->journal.wait); -+ -+ /* Now wait for any in flight writes: */ -+ -+ closure_wait_event(&c->open_buckets_wait, -+ !bch2_dev_has_open_write_point(c, ca)); -+} -+ -+/* device goes rw: */ -+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) -+ if (ca->mi.data_allowed & (1 << i)) -+ set_bit(ca->dev_idx, c->rw_devs[i].d); -+} -+ -+void bch2_fs_allocator_background_init(struct bch_fs *c) -+{ -+ spin_lock_init(&c->freelist_lock); -+ INIT_WORK(&c->discard_work, bch2_do_discards_work); -+ INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); -+} -diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h -new file mode 100644 -index 000000000..c0914feb5 ---- /dev/null -+++ b/fs/bcachefs/alloc_background.h -@@ -0,0 +1,257 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_BACKGROUND_H -+#define _BCACHEFS_ALLOC_BACKGROUND_H -+ -+#include "bcachefs.h" -+#include "alloc_types.h" -+#include "buckets.h" -+#include "debug.h" -+#include "super.h" -+ -+enum bkey_invalid_flags; -+ -+/* How out of date a pointer gen is allowed to be: */ -+#define BUCKET_GC_GEN_MAX 96U -+ -+static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) -+{ -+ struct bch_dev *ca; -+ -+ if (!bch2_dev_exists2(c, pos.inode)) -+ return false; -+ -+ ca = bch_dev_bkey_exists(c, pos.inode); -+ return pos.offset >= ca->mi.first_bucket && -+ pos.offset < ca->mi.nbuckets; -+} -+ -+static inline u64 bucket_to_u64(struct bpos bucket) -+{ -+ return (bucket.inode << 48) | bucket.offset; -+} -+ -+static inline struct bpos u64_to_bucket(u64 bucket) -+{ -+ return POS(bucket >> 48, bucket & ~(~0ULL << 48)); -+} -+ -+static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) -+{ -+ return a.gen - a.oldest_gen; -+} -+ -+static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, -+ u32 cached_sectors, -+ u32 stripe, -+ struct 
bch_alloc_v4 a, -+ enum bch_data_type data_type) -+{ -+ if (stripe) -+ return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; -+ if (dirty_sectors) -+ return data_type; -+ if (cached_sectors) -+ return BCH_DATA_cached; -+ if (BCH_ALLOC_V4_NEED_DISCARD(&a)) -+ return BCH_DATA_need_discard; -+ if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) -+ return BCH_DATA_need_gc_gens; -+ return BCH_DATA_free; -+} -+ -+static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, -+ enum bch_data_type data_type) -+{ -+ return __alloc_data_type(a.dirty_sectors, a.cached_sectors, -+ a.stripe, a, data_type); -+} -+ -+static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) -+{ -+ return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; -+} -+ -+static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) -+{ -+ return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0; -+} -+ -+#define DATA_TYPES_MOVABLE \ -+ ((1U << BCH_DATA_btree)| \ -+ (1U << BCH_DATA_user)| \ -+ (1U << BCH_DATA_stripe)) -+ -+static inline bool data_type_movable(enum bch_data_type type) -+{ -+ return (1U << type) & DATA_TYPES_MOVABLE; -+} -+ -+static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, -+ struct bch_dev *ca) -+{ -+ if (!data_type_movable(a.data_type) || -+ a.dirty_sectors >= ca->mi.bucket_size) -+ return 0; -+ -+ return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size); -+} -+ -+static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) -+{ -+ return ((u64) alloc_gc_gen(a) >> 4) << 56; -+} -+ -+static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) -+{ -+ pos.offset |= alloc_freespace_genbits(a); -+ return pos; -+} -+ -+static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) -+{ -+ unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: -+ BCH_ALLOC_V4_U64s_V0) + -+ BCH_ALLOC_V4_NR_BACKPOINTERS(a) * -+ (sizeof(struct bch_backpointer) / sizeof(u64)); -+ -+ BUG_ON(ret > U8_MAX - BKEY_U64s); -+ return ret; -+} -+ -+static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) -+{ -+ set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); -+} -+ -+struct bkey_i_alloc_v4 * -+bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); -+ -+void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); -+ -+static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert) -+{ -+ const struct bch_alloc_v4 *ret; -+ -+ if (unlikely(k.k->type != KEY_TYPE_alloc_v4)) -+ goto slowpath; -+ -+ ret = bkey_s_c_to_alloc_v4(k).v; -+ if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s) -+ goto slowpath; -+ -+ return ret; -+slowpath: -+ __bch2_alloc_to_v4(k, convert); -+ return convert; -+} -+ -+struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); -+ -+int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -+ -+int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_alloc_v4_swab(struct bkey_s); -+void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define 
bch2_bkey_ops_alloc ((struct bkey_ops) { \ -+ .key_invalid = bch2_alloc_v1_invalid, \ -+ .val_to_text = bch2_alloc_to_text, \ -+ .trans_trigger = bch2_trans_mark_alloc, \ -+ .atomic_trigger = bch2_mark_alloc, \ -+ .min_val_size = 8, \ -+}) -+ -+#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ -+ .key_invalid = bch2_alloc_v2_invalid, \ -+ .val_to_text = bch2_alloc_to_text, \ -+ .trans_trigger = bch2_trans_mark_alloc, \ -+ .atomic_trigger = bch2_mark_alloc, \ -+ .min_val_size = 8, \ -+}) -+ -+#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ -+ .key_invalid = bch2_alloc_v3_invalid, \ -+ .val_to_text = bch2_alloc_to_text, \ -+ .trans_trigger = bch2_trans_mark_alloc, \ -+ .atomic_trigger = bch2_mark_alloc, \ -+ .min_val_size = 16, \ -+}) -+ -+#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ -+ .key_invalid = bch2_alloc_v4_invalid, \ -+ .val_to_text = bch2_alloc_to_text, \ -+ .swab = bch2_alloc_v4_swab, \ -+ .trans_trigger = bch2_trans_mark_alloc, \ -+ .atomic_trigger = bch2_mark_alloc, \ -+ .min_val_size = 48, \ -+}) -+ -+int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ -+ .key_invalid = bch2_bucket_gens_invalid, \ -+ .val_to_text = bch2_bucket_gens_to_text, \ -+}) -+ -+int bch2_bucket_gens_init(struct bch_fs *); -+ -+static inline bool bkey_is_alloc(const struct bkey *k) -+{ -+ return k->type == KEY_TYPE_alloc || -+ k->type == KEY_TYPE_alloc_v2 || -+ k->type == KEY_TYPE_alloc_v3; -+} -+ -+int bch2_alloc_read(struct bch_fs *); -+ -+int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_i *, unsigned); -+int bch2_check_alloc_info(struct bch_fs *); -+int bch2_check_alloc_to_lru_refs(struct bch_fs *); -+void bch2_do_discards(struct bch_fs *); -+ -+static inline u64 should_invalidate_buckets(struct bch_dev *ca, -+ struct bch_dev_usage u) -+{ -+ u64 want_free = ca->mi.nbuckets >> 7; -+ u64 free = max_t(s64, 0, -+ u.d[BCH_DATA_free].buckets -+ + u.d[BCH_DATA_need_discard].buckets -+ - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); -+ -+ return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); -+} -+ -+void bch2_do_invalidates(struct bch_fs *); -+ -+static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) -+{ -+ return (void *) ((u64 *) &a->v + -+ (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: -+ BCH_ALLOC_V4_U64s_V0)); -+} -+ -+static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) -+{ -+ return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); -+} -+ -+int bch2_fs_freespace_init(struct bch_fs *); -+ -+void bch2_recalc_capacity(struct bch_fs *); -+ -+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); -+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -+ -+void bch2_fs_allocator_background_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ -diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c -new file mode 100644 -index 000000000..e02749ddc ---- /dev/null -+++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,1571 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Copyright 2012 Google, Inc. -+ * -+ * Foreground allocator code: allocate buckets from freelist, and allocate in -+ * sector granularity from writepoints. 
-+ * -+ * bch2_bucket_alloc() allocates a single bucket from a specific device. -+ * -+ * bch2_bucket_alloc_set() allocates one or more buckets from different devices -+ * in a given filesystem. -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "backpointers.h" -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "buckets_waiting_for_journal.h" -+#include "clock.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "io.h" -+#include "journal.h" -+#include "movinggc.h" -+#include "nocow_locking.h" -+#include "trace.h" -+ -+#include -+#include -+#include -+ -+static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans, -+ struct mutex *lock) -+{ -+ if (!mutex_trylock(lock)) { -+ bch2_trans_unlock(trans); -+ mutex_lock(lock); -+ } -+} -+ -+const char * const bch2_watermarks[] = { -+#define x(t) #t, -+ BCH_WATERMARKS() -+#undef x -+ NULL -+}; -+ -+/* -+ * Open buckets represent a bucket that's currently being allocated from. They -+ * serve two purposes: -+ * -+ * - They track buckets that have been partially allocated, allowing for -+ * sub-bucket sized allocations - they're used by the sector allocator below -+ * -+ * - They provide a reference to the buckets they own that mark and sweep GC -+ * can find, until the new allocation has a pointer to it inserted into the -+ * btree -+ * -+ * When allocating some space with the sector allocator, the allocation comes -+ * with a reference to an open bucket - the caller is required to put that -+ * reference _after_ doing the index update that makes its allocation reachable. -+ */ -+ -+void bch2_reset_alloc_cursors(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, NULL) -+ ca->alloc_cursor = 0; -+ rcu_read_unlock(); -+} -+ -+static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) -+{ -+ open_bucket_idx_t idx = ob - c->open_buckets; -+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); -+ -+ ob->hash = *slot; -+ *slot = idx; -+} -+ -+static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) -+{ -+ open_bucket_idx_t idx = ob - c->open_buckets; -+ open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); -+ -+ while (*slot != idx) { -+ BUG_ON(!*slot); -+ slot = &c->open_buckets[*slot].hash; -+ } -+ -+ *slot = ob->hash; -+ ob->hash = 0; -+} -+ -+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); -+ -+ if (ob->ec) { -+ ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); -+ return; -+ } -+ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&ob->lock); -+ -+ ob->valid = false; -+ ob->data_type = 0; -+ -+ spin_unlock(&ob->lock); -+ percpu_up_read(&c->mark_lock); -+ -+ spin_lock(&c->freelist_lock); -+ bch2_open_bucket_hash_remove(c, ob); -+ -+ ob->freelist = c->open_buckets_freelist; -+ c->open_buckets_freelist = ob - c->open_buckets; -+ -+ c->open_buckets_nr_free++; -+ ca->nr_open_buckets--; -+ spin_unlock(&c->freelist_lock); -+ -+ closure_wake_up(&c->open_buckets_wait); -+} -+ -+void bch2_open_bucket_write_error(struct bch_fs *c, -+ struct open_buckets *obs, -+ unsigned dev) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) -+ if (ob->dev == dev && ob->ec) -+ bch2_ec_bucket_cancel(c, ob); -+} -+ -+static struct open_bucket 
*bch2_open_bucket_alloc(struct bch_fs *c) -+{ -+ struct open_bucket *ob; -+ -+ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); -+ -+ ob = c->open_buckets + c->open_buckets_freelist; -+ c->open_buckets_freelist = ob->freelist; -+ atomic_set(&ob->pin, 1); -+ ob->data_type = 0; -+ -+ c->open_buckets_nr_free--; -+ return ob; -+} -+ -+static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) -+{ -+ BUG_ON(c->open_buckets_partial_nr >= -+ ARRAY_SIZE(c->open_buckets_partial)); -+ -+ spin_lock(&c->freelist_lock); -+ ob->on_partial_list = true; -+ c->open_buckets_partial[c->open_buckets_partial_nr++] = -+ ob - c->open_buckets; -+ spin_unlock(&c->freelist_lock); -+ -+ closure_wake_up(&c->open_buckets_wait); -+ closure_wake_up(&c->freelist_wait); -+} -+ -+/* _only_ for allocating the journal on a new device: */ -+long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -+{ -+ while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { -+ u64 b = ca->new_fs_bucket_idx++; -+ -+ if (!is_superblock_bucket(ca, b) && -+ (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) -+ return b; -+ } -+ -+ return -1; -+} -+ -+static inline unsigned open_buckets_reserved(enum bch_watermark watermark) -+{ -+ switch (watermark) { -+ case BCH_WATERMARK_reclaim: -+ return 0; -+ case BCH_WATERMARK_btree: -+ case BCH_WATERMARK_btree_copygc: -+ return OPEN_BUCKETS_COUNT / 4; -+ case BCH_WATERMARK_copygc: -+ return OPEN_BUCKETS_COUNT / 3; -+ default: -+ return OPEN_BUCKETS_COUNT / 2; -+ } -+} -+ -+static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -+ u64 bucket, -+ enum bch_watermark watermark, -+ const struct bch_alloc_v4 *a, -+ struct bucket_alloc_state *s, -+ struct closure *cl) -+{ -+ struct open_bucket *ob; -+ -+ if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { -+ s->skipped_nouse++; -+ return NULL; -+ } -+ -+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { -+ s->skipped_open++; -+ return NULL; -+ } -+ -+ if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, -+ c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { -+ s->skipped_need_journal_commit++; -+ return NULL; -+ } -+ -+ if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { -+ s->skipped_nocow++; -+ return NULL; -+ } -+ -+ spin_lock(&c->freelist_lock); -+ -+ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { -+ if (cl) -+ closure_wait(&c->open_buckets_wait, cl); -+ -+ if (!c->blocked_allocate_open_bucket) -+ c->blocked_allocate_open_bucket = local_clock(); -+ -+ spin_unlock(&c->freelist_lock); -+ return ERR_PTR(-BCH_ERR_open_buckets_empty); -+ } -+ -+ /* Recheck under lock: */ -+ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { -+ spin_unlock(&c->freelist_lock); -+ s->skipped_open++; -+ return NULL; -+ } -+ -+ ob = bch2_open_bucket_alloc(c); -+ -+ spin_lock(&ob->lock); -+ -+ ob->valid = true; -+ ob->sectors_free = ca->mi.bucket_size; -+ ob->dev = ca->dev_idx; -+ ob->gen = a->gen; -+ ob->bucket = bucket; -+ spin_unlock(&ob->lock); -+ -+ ca->nr_open_buckets++; -+ bch2_open_bucket_hash_add(c, ob); -+ -+ if (c->blocked_allocate_open_bucket) { -+ bch2_time_stats_update( -+ &c->times[BCH_TIME_blocked_allocate_open_bucket], -+ c->blocked_allocate_open_bucket); -+ c->blocked_allocate_open_bucket = 0; -+ } -+ -+ if (c->blocked_allocate) { -+ bch2_time_stats_update( -+ &c->times[BCH_TIME_blocked_allocate], -+ c->blocked_allocate); -+ c->blocked_allocate = 0; -+ } -+ -+ spin_unlock(&c->freelist_lock); -+ return ob; -+} -+ 
-+static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, -+ enum bch_watermark watermark, u64 free_entry, -+ struct bucket_alloc_state *s, -+ struct bkey_s_c freespace_k, -+ struct closure *cl) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter = { NULL }; -+ struct bkey_s_c k; -+ struct open_bucket *ob; -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a; -+ u64 b = free_entry & ~(~0ULL << 56); -+ unsigned genbits = free_entry >> 56; -+ struct printbuf buf = PRINTBUF; -+ int ret; -+ -+ if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { -+ prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" -+ " freespace key ", -+ ca->mi.first_bucket, ca->mi.nbuckets); -+ bch2_bkey_val_to_text(&buf, c, freespace_k); -+ bch2_trans_inconsistent(trans, "%s", buf.buf); -+ ob = ERR_PTR(-EIO); -+ goto err; -+ } -+ -+ k = bch2_bkey_get_iter(trans, &iter, -+ BTREE_ID_alloc, POS(ca->dev_idx, b), -+ BTREE_ITER_CACHED); -+ ret = bkey_err(k); -+ if (ret) { -+ ob = ERR_PTR(ret); -+ goto err; -+ } -+ -+ a = bch2_alloc_to_v4(k, &a_convert); -+ -+ if (a->data_type != BCH_DATA_free) { -+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { -+ ob = NULL; -+ goto err; -+ } -+ -+ prt_printf(&buf, "non free bucket in freespace btree\n" -+ " freespace key "); -+ bch2_bkey_val_to_text(&buf, c, freespace_k); -+ prt_printf(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, k); -+ bch2_trans_inconsistent(trans, "%s", buf.buf); -+ ob = ERR_PTR(-EIO); -+ goto err; -+ } -+ -+ if (genbits != (alloc_freespace_genbits(*a) >> 56) && -+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { -+ prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" -+ " freespace key ", -+ genbits, alloc_freespace_genbits(*a) >> 56); -+ bch2_bkey_val_to_text(&buf, c, freespace_k); -+ prt_printf(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, k); -+ bch2_trans_inconsistent(trans, "%s", buf.buf); -+ ob = ERR_PTR(-EIO); -+ goto err; -+ } -+ -+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { -+ struct bch_backpointer bp; -+ struct bpos bp_pos = POS_MIN; -+ -+ ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, -+ &bp_pos, &bp, -+ BTREE_ITER_NOPRESERVE); -+ if (ret) { -+ ob = ERR_PTR(ret); -+ goto err; -+ } -+ -+ if (!bkey_eq(bp_pos, POS_MAX)) { -+ /* -+ * Bucket may have data in it - we don't call -+ * bc2h_trans_inconnsistent() because fsck hasn't -+ * finished yet -+ */ -+ ob = NULL; -+ goto err; -+ } -+ } -+ -+ ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); -+ if (!ob) -+ iter.path->preserve = false; -+err: -+ if (iter.trans && iter.path) -+ set_btree_iter_dontneed(&iter); -+ bch2_trans_iter_exit(trans, &iter); -+ printbuf_exit(&buf); -+ return ob; -+} -+ -+/* -+ * This path is for before the freespace btree is initialized: -+ * -+ * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & -+ * journal buckets - journal buckets will be < ca->new_fs_bucket_idx -+ */ -+static noinline struct open_bucket * -+bch2_bucket_alloc_early(struct btree_trans *trans, -+ struct bch_dev *ca, -+ enum bch_watermark watermark, -+ struct bucket_alloc_state *s, -+ struct closure *cl) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct open_bucket *ob = NULL; -+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); -+ u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor)); -+ int ret; -+again: -+ 
for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), -+ BTREE_ITER_SLOTS, k, ret) { -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a; -+ -+ if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) -+ break; -+ -+ if (ca->new_fs_bucket_idx && -+ is_superblock_bucket(ca, k.k->p.offset)) -+ continue; -+ -+ a = bch2_alloc_to_v4(k, &a_convert); -+ -+ if (a->data_type != BCH_DATA_free) -+ continue; -+ -+ s->buckets_seen++; -+ -+ ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); -+ if (ob) -+ break; -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ ca->alloc_cursor = alloc_cursor; -+ -+ if (!ob && ret) -+ ob = ERR_PTR(ret); -+ -+ if (!ob && alloc_cursor > alloc_start) { -+ alloc_cursor = alloc_start; -+ goto again; -+ } -+ -+ return ob; -+} -+ -+static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, -+ struct bch_dev *ca, -+ enum bch_watermark watermark, -+ struct bucket_alloc_state *s, -+ struct closure *cl) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct open_bucket *ob = NULL; -+ u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor)); -+ u64 alloc_cursor = alloc_start; -+ int ret; -+ -+ BUG_ON(ca->new_fs_bucket_idx); -+again: -+ for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, -+ POS(ca->dev_idx, alloc_cursor), 0, k, ret) { -+ if (k.k->p.inode != ca->dev_idx) -+ break; -+ -+ for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); -+ alloc_cursor < k.k->p.offset; -+ alloc_cursor++) { -+ ret = btree_trans_too_many_iters(trans); -+ if (ret) { -+ ob = ERR_PTR(ret); -+ break; -+ } -+ -+ s->buckets_seen++; -+ -+ ob = try_alloc_bucket(trans, ca, watermark, -+ alloc_cursor, s, k, cl); -+ if (ob) { -+ iter.path->preserve = false; -+ break; -+ } -+ } -+ -+ if (ob || ret) -+ break; -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ ca->alloc_cursor = alloc_cursor; -+ -+ if (!ob && ret) -+ ob = ERR_PTR(ret); -+ -+ if (!ob && alloc_start > ca->mi.first_bucket) { -+ alloc_cursor = alloc_start = ca->mi.first_bucket; -+ goto again; -+ } -+ -+ return ob; -+} -+ -+/** -+ * bch_bucket_alloc - allocate a single bucket from a specific device -+ * -+ * Returns index of bucket on success, 0 on failure -+ */ -+static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, -+ struct bch_dev *ca, -+ enum bch_watermark watermark, -+ struct closure *cl, -+ struct bch_dev_usage *usage) -+{ -+ struct bch_fs *c = trans->c; -+ struct open_bucket *ob = NULL; -+ bool freespace = READ_ONCE(ca->mi.freespace_initialized); -+ u64 avail; -+ struct bucket_alloc_state s = { 0 }; -+ bool waiting = false; -+again: -+ bch2_dev_usage_read_fast(ca, usage); -+ avail = dev_buckets_free(ca, *usage, watermark); -+ -+ if (usage->d[BCH_DATA_need_discard].buckets > avail) -+ bch2_do_discards(c); -+ -+ if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) -+ bch2_do_gc_gens(c); -+ -+ if (should_invalidate_buckets(ca, *usage)) -+ bch2_do_invalidates(c); -+ -+ if (!avail) { -+ if (cl && !waiting) { -+ closure_wait(&c->freelist_wait, cl); -+ waiting = true; -+ goto again; -+ } -+ -+ if (!c->blocked_allocate) -+ c->blocked_allocate = local_clock(); -+ -+ ob = ERR_PTR(-BCH_ERR_freelist_empty); -+ goto err; -+ } -+ -+ if (waiting) -+ closure_wake_up(&c->freelist_wait); -+alloc: -+ ob = likely(freespace) -+ ? 
bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) -+ : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); -+ -+ if (s.skipped_need_journal_commit * 2 > avail) -+ bch2_journal_flush_async(&c->journal, NULL); -+ -+ if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { -+ freespace = false; -+ goto alloc; -+ } -+err: -+ if (!ob) -+ ob = ERR_PTR(-BCH_ERR_no_buckets_found); -+ -+ if (!IS_ERR(ob)) -+ trace_and_count(c, bucket_alloc, ca, -+ bch2_watermarks[watermark], -+ ob->bucket, -+ usage->d[BCH_DATA_free].buckets, -+ avail, -+ bch2_copygc_wait_amount(c), -+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), -+ &s, -+ cl == NULL, -+ ""); -+ else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) -+ trace_and_count(c, bucket_alloc_fail, ca, -+ bch2_watermarks[watermark], -+ 0, -+ usage->d[BCH_DATA_free].buckets, -+ avail, -+ bch2_copygc_wait_amount(c), -+ c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), -+ &s, -+ cl == NULL, -+ bch2_err_str(PTR_ERR(ob))); -+ -+ return ob; -+} -+ -+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_watermark watermark, -+ struct closure *cl) -+{ -+ struct bch_dev_usage usage; -+ struct open_bucket *ob; -+ -+ bch2_trans_do(c, NULL, NULL, 0, -+ PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark, -+ cl, &usage))); -+ return ob; -+} -+ -+static int __dev_stripe_cmp(struct dev_stripe_state *stripe, -+ unsigned l, unsigned r) -+{ -+ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - -+ (stripe->next_alloc[l] < stripe->next_alloc[r])); -+} -+ -+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -+ -+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, -+ struct dev_stripe_state *stripe, -+ struct bch_devs_mask *devs) -+{ -+ struct dev_alloc_list ret = { .nr = 0 }; -+ unsigned i; -+ -+ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) -+ ret.devs[ret.nr++] = i; -+ -+ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); -+ return ret; -+} -+ -+static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, -+ struct dev_stripe_state *stripe, -+ struct bch_dev_usage *usage) -+{ -+ u64 *v = stripe->next_alloc + ca->dev_idx; -+ u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal); -+ u64 free_space_inv = free_space -+ ? div64_u64(1ULL << 48, free_space) -+ : 1ULL << 48; -+ u64 scale = *v / 4; -+ -+ if (*v + free_space_inv >= *v) -+ *v += free_space_inv; -+ else -+ *v = U64_MAX; -+ -+ for (v = stripe->next_alloc; -+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) -+ *v = *v < scale ? 0 : *v - scale; -+} -+ -+void bch2_dev_stripe_increment(struct bch_dev *ca, -+ struct dev_stripe_state *stripe) -+{ -+ struct bch_dev_usage usage; -+ -+ bch2_dev_usage_read_fast(ca, &usage); -+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage); -+} -+ -+static int add_new_bucket(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ unsigned flags, -+ struct open_bucket *ob) -+{ -+ unsigned durability = -+ bch_dev_bkey_exists(c, ob->dev)->mi.durability; -+ -+ BUG_ON(*nr_effective >= nr_replicas); -+ BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); -+ -+ __clear_bit(ob->dev, devs_may_alloc->d); -+ *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) -+ ? 
durability : 1; -+ *have_cache |= !durability; -+ -+ ob_push(c, ptrs, ob); -+ -+ if (*nr_effective >= nr_replicas) -+ return 1; -+ if (ob->ec) -+ return 1; -+ return 0; -+} -+ -+int bch2_bucket_alloc_set_trans(struct btree_trans *trans, -+ struct open_buckets *ptrs, -+ struct dev_stripe_state *stripe, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ unsigned flags, -+ enum bch_data_type data_type, -+ enum bch_watermark watermark, -+ struct closure *cl) -+{ -+ struct bch_fs *c = trans->c; -+ struct dev_alloc_list devs_sorted = -+ bch2_dev_alloc_list(c, stripe, devs_may_alloc); -+ unsigned dev; -+ struct bch_dev *ca; -+ int ret = -BCH_ERR_insufficient_devices; -+ unsigned i; -+ -+ BUG_ON(*nr_effective >= nr_replicas); -+ -+ for (i = 0; i < devs_sorted.nr; i++) { -+ struct bch_dev_usage usage; -+ struct open_bucket *ob; -+ -+ dev = devs_sorted.devs[i]; -+ -+ rcu_read_lock(); -+ ca = rcu_dereference(c->devs[dev]); -+ if (ca) -+ percpu_ref_get(&ca->ref); -+ rcu_read_unlock(); -+ -+ if (!ca) -+ continue; -+ -+ if (!ca->mi.durability && *have_cache) { -+ percpu_ref_put(&ca->ref); -+ continue; -+ } -+ -+ ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage); -+ if (!IS_ERR(ob)) -+ bch2_dev_stripe_increment_inlined(ca, stripe, &usage); -+ percpu_ref_put(&ca->ref); -+ -+ if (IS_ERR(ob)) { -+ ret = PTR_ERR(ob); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl) -+ break; -+ continue; -+ } -+ -+ ob->data_type = data_type; -+ -+ if (add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_replicas, nr_effective, -+ have_cache, flags, ob)) { -+ ret = 0; -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+/* Allocate from stripes: */ -+ -+/* -+ * if we can't allocate a new stripe because there are already too many -+ * partially filled stripes, force allocating from an existing stripe even when -+ * it's to a device we don't want: -+ */ -+ -+static int bucket_alloc_from_stripe(struct btree_trans *trans, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_mask *devs_may_alloc, -+ u16 target, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ enum bch_watermark watermark, -+ unsigned flags, -+ struct closure *cl) -+{ -+ struct bch_fs *c = trans->c; -+ struct dev_alloc_list devs_sorted; -+ struct ec_stripe_head *h; -+ struct open_bucket *ob; -+ struct bch_dev *ca; -+ unsigned i, ec_idx; -+ int ret = 0; -+ -+ if (nr_replicas < 2) -+ return 0; -+ -+ if (ec_open_bucket(c, ptrs)) -+ return 0; -+ -+ h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); -+ if (IS_ERR(h)) -+ return PTR_ERR(h); -+ if (!h) -+ return 0; -+ -+ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); -+ -+ for (i = 0; i < devs_sorted.nr; i++) -+ for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { -+ if (!h->s->blocks[ec_idx]) -+ continue; -+ -+ ob = c->open_buckets + h->s->blocks[ec_idx]; -+ if (ob->dev == devs_sorted.devs[i] && -+ !test_and_set_bit(ec_idx, h->s->blocks_allocated)) -+ goto got_bucket; -+ } -+ goto out_put_head; -+got_bucket: -+ ca = bch_dev_bkey_exists(c, ob->dev); -+ -+ ob->ec_idx = ec_idx; -+ ob->ec = h->s; -+ ec_stripe_new_get(h->s, STRIPE_REF_io); -+ -+ ret = add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_replicas, nr_effective, -+ have_cache, flags, ob); -+out_put_head: -+ bch2_ec_stripe_head_put(c, h); -+ return ret; -+} -+ -+/* Sector allocator */ -+ -+static bool want_bucket(struct bch_fs *c, -+ struct write_point *wp, -+ struct bch_devs_mask 
*devs_may_alloc, -+ bool *have_cache, bool ec, -+ struct open_bucket *ob) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); -+ -+ if (!test_bit(ob->dev, devs_may_alloc->d)) -+ return false; -+ -+ if (ob->data_type != wp->data_type) -+ return false; -+ -+ if (!ca->mi.durability && -+ (wp->data_type == BCH_DATA_btree || ec || *have_cache)) -+ return false; -+ -+ if (ec != (ob->ec != NULL)) -+ return false; -+ -+ return true; -+} -+ -+static int bucket_alloc_set_writepoint(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ bool ec, unsigned flags) -+{ -+ struct open_buckets ptrs_skip = { .nr = 0 }; -+ struct open_bucket *ob; -+ unsigned i; -+ int ret = 0; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ if (!ret && want_bucket(c, wp, devs_may_alloc, -+ have_cache, ec, ob)) -+ ret = add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_replicas, nr_effective, -+ have_cache, flags, ob); -+ else -+ ob_push(c, &ptrs_skip, ob); -+ } -+ wp->ptrs = ptrs_skip; -+ -+ return ret; -+} -+ -+static int bucket_alloc_set_partial(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, bool ec, -+ enum bch_watermark watermark, -+ unsigned flags) -+{ -+ int i, ret = 0; -+ -+ if (!c->open_buckets_partial_nr) -+ return 0; -+ -+ spin_lock(&c->freelist_lock); -+ -+ if (!c->open_buckets_partial_nr) -+ goto unlock; -+ -+ for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { -+ struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; -+ -+ if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); -+ struct bch_dev_usage usage; -+ u64 avail; -+ -+ bch2_dev_usage_read_fast(ca, &usage); -+ avail = dev_buckets_free(ca, usage, watermark); -+ if (!avail) -+ continue; -+ -+ array_remove_item(c->open_buckets_partial, -+ c->open_buckets_partial_nr, -+ i); -+ ob->on_partial_list = false; -+ -+ ret = add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_replicas, nr_effective, -+ have_cache, flags, ob); -+ if (ret) -+ break; -+ } -+ } -+unlock: -+ spin_unlock(&c->freelist_lock); -+ return ret; -+} -+ -+static int __open_bucket_add_buckets(struct btree_trans *trans, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_list *devs_have, -+ u16 target, -+ bool erasure_code, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ enum bch_watermark watermark, -+ unsigned flags, -+ struct closure *_cl) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_devs_mask devs; -+ struct open_bucket *ob; -+ struct closure *cl = NULL; -+ unsigned i; -+ int ret; -+ -+ devs = target_rw_devs(c, wp->data_type, target); -+ -+ /* Don't allocate from devices we already have pointers to: */ -+ for (i = 0; i < devs_have->nr; i++) -+ __clear_bit(devs_have->devs[i], devs.d); -+ -+ open_bucket_for_each(c, ptrs, ob, i) -+ __clear_bit(ob->dev, devs.d); -+ -+ if (erasure_code && ec_open_bucket(c, ptrs)) -+ return 0; -+ -+ ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, -+ nr_replicas, nr_effective, -+ have_cache, erasure_code, flags); -+ if (ret) -+ return ret; -+ -+ ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, -+ nr_replicas, nr_effective, -+ have_cache, erasure_code, watermark, flags); -+ if (ret) -+ return ret; -+ -+ if (erasure_code) { -+ ret = 
bucket_alloc_from_stripe(trans, ptrs, wp, &devs, -+ target, -+ nr_replicas, nr_effective, -+ have_cache, -+ watermark, flags, _cl); -+ } else { -+retry_blocking: -+ /* -+ * Try nonblocking first, so that if one device is full we'll try from -+ * other devices: -+ */ -+ ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, -+ nr_replicas, nr_effective, have_cache, -+ flags, wp->data_type, watermark, cl); -+ if (ret && -+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) && -+ !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && -+ !cl && _cl) { -+ cl = _cl; -+ goto retry_blocking; -+ } -+ } -+ -+ return ret; -+} -+ -+static int open_bucket_add_buckets(struct btree_trans *trans, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_list *devs_have, -+ u16 target, -+ unsigned erasure_code, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ enum bch_watermark watermark, -+ unsigned flags, -+ struct closure *cl) -+{ -+ int ret; -+ -+ if (erasure_code) { -+ ret = __open_bucket_add_buckets(trans, ptrs, wp, -+ devs_have, target, erasure_code, -+ nr_replicas, nr_effective, have_cache, -+ watermark, flags, cl); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || -+ bch2_err_matches(ret, BCH_ERR_operation_blocked) || -+ bch2_err_matches(ret, BCH_ERR_freelist_empty) || -+ bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) -+ return ret; -+ if (*nr_effective >= nr_replicas) -+ return 0; -+ } -+ -+ ret = __open_bucket_add_buckets(trans, ptrs, wp, -+ devs_have, target, false, -+ nr_replicas, nr_effective, have_cache, -+ watermark, flags, cl); -+ return ret < 0 ? ret : 0; -+} -+ -+/** -+ * should_drop_bucket - check if this is open_bucket should go away -+ * @ca: if set, we're killing buckets for a particular device -+ * @ec: if true, we're shutting down erasure coding and killing all ec -+ * open_buckets -+ * otherwise, return true -+ * -+ * We're killing open_buckets because we're shutting down a device, erasure -+ * coding, or the entire filesystem - check if this open_bucket matches: -+ */ -+static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, -+ struct bch_dev *ca, bool ec) -+{ -+ if (ec) { -+ return ob->ec != NULL; -+ } else if (ca) { -+ bool drop = ob->dev == ca->dev_idx; -+ struct open_bucket *ob2; -+ unsigned i; -+ -+ if (!drop && ob->ec) { -+ unsigned nr_blocks; -+ -+ mutex_lock(&ob->ec->lock); -+ nr_blocks = bkey_i_to_stripe(&ob->ec->new_stripe.key)->v.nr_blocks; -+ -+ for (i = 0; i < nr_blocks; i++) { -+ if (!ob->ec->blocks[i]) -+ continue; -+ -+ ob2 = c->open_buckets + ob->ec->blocks[i]; -+ drop |= ob2->dev == ca->dev_idx; -+ } -+ mutex_unlock(&ob->ec->lock); -+ } -+ -+ return drop; -+ } else { -+ return true; -+ } -+} -+ -+static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, -+ bool ec, struct write_point *wp) -+{ -+ struct open_buckets ptrs = { .nr = 0 }; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ mutex_lock(&wp->lock); -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ if (should_drop_bucket(ob, c, ca, ec)) -+ bch2_open_bucket_put(c, ob); -+ else -+ ob_push(c, &ptrs, ob); -+ wp->ptrs = ptrs; -+ mutex_unlock(&wp->lock); -+} -+ -+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, -+ bool ec) -+{ -+ unsigned i; -+ -+ /* Next, close write points that point to this device... 
*/ -+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) -+ bch2_writepoint_stop(c, ca, ec, &c->write_points[i]); -+ -+ bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point); -+ bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point); -+ bch2_writepoint_stop(c, ca, ec, &c->btree_write_point); -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ while (c->btree_reserve_cache_nr) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; -+ -+ bch2_open_buckets_put(c, &a->ob); -+ } -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ -+ spin_lock(&c->freelist_lock); -+ i = 0; -+ while (i < c->open_buckets_partial_nr) { -+ struct open_bucket *ob = -+ c->open_buckets + c->open_buckets_partial[i]; -+ -+ if (should_drop_bucket(ob, c, ca, ec)) { -+ --c->open_buckets_partial_nr; -+ swap(c->open_buckets_partial[i], -+ c->open_buckets_partial[c->open_buckets_partial_nr]); -+ ob->on_partial_list = false; -+ spin_unlock(&c->freelist_lock); -+ bch2_open_bucket_put(c, ob); -+ spin_lock(&c->freelist_lock); -+ } else { -+ i++; -+ } -+ } -+ spin_unlock(&c->freelist_lock); -+ -+ bch2_ec_stop_dev(c, ca); -+} -+ -+static inline struct hlist_head *writepoint_hash(struct bch_fs *c, -+ unsigned long write_point) -+{ -+ unsigned hash = -+ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); -+ -+ return &c->write_points_hash[hash]; -+} -+ -+static struct write_point *__writepoint_find(struct hlist_head *head, -+ unsigned long write_point) -+{ -+ struct write_point *wp; -+ -+ rcu_read_lock(); -+ hlist_for_each_entry_rcu(wp, head, node) -+ if (wp->write_point == write_point) -+ goto out; -+ wp = NULL; -+out: -+ rcu_read_unlock(); -+ return wp; -+} -+ -+static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) -+{ -+ u64 stranded = c->write_points_nr * c->bucket_size_max; -+ u64 free = bch2_fs_usage_read_short(c).free; -+ -+ return stranded * factor > free; -+} -+ -+static bool try_increase_writepoints(struct bch_fs *c) -+{ -+ struct write_point *wp; -+ -+ if (c->write_points_nr == ARRAY_SIZE(c->write_points) || -+ too_many_writepoints(c, 32)) -+ return false; -+ -+ wp = c->write_points + c->write_points_nr++; -+ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); -+ return true; -+} -+ -+static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) -+{ -+ struct bch_fs *c = trans->c; -+ struct write_point *wp; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ mutex_lock(&c->write_points_hash_lock); -+ if (c->write_points_nr < old_nr) { -+ mutex_unlock(&c->write_points_hash_lock); -+ return true; -+ } -+ -+ if (c->write_points_nr == 1 || -+ !too_many_writepoints(c, 8)) { -+ mutex_unlock(&c->write_points_hash_lock); -+ return false; -+ } -+ -+ wp = c->write_points + --c->write_points_nr; -+ -+ hlist_del_rcu(&wp->node); -+ mutex_unlock(&c->write_points_hash_lock); -+ -+ bch2_trans_mutex_lock_norelock(trans, &wp->lock); -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ open_bucket_free_unused(c, ob); -+ wp->ptrs.nr = 0; -+ mutex_unlock(&wp->lock); -+ return true; -+} -+ -+static struct write_point *writepoint_find(struct btree_trans *trans, -+ unsigned long write_point) -+{ -+ struct bch_fs *c = trans->c; -+ struct write_point *wp, *oldest; -+ struct hlist_head *head; -+ -+ if (!(write_point & 1UL)) { -+ wp = (struct write_point *) write_point; -+ bch2_trans_mutex_lock_norelock(trans, &wp->lock); -+ return wp; -+ } -+ -+ head = writepoint_hash(c, write_point); -+restart_find: -+ wp = __writepoint_find(head, write_point); -+ if (wp) { 
-+lock_wp: -+ bch2_trans_mutex_lock_norelock(trans, &wp->lock); -+ if (wp->write_point == write_point) -+ goto out; -+ mutex_unlock(&wp->lock); -+ goto restart_find; -+ } -+restart_find_oldest: -+ oldest = NULL; -+ for (wp = c->write_points; -+ wp < c->write_points + c->write_points_nr; wp++) -+ if (!oldest || time_before64(wp->last_used, oldest->last_used)) -+ oldest = wp; -+ -+ bch2_trans_mutex_lock_norelock(trans, &oldest->lock); -+ bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock); -+ if (oldest >= c->write_points + c->write_points_nr || -+ try_increase_writepoints(c)) { -+ mutex_unlock(&c->write_points_hash_lock); -+ mutex_unlock(&oldest->lock); -+ goto restart_find_oldest; -+ } -+ -+ wp = __writepoint_find(head, write_point); -+ if (wp && wp != oldest) { -+ mutex_unlock(&c->write_points_hash_lock); -+ mutex_unlock(&oldest->lock); -+ goto lock_wp; -+ } -+ -+ wp = oldest; -+ hlist_del_rcu(&wp->node); -+ wp->write_point = write_point; -+ hlist_add_head_rcu(&wp->node, head); -+ mutex_unlock(&c->write_points_hash_lock); -+out: -+ wp->last_used = local_clock(); -+ return wp; -+} -+ -+/* -+ * Get us an open_bucket we can allocate from, return with it locked: -+ */ -+int bch2_alloc_sectors_start_trans(struct btree_trans *trans, -+ unsigned target, -+ unsigned erasure_code, -+ struct write_point_specifier write_point, -+ struct bch_devs_list *devs_have, -+ unsigned nr_replicas, -+ unsigned nr_replicas_required, -+ enum bch_watermark watermark, -+ unsigned flags, -+ struct closure *cl, -+ struct write_point **wp_ret) -+{ -+ struct bch_fs *c = trans->c; -+ struct write_point *wp; -+ struct open_bucket *ob; -+ struct open_buckets ptrs; -+ unsigned nr_effective, write_points_nr; -+ bool have_cache; -+ int ret; -+ int i; -+ -+ BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); -+ -+ BUG_ON(!nr_replicas || !nr_replicas_required); -+retry: -+ ptrs.nr = 0; -+ nr_effective = 0; -+ write_points_nr = c->write_points_nr; -+ have_cache = false; -+ -+ *wp_ret = wp = writepoint_find(trans, write_point.v); -+ -+ /* metadata may not allocate on cache devices: */ -+ if (wp->data_type != BCH_DATA_user) -+ have_cache = true; -+ -+ if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { -+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, -+ target, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, watermark, -+ flags, NULL); -+ if (!ret || -+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto alloc_done; -+ -+ /* Don't retry from all devices if we're out of open buckets: */ -+ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) -+ goto allocate_blocking; -+ -+ /* -+ * Only try to allocate cache (durability = 0 devices) from the -+ * specified target: -+ */ -+ have_cache = true; -+ -+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, -+ 0, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, watermark, -+ flags, cl); -+ } else { -+allocate_blocking: -+ ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, -+ target, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, watermark, -+ flags, cl); -+ } -+alloc_done: -+ BUG_ON(!ret && nr_effective < nr_replicas); -+ -+ if (erasure_code && !ec_open_bucket(c, &ptrs)) -+ pr_debug("failed to get ec bucket: ret %u", ret); -+ -+ if (ret == -BCH_ERR_insufficient_devices && -+ nr_effective >= nr_replicas_required) -+ ret = 0; -+ -+ if (ret) -+ goto err; -+ -+ /* Free buckets we didn't use: */ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ open_bucket_free_unused(c, ob); -+ -+ wp->ptrs = ptrs; -+ -+ 
wp->sectors_free = UINT_MAX; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ wp->sectors_free = min(wp->sectors_free, ob->sectors_free); -+ -+ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); -+ -+ return 0; -+err: -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ if (ptrs.nr < ARRAY_SIZE(ptrs.v)) -+ ob_push(c, &ptrs, ob); -+ else -+ open_bucket_free_unused(c, ob); -+ wp->ptrs = ptrs; -+ -+ mutex_unlock(&wp->lock); -+ -+ if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && -+ try_decrease_writepoints(trans, write_points_nr)) -+ goto retry; -+ -+ if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || -+ bch2_err_matches(ret, BCH_ERR_freelist_empty)) -+ return cl -+ ? -BCH_ERR_bucket_alloc_blocked -+ : -BCH_ERR_ENOSPC_bucket_alloc; -+ -+ return ret; -+} -+ -+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); -+ -+ return (struct bch_extent_ptr) { -+ .type = 1 << BCH_EXTENT_ENTRY_ptr, -+ .gen = ob->gen, -+ .dev = ob->dev, -+ .offset = bucket_to_sector(ca, ob->bucket) + -+ ca->mi.bucket_size - -+ ob->sectors_free, -+ }; -+} -+ -+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, -+ struct bkey_i *k, unsigned sectors, -+ bool cached) -+{ -+ bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached); -+} -+ -+/* -+ * Append pointers to the space we just allocated to @k, and mark @sectors space -+ * as allocated out of @ob -+ */ -+void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) -+{ -+ bch2_alloc_sectors_done_inlined(c, wp); -+} -+ -+static inline void writepoint_init(struct write_point *wp, -+ enum bch_data_type type) -+{ -+ mutex_init(&wp->lock); -+ wp->data_type = type; -+ -+ INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates); -+ INIT_LIST_HEAD(&wp->writes); -+ spin_lock_init(&wp->writes_lock); -+} -+ -+void bch2_fs_allocator_foreground_init(struct bch_fs *c) -+{ -+ struct open_bucket *ob; -+ struct write_point *wp; -+ -+ mutex_init(&c->write_points_hash_lock); -+ c->write_points_nr = ARRAY_SIZE(c->write_points); -+ -+ /* open bucket 0 is a sentinal NULL: */ -+ spin_lock_init(&c->open_buckets[0].lock); -+ -+ for (ob = c->open_buckets + 1; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { -+ spin_lock_init(&ob->lock); -+ c->open_buckets_nr_free++; -+ -+ ob->freelist = c->open_buckets_freelist; -+ c->open_buckets_freelist = ob - c->open_buckets; -+ } -+ -+ writepoint_init(&c->btree_write_point, BCH_DATA_btree); -+ writepoint_init(&c->rebalance_write_point, BCH_DATA_user); -+ writepoint_init(&c->copygc_write_point, BCH_DATA_user); -+ -+ for (wp = c->write_points; -+ wp < c->write_points + c->write_points_nr; wp++) { -+ writepoint_init(wp, BCH_DATA_user); -+ -+ wp->last_used = local_clock(); -+ wp->write_point = (unsigned long) wp; -+ hlist_add_head_rcu(&wp->node, -+ writepoint_hash(c, wp->write_point)); -+ } -+} -+ -+static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); -+ unsigned data_type = ob->data_type; -+ barrier(); /* READ_ONCE() doesn't work on bitfields */ -+ -+ prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", -+ ob - c->open_buckets, -+ atomic_read(&ob->pin), -+ data_type < BCH_DATA_NR ? 
bch2_data_types[data_type] : "invalid data type", -+ ob->dev, ob->bucket, ob->gen, -+ ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); -+ if (ob->ec) -+ prt_printf(out, " ec idx %llu", ob->ec->idx); -+ if (ob->on_partial_list) -+ prt_str(out, " partial"); -+ prt_newline(out); -+} -+ -+void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct open_bucket *ob; -+ -+ out->atomic++; -+ -+ for (ob = c->open_buckets; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); -+ ob++) { -+ spin_lock(&ob->lock); -+ if (ob->valid && !ob->on_partial_list) -+ bch2_open_bucket_to_text(out, c, ob); -+ spin_unlock(&ob->lock); -+ } -+ -+ --out->atomic; -+} -+ -+void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ unsigned i; -+ -+ out->atomic++; -+ spin_lock(&c->freelist_lock); -+ -+ for (i = 0; i < c->open_buckets_partial_nr; i++) -+ bch2_open_bucket_to_text(out, c, -+ c->open_buckets + c->open_buckets_partial[i]); -+ -+ spin_unlock(&c->freelist_lock); -+ --out->atomic; -+} -+ -+static const char * const bch2_write_point_states[] = { -+#define x(n) #n, -+ WRITE_POINT_STATES() -+#undef x -+ NULL -+}; -+ -+static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, -+ struct write_point *wp) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ prt_printf(out, "%lu: ", wp->write_point); -+ prt_human_readable_u64(out, wp->sectors_allocated); -+ -+ prt_printf(out, " last wrote: "); -+ bch2_pr_time_units(out, sched_clock() - wp->last_used); -+ -+ for (i = 0; i < WRITE_POINT_STATE_NR; i++) { -+ prt_printf(out, " %s: ", bch2_write_point_states[i]); -+ bch2_pr_time_units(out, wp->time[i]); -+ } -+ -+ prt_newline(out); -+ -+ printbuf_indent_add(out, 2); -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ bch2_open_bucket_to_text(out, c, ob); -+ printbuf_indent_sub(out, 2); -+} -+ -+void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct write_point *wp; -+ -+ prt_str(out, "Foreground write points\n"); -+ for (wp = c->write_points; -+ wp < c->write_points + ARRAY_SIZE(c->write_points); -+ wp++) -+ bch2_write_point_to_text(out, c, wp); -+ -+ prt_str(out, "Copygc write point\n"); -+ bch2_write_point_to_text(out, c, &c->copygc_write_point); -+ -+ prt_str(out, "Rebalance write point\n"); -+ bch2_write_point_to_text(out, c, &c->rebalance_write_point); -+ -+ prt_str(out, "Btree write point\n"); -+ bch2_write_point_to_text(out, c, &c->btree_write_point); -+} -diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h -new file mode 100644 -index 000000000..7aaeec44c ---- /dev/null -+++ b/fs/bcachefs/alloc_foreground.h -@@ -0,0 +1,224 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_FOREGROUND_H -+#define _BCACHEFS_ALLOC_FOREGROUND_H -+ -+#include "bcachefs.h" -+#include "alloc_types.h" -+#include "extents.h" -+#include "sb-members.h" -+ -+#include -+ -+struct bkey; -+struct bch_dev; -+struct bch_fs; -+struct bch_devs_List; -+ -+extern const char * const bch2_watermarks[]; -+ -+void bch2_reset_alloc_cursors(struct bch_fs *); -+ -+struct dev_alloc_list { -+ unsigned nr; -+ u8 devs[BCH_SB_MEMBERS_MAX]; -+}; -+ -+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, -+ struct dev_stripe_state *, -+ struct bch_devs_mask *); -+void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); -+ -+long bch2_bucket_alloc_new_fs(struct bch_dev *); -+ -+struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, -+ enum bch_watermark, struct closure *); -+ 
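For orientation, below is a rough, hypothetical sketch of how a writer would be expected to drive the foreground allocation API declared above (bch2_alloc_sectors_start_trans(), bch2_alloc_sectors_append_ptrs(), bch2_alloc_sectors_done(), writepoint_hashed()). The function example_alloc_for_write() and its parameters are invented for illustration and are not part of the patch; error handling, transaction restarts and the BCH_WRITE_* flags are omitted, so treat it as a sketch of the intended call sequence only, assuming the bcachefs internal headers are available.

	#include "alloc_foreground.h"

	/* Hypothetical caller, for illustration only (not from the patch). */
	static int example_alloc_for_write(struct btree_trans *trans,
					   struct bch_fs *c,
					   struct bkey_i *k,
					   unsigned sectors,
					   unsigned nr_replicas,
					   struct closure *cl)
	{
		struct bch_devs_list devs_have = { .nr = 0 };	/* no pointers yet */
		struct write_point *wp;
		int ret;

		/* Pick open buckets and return with the write point locked: */
		ret = bch2_alloc_sectors_start_trans(trans,
					0,		/* target: no device restriction */
					0,		/* erasure_code: plain replication */
					writepoint_hashed((unsigned long) current),
					&devs_have,
					nr_replicas,
					nr_replicas,	/* nr_replicas_required */
					BCH_WATERMARK_normal,
					0,		/* flags */
					cl, &wp);
		if (ret)
			return ret;

		/* Never append more than the write point has available: */
		sectors = min(sectors, wp->sectors_free);

		/* Append one pointer per open bucket to @k, consuming the space: */
		bch2_alloc_sectors_append_ptrs(c, wp, k, sectors, false);

		/* Releases fully used buckets and unlocks the write point: */
		bch2_alloc_sectors_done(c, wp);

		return 0;
	}

A real caller also has to deal with the -BCH_ERR_bucket_alloc_blocked / ENOSPC cases returned by bch2_alloc_sectors_start_trans() shown earlier in this hunk; the sketch simply propagates the error.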
-+static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, -+ struct open_bucket *ob) -+{ -+ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); -+ -+ obs->v[obs->nr++] = ob - c->open_buckets; -+} -+ -+#define open_bucket_for_each(_c, _obs, _ob, _i) \ -+ for ((_i) = 0; \ -+ (_i) < (_obs)->nr && \ -+ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ -+ (_i)++) -+ -+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, -+ struct open_buckets *obs) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) -+ if (ob->ec) -+ return ob; -+ -+ return NULL; -+} -+ -+void bch2_open_bucket_write_error(struct bch_fs *, -+ struct open_buckets *, unsigned); -+ -+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); -+ -+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -+{ -+ if (atomic_dec_and_test(&ob->pin)) -+ __bch2_open_bucket_put(c, ob); -+} -+ -+static inline void bch2_open_buckets_put(struct bch_fs *c, -+ struct open_buckets *ptrs) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, ptrs, ob, i) -+ bch2_open_bucket_put(c, ob); -+ ptrs->nr = 0; -+} -+ -+static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp) -+{ -+ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); -+ wp->ptrs = keep; -+ -+ mutex_unlock(&wp->lock); -+ -+ bch2_open_buckets_put(c, &ptrs); -+} -+ -+static inline void bch2_open_bucket_get(struct bch_fs *c, -+ struct write_point *wp, -+ struct open_buckets *ptrs) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ ob->data_type = wp->data_type; -+ atomic_inc(&ob->pin); -+ ob_push(c, ptrs, ob); -+ } -+} -+ -+static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, -+ unsigned dev, u64 bucket) -+{ -+ return c->open_buckets_hash + -+ (jhash_3words(dev, bucket, bucket >> 32, 0) & -+ (OPEN_BUCKETS_COUNT - 1)); -+} -+ -+static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) -+{ -+ open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); -+ -+ while (slot) { -+ struct open_bucket *ob = &c->open_buckets[slot]; -+ -+ if (ob->dev == dev && ob->bucket == bucket) -+ return true; -+ -+ slot = ob->hash; -+ } -+ -+ return false; -+} -+ -+static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) -+{ -+ bool ret; -+ -+ if (bch2_bucket_is_open(c, dev, bucket)) -+ return true; -+ -+ spin_lock(&c->freelist_lock); -+ ret = bch2_bucket_is_open(c, dev, bucket); -+ spin_unlock(&c->freelist_lock); -+ -+ return ret; -+} -+ -+int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, -+ struct dev_stripe_state *, struct bch_devs_mask *, -+ unsigned, unsigned *, bool *, unsigned, -+ enum bch_data_type, enum bch_watermark, -+ struct closure *); -+ -+int bch2_alloc_sectors_start_trans(struct btree_trans *, -+ unsigned, unsigned, -+ struct write_point_specifier, -+ struct bch_devs_list *, -+ unsigned, unsigned, -+ enum bch_watermark, -+ unsigned, -+ struct closure *, -+ struct write_point **); -+ -+struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); -+ -+/* -+ * Append pointers to the space we just allocated to @k, and mark @sectors space -+ * as allocated out of @ob -+ */ -+static inline void -+bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs 
*c, struct write_point *wp, -+ struct bkey_i *k, unsigned sectors, -+ bool cached) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ BUG_ON(sectors > wp->sectors_free); -+ wp->sectors_free -= sectors; -+ wp->sectors_allocated += sectors; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); -+ struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); -+ -+ ptr.cached = cached || -+ (!ca->mi.durability && -+ wp->data_type == BCH_DATA_user); -+ -+ bch2_bkey_append_ptr(k, ptr); -+ -+ BUG_ON(sectors > ob->sectors_free); -+ ob->sectors_free -= sectors; -+ } -+} -+ -+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, -+ struct bkey_i *, unsigned, bool); -+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); -+ -+void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool); -+ -+static inline struct write_point_specifier writepoint_hashed(unsigned long v) -+{ -+ return (struct write_point_specifier) { .v = v | 1 }; -+} -+ -+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) -+{ -+ return (struct write_point_specifier) { .v = (unsigned long) wp }; -+} -+ -+void bch2_fs_allocator_foreground_init(struct bch_fs *); -+ -+void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); -+void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); -+ -+#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ -diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h -new file mode 100644 -index 000000000..b91b7a461 ---- /dev/null -+++ b/fs/bcachefs/alloc_types.h -@@ -0,0 +1,126 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_TYPES_H -+#define _BCACHEFS_ALLOC_TYPES_H -+ -+#include -+#include -+ -+#include "clock_types.h" -+#include "fifo.h" -+ -+struct bucket_alloc_state { -+ u64 buckets_seen; -+ u64 skipped_open; -+ u64 skipped_need_journal_commit; -+ u64 skipped_nocow; -+ u64 skipped_nouse; -+}; -+ -+#define BCH_WATERMARKS() \ -+ x(stripe) \ -+ x(normal) \ -+ x(copygc) \ -+ x(btree) \ -+ x(btree_copygc) \ -+ x(reclaim) -+ -+enum bch_watermark { -+#define x(name) BCH_WATERMARK_##name, -+ BCH_WATERMARKS() -+#undef x -+ BCH_WATERMARK_NR, -+}; -+ -+#define BCH_WATERMARK_BITS 3 -+#define BCH_WATERMARK_MASK ~(~0U << BCH_WATERMARK_BITS) -+ -+#define OPEN_BUCKETS_COUNT 1024 -+ -+#define WRITE_POINT_HASH_NR 32 -+#define WRITE_POINT_MAX 32 -+ -+/* -+ * 0 is never a valid open_bucket_idx_t: -+ */ -+typedef u16 open_bucket_idx_t; -+ -+struct open_bucket { -+ spinlock_t lock; -+ atomic_t pin; -+ open_bucket_idx_t freelist; -+ open_bucket_idx_t hash; -+ -+ /* -+ * When an open bucket has an ec_stripe attached, this is the index of -+ * the block in the stripe this open_bucket corresponds to: -+ */ -+ u8 ec_idx; -+ enum bch_data_type data_type:6; -+ unsigned valid:1; -+ unsigned on_partial_list:1; -+ -+ u8 dev; -+ u8 gen; -+ u32 sectors_free; -+ u64 bucket; -+ struct ec_stripe_new *ec; -+}; -+ -+#define OPEN_BUCKET_LIST_MAX 15 -+ -+struct open_buckets { -+ open_bucket_idx_t nr; -+ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; -+}; -+ -+struct dev_stripe_state { -+ u64 next_alloc[BCH_SB_MEMBERS_MAX]; -+}; -+ -+#define WRITE_POINT_STATES() \ -+ x(stopped) \ -+ x(waiting_io) \ -+ x(waiting_work) \ -+ x(running) -+ -+enum write_point_state { -+#define x(n) WRITE_POINT_##n, -+ WRITE_POINT_STATES() -+#undef x -+ WRITE_POINT_STATE_NR -+}; -+ -+struct write_point { -+ struct { -+ struct hlist_node 
node; -+ struct mutex lock; -+ u64 last_used; -+ unsigned long write_point; -+ enum bch_data_type data_type; -+ -+ /* calculated based on how many pointers we're actually going to use: */ -+ unsigned sectors_free; -+ -+ struct open_buckets ptrs; -+ struct dev_stripe_state stripe; -+ -+ u64 sectors_allocated; -+ } __aligned(SMP_CACHE_BYTES); -+ -+ struct { -+ struct work_struct index_update_work; -+ -+ struct list_head writes; -+ spinlock_t writes_lock; -+ -+ enum write_point_state state; -+ u64 last_state_change; -+ u64 time[WRITE_POINT_STATE_NR]; -+ } __aligned(SMP_CACHE_BYTES); -+}; -+ -+struct write_point_specifier { -+ unsigned long v; -+}; -+ -+#endif /* _BCACHEFS_ALLOC_TYPES_H */ -diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c -new file mode 100644 -index 000000000..8747c5e19 ---- /dev/null -+++ b/fs/bcachefs/backpointers.c -@@ -0,0 +1,873 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bbpos.h" -+#include "alloc_background.h" -+#include "backpointers.h" -+#include "btree_cache.h" -+#include "btree_update.h" -+#include "btree_write_buffer.h" -+#include "error.h" -+ -+#include -+ -+static bool extent_matches_bp(struct bch_fs *c, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, -+ struct bpos bucket, -+ struct bch_backpointer bp) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ struct bpos bucket2; -+ struct bch_backpointer bp2; -+ -+ if (p.ptr.cached) -+ continue; -+ -+ bch2_extent_ptr_to_bp(c, btree_id, level, k, p, -+ &bucket2, &bp2); -+ if (bpos_eq(bucket, bucket2) && -+ !memcmp(&bp, &bp2, sizeof(bp))) -+ return true; -+ } -+ -+ return false; -+} -+ -+int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); -+ struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); -+ -+ if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { -+ prt_str(err, "backpointer at wrong pos"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) -+{ -+ prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", -+ bch2_btree_ids[bp->btree_id], -+ bp->level, -+ (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), -+ (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), -+ bp->bucket_len); -+ bch2_bpos_to_text(out, bp->pos); -+} -+ -+void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -+{ -+ prt_str(out, "bucket="); -+ bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); -+ prt_str(out, " "); -+ -+ bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); -+} -+ -+void bch2_backpointer_swab(struct bkey_s k) -+{ -+ struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); -+ -+ bp.v->bucket_offset = swab32(bp.v->bucket_offset); -+ bp.v->bucket_len = swab32(bp.v->bucket_len); -+ bch2_bpos_swab(&bp.v->pos); -+} -+ -+static noinline int backpointer_mod_err(struct btree_trans *trans, -+ struct bch_backpointer bp, -+ struct bkey_s_c bp_k, -+ struct bkey_s_c orig_k, -+ bool insert) -+{ -+ struct bch_fs *c = trans->c; -+ struct printbuf buf = PRINTBUF; -+ -+ if (insert) { -+ prt_printf(&buf, "existing backpointer found when inserting "); -+ bch2_backpointer_to_text(&buf, &bp); -+ prt_newline(&buf); -+ 
printbuf_indent_add(&buf, 2); -+ -+ prt_printf(&buf, "found "); -+ bch2_bkey_val_to_text(&buf, c, bp_k); -+ prt_newline(&buf); -+ -+ prt_printf(&buf, "for "); -+ bch2_bkey_val_to_text(&buf, c, orig_k); -+ -+ bch_err(c, "%s", buf.buf); -+ } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { -+ prt_printf(&buf, "backpointer not found when deleting"); -+ prt_newline(&buf); -+ printbuf_indent_add(&buf, 2); -+ -+ prt_printf(&buf, "searching for "); -+ bch2_backpointer_to_text(&buf, &bp); -+ prt_newline(&buf); -+ -+ prt_printf(&buf, "got "); -+ bch2_bkey_val_to_text(&buf, c, bp_k); -+ prt_newline(&buf); -+ -+ prt_printf(&buf, "for "); -+ bch2_bkey_val_to_text(&buf, c, orig_k); -+ -+ bch_err(c, "%s", buf.buf); -+ } -+ -+ printbuf_exit(&buf); -+ -+ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { -+ bch2_inconsistent_error(c); -+ return -EIO; -+ } else { -+ return 0; -+ } -+} -+ -+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, -+ struct bkey_i_backpointer *bp_k, -+ struct bch_backpointer bp, -+ struct bkey_s_c orig_k, -+ bool insert) -+{ -+ struct btree_iter bp_iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, -+ bp_k->k.p, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_SLOTS| -+ BTREE_ITER_WITH_UPDATES); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (insert -+ ? k.k->type -+ : (k.k->type != KEY_TYPE_backpointer || -+ memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) { -+ ret = backpointer_mod_err(trans, bp, k, orig_k, insert); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); -+err: -+ bch2_trans_iter_exit(trans, &bp_iter); -+ return ret; -+} -+ -+/* -+ * Find the next backpointer >= *bp_offset: -+ */ -+int bch2_get_next_backpointer(struct btree_trans *trans, -+ struct bpos bucket, int gen, -+ struct bpos *bp_pos, -+ struct bch_backpointer *bp, -+ unsigned iter_flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); -+ struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ if (bpos_ge(*bp_pos, bp_end_pos)) -+ goto done; -+ -+ if (gen >= 0) { -+ k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, -+ bucket, BTREE_ITER_CACHED|iter_flags); -+ ret = bkey_err(k); -+ if (ret) -+ goto out; -+ -+ if (k.k->type != KEY_TYPE_alloc_v4 || -+ bkey_s_c_to_alloc_v4(k).v->gen != gen) -+ goto done; -+ } -+ -+ *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0)); -+ -+ for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, -+ *bp_pos, iter_flags, k, ret) { -+ if (bpos_ge(k.k->p, bp_end_pos)) -+ break; -+ -+ *bp_pos = k.k->p; -+ *bp = *bkey_s_c_to_backpointer(k).v; -+ goto out; -+ } -+done: -+ *bp_pos = SPOS_MAX; -+out: -+ bch2_trans_iter_exit(trans, &bp_iter); -+ bch2_trans_iter_exit(trans, &alloc_iter); -+ return ret; -+} -+ -+static void backpointer_not_found(struct btree_trans *trans, -+ struct bpos bp_pos, -+ struct bch_backpointer bp, -+ struct bkey_s_c k, -+ const char *thing_it_points_to) -+{ -+ struct bch_fs *c = trans->c; -+ struct printbuf buf = PRINTBUF; -+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos); -+ -+ if (likely(!bch2_backpointers_no_use_write_buffer)) -+ return; -+ -+ prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", -+ thing_it_points_to); -+ prt_printf(&buf, "bucket: "); -+ bch2_bpos_to_text(&buf, bucket); -+ prt_printf(&buf, "\n "); -+ 
-+ prt_printf(&buf, "backpointer pos: "); -+ bch2_bpos_to_text(&buf, bp_pos); -+ prt_printf(&buf, "\n "); -+ -+ bch2_backpointer_to_text(&buf, &bp); -+ prt_printf(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, k); -+ if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) -+ bch_err_ratelimited(c, "%s", buf.buf); -+ else -+ bch2_trans_inconsistent(trans, "%s", buf.buf); -+ -+ printbuf_exit(&buf); -+} -+ -+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos bp_pos, -+ struct bch_backpointer bp, -+ unsigned iter_flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_root *r = bch2_btree_id_root(c, bp.btree_id); -+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos); -+ struct bkey_s_c k; -+ -+ bch2_trans_node_iter_init(trans, iter, -+ bp.btree_id, -+ bp.pos, -+ 0, -+ min(bp.level, r->level), -+ iter_flags); -+ k = bch2_btree_iter_peek_slot(iter); -+ if (bkey_err(k)) { -+ bch2_trans_iter_exit(trans, iter); -+ return k; -+ } -+ -+ if (bp.level == r->level + 1) -+ k = bkey_i_to_s_c(&r->key); -+ -+ if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) -+ return k; -+ -+ bch2_trans_iter_exit(trans, iter); -+ -+ if (unlikely(bch2_backpointers_no_use_write_buffer)) { -+ if (bp.level) { -+ struct btree *b; -+ -+ /* -+ * If a backpointer for a btree node wasn't found, it may be -+ * because it was overwritten by a new btree node that hasn't -+ * been written out yet - backpointer_get_node() checks for -+ * this: -+ */ -+ b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); -+ if (!IS_ERR_OR_NULL(b)) -+ return bkey_i_to_s_c(&b->key); -+ -+ bch2_trans_iter_exit(trans, iter); -+ -+ if (IS_ERR(b)) -+ return bkey_s_c_err(PTR_ERR(b)); -+ return bkey_s_c_null; -+ } -+ -+ backpointer_not_found(trans, bp_pos, bp, k, "extent"); -+ } -+ -+ return bkey_s_c_null; -+} -+ -+struct btree *bch2_backpointer_get_node(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos bp_pos, -+ struct bch_backpointer bp) -+{ -+ struct bch_fs *c = trans->c; -+ struct bpos bucket = bp_pos_to_bucket(c, bp_pos); -+ struct btree *b; -+ -+ BUG_ON(!bp.level); -+ -+ bch2_trans_node_iter_init(trans, iter, -+ bp.btree_id, -+ bp.pos, -+ 0, -+ bp.level - 1, -+ 0); -+ b = bch2_btree_iter_peek_node(iter); -+ if (IS_ERR(b)) -+ goto err; -+ -+ if (b && extent_matches_bp(c, bp.btree_id, bp.level, -+ bkey_i_to_s_c(&b->key), -+ bucket, bp)) -+ return b; -+ -+ if (b && btree_node_will_make_reachable(b)) { -+ b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); -+ } else { -+ backpointer_not_found(trans, bp_pos, bp, -+ bkey_i_to_s_c(&b->key), "btree node"); -+ b = NULL; -+ } -+err: -+ bch2_trans_iter_exit(trans, iter); -+ return b; -+} -+ -+static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter alloc_iter = { NULL }; -+ struct bch_dev *ca; -+ struct bkey_s_c alloc_k; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, -+ "backpointer for mising device:\n%s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ ret = bch2_btree_delete_at(trans, bp_iter, 0); -+ goto out; -+ } -+ -+ ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ -+ alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, -+ bp_pos_to_bucket(c, k.k->p), 0); -+ ret = bkey_err(alloc_k); -+ if (ret) -+ goto out; -+ -+ if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, -+ 
"backpointer for nonexistent alloc key: %llu:%llu:0\n%s", -+ alloc_iter.pos.inode, alloc_iter.pos.offset, -+ (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { -+ ret = bch2_btree_delete_at(trans, bp_iter, 0); -+ goto out; -+ } -+out: -+fsck_err: -+ bch2_trans_iter_exit(trans, &alloc_iter); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+/* verify that every backpointer has a corresponding alloc key */ -+int bch2_check_btree_backpointers(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ ret = bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_backpointers, POS_MIN, 0, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ bch2_check_btree_backpointer(&trans, &iter, k))); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+struct bpos_level { -+ unsigned level; -+ struct bpos pos; -+}; -+ -+static int check_bp_exists(struct btree_trans *trans, -+ struct bpos bucket, -+ struct bch_backpointer bp, -+ struct bkey_s_c orig_k, -+ struct bpos bucket_start, -+ struct bpos bucket_end, -+ struct bpos_level *last_flushed) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter bp_iter = { NULL }; -+ struct printbuf buf = PRINTBUF; -+ struct bkey_s_c bp_k; -+ int ret; -+ -+ if (bpos_lt(bucket, bucket_start) || -+ bpos_gt(bucket, bucket_end)) -+ return 0; -+ -+ if (!bch2_dev_bucket_exists(c, bucket)) -+ goto missing; -+ -+ bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, -+ bucket_pos_to_bp(c, bucket, bp.bucket_offset), -+ 0); -+ ret = bkey_err(bp_k); -+ if (ret) -+ goto err; -+ -+ if (bp_k.k->type != KEY_TYPE_backpointer || -+ memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { -+ if (last_flushed->level != bp.level || -+ !bpos_eq(last_flushed->pos, orig_k.k->p)) { -+ last_flushed->level = bp.level; -+ last_flushed->pos = orig_k.k->p; -+ -+ ret = bch2_btree_write_buffer_flush_sync(trans) ?: -+ -BCH_ERR_transaction_restart_write_buffer_flush; -+ goto out; -+ } -+ goto missing; -+ } -+out: -+err: -+fsck_err: -+ bch2_trans_iter_exit(trans, &bp_iter); -+ printbuf_exit(&buf); -+ return ret; -+missing: -+ prt_printf(&buf, "missing backpointer for btree=%s l=%u ", -+ bch2_btree_ids[bp.btree_id], bp.level); -+ bch2_bkey_val_to_text(&buf, c, orig_k); -+ prt_printf(&buf, "\nbp pos "); -+ bch2_bpos_to_text(&buf, bp_iter.pos); -+ -+ if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers || -+ c->opts.reconstruct_alloc || -+ fsck_err(c, "%s", buf.buf)) -+ ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); -+ -+ goto out; -+} -+ -+static int check_extent_to_backpointers(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos bucket_start, -+ struct bpos bucket_end, -+ struct bpos_level *last_flushed) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_ptrs_c ptrs; -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bkey_s_c k; -+ int ret; -+ -+ k = bch2_btree_iter_peek_all_levels(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ if (!k.k) -+ return 0; -+ -+ ptrs = bch2_bkey_ptrs_c(k); -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ struct bpos bucket_pos; -+ struct bch_backpointer bp; -+ -+ if (p.ptr.cached) -+ continue; -+ -+ bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, -+ k, p, &bucket_pos, &bp); -+ -+ ret = check_bp_exists(trans, bucket_pos, bp, k, -+ bucket_start, bucket_end, -+ last_flushed); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int 
check_btree_root_to_backpointers(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos bucket_start, -+ struct bpos bucket_end, -+ struct bpos_level *last_flushed) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_root *r = bch2_btree_id_root(c, btree_id); -+ struct btree_iter iter; -+ struct btree *b; -+ struct bkey_s_c k; -+ struct bkey_ptrs_c ptrs; -+ struct extent_ptr_decoded p; -+ const union bch_extent_entry *entry; -+ int ret; -+ -+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0); -+ b = bch2_btree_iter_peek_node(&iter); -+ ret = PTR_ERR_OR_ZERO(b); -+ if (ret) -+ goto err; -+ -+ BUG_ON(b != btree_node_root(c, b)); -+ -+ k = bkey_i_to_s_c(&b->key); -+ ptrs = bch2_bkey_ptrs_c(k); -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ struct bpos bucket_pos; -+ struct bch_backpointer bp; -+ -+ if (p.ptr.cached) -+ continue; -+ -+ bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1, -+ k, p, &bucket_pos, &bp); -+ -+ ret = check_bp_exists(trans, bucket_pos, bp, k, -+ bucket_start, bucket_end, -+ last_flushed); -+ if (ret) -+ goto err; -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) -+{ -+ return (struct bbpos) { -+ .btree = bp.btree_id, -+ .pos = bp.pos, -+ }; -+} -+ -+static size_t btree_nodes_fit_in_ram(struct bch_fs *c) -+{ -+ struct sysinfo i; -+ u64 mem_bytes; -+ -+ si_meminfo(&i); -+ mem_bytes = i.totalram * i.mem_unit; -+ return div_u64(mem_bytes >> 1, btree_bytes(c)); -+} -+ -+static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, -+ unsigned btree_leaf_mask, -+ unsigned btree_interior_mask, -+ struct bbpos start, struct bbpos *end) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); -+ enum btree_id btree; -+ int ret = 0; -+ -+ for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { -+ unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; -+ -+ if (!((1U << btree) & btree_leaf_mask) && -+ !((1U << btree) & btree_interior_mask)) -+ continue; -+ -+ bch2_trans_node_iter_init(trans, &iter, btree, -+ btree == start.btree ? start.pos : POS_MIN, -+ 0, depth, 0); -+ /* -+ * for_each_btree_key_contineu() doesn't check the return value -+ * from bch2_btree_iter_advance(), which is needed when -+ * iterating over interior nodes where we'll see keys at -+ * SPOS_MAX: -+ */ -+ do { -+ k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); -+ ret = bkey_err(k); -+ if (!k.k || ret) -+ break; -+ -+ --btree_nodes; -+ if (!btree_nodes) { -+ *end = BBPOS(btree, k.k->p); -+ bch2_trans_iter_exit(trans, &iter); -+ return 0; -+ } -+ } while (bch2_btree_iter_advance(&iter)); -+ bch2_trans_iter_exit(trans, &iter); -+ } -+ -+ *end = BBPOS_MAX; -+ return ret; -+} -+ -+static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, -+ struct bpos bucket_start, -+ struct bpos bucket_end) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ enum btree_id btree_id; -+ struct bpos_level last_flushed = { UINT_MAX }; -+ int ret = 0; -+ -+ for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { -+ unsigned depth = btree_type_has_ptrs(btree_id) ? 
0 : 1; -+ -+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, -+ depth, -+ BTREE_ITER_ALL_LEVELS| -+ BTREE_ITER_PREFETCH); -+ -+ do { -+ ret = commit_do(trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_extent_to_backpointers(trans, &iter, -+ bucket_start, bucket_end, -+ &last_flushed)); -+ if (ret) -+ break; -+ } while (!bch2_btree_iter_advance(&iter)); -+ -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (ret) -+ break; -+ -+ ret = commit_do(trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ check_btree_root_to_backpointers(trans, btree_id, -+ bucket_start, bucket_end, -+ &last_flushed)); -+ if (ret) -+ break; -+ } -+ return ret; -+} -+ -+static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, -+ struct bpos bucket) -+{ -+ return bch2_dev_exists2(c, bucket.inode) -+ ? bucket_pos_to_bp(c, bucket, 0) -+ : bucket; -+} -+ -+static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, -+ struct bpos start, struct bpos *end) -+{ -+ struct btree_iter alloc_iter; -+ struct btree_iter bp_iter; -+ struct bkey_s_c alloc_k, bp_k; -+ size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); -+ bool alloc_end = false, bp_end = false; -+ int ret = 0; -+ -+ bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, -+ start, 0, 1, 0); -+ bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, -+ bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); -+ while (1) { -+ alloc_k = !alloc_end -+ ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) -+ : bkey_s_c_null; -+ bp_k = !bp_end -+ ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) -+ : bkey_s_c_null; -+ -+ ret = bkey_err(alloc_k) ?: bkey_err(bp_k); -+ if ((!alloc_k.k && !bp_k.k) || ret) { -+ *end = SPOS_MAX; -+ break; -+ } -+ -+ --btree_nodes; -+ if (!btree_nodes) { -+ *end = alloc_k.k->p; -+ break; -+ } -+ -+ if (bpos_lt(alloc_iter.pos, SPOS_MAX) && -+ bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { -+ if (!bch2_btree_iter_advance(&alloc_iter)) -+ alloc_end = true; -+ } else { -+ if (!bch2_btree_iter_advance(&bp_iter)) -+ bp_end = true; -+ } -+ } -+ bch2_trans_iter_exit(trans, &bp_iter); -+ bch2_trans_iter_exit(trans, &alloc_iter); -+ return ret; -+} -+ -+int bch2_check_extents_to_backpointers(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct bpos start = POS_MIN, end; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ while (1) { -+ ret = bch2_get_alloc_in_memory_pos(&trans, start, &end); -+ if (ret) -+ break; -+ -+ if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX)) -+ bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", -+ __func__, btree_nodes_fit_in_ram(c)); -+ -+ if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) { -+ struct printbuf buf = PRINTBUF; -+ -+ prt_str(&buf, "check_extents_to_backpointers(): "); -+ bch2_bpos_to_text(&buf, start); -+ prt_str(&buf, "-"); -+ bch2_bpos_to_text(&buf, end); -+ -+ bch_verbose(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } -+ -+ ret = bch2_check_extents_to_backpointers_pass(&trans, start, end); -+ if (ret || bpos_eq(end, SPOS_MAX)) -+ break; -+ -+ start = bpos_successor(end); -+ } -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int check_one_backpointer(struct btree_trans *trans, -+ struct bbpos start, -+ struct bbpos end, -+ struct bkey_s_c_backpointer bp, -+ struct bpos *last_flushed_pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bbpos 
pos = bp_to_bbpos(*bp.v); -+ struct bkey_s_c k; -+ struct printbuf buf = PRINTBUF; -+ int ret; -+ -+ if (bbpos_cmp(pos, start) < 0 || -+ bbpos_cmp(pos, end) > 0) -+ return 0; -+ -+ k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); -+ ret = bkey_err(k); -+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) -+ return 0; -+ if (ret) -+ return ret; -+ -+ if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) { -+ *last_flushed_pos = bp.k->p; -+ ret = bch2_btree_write_buffer_flush_sync(trans) ?: -+ -BCH_ERR_transaction_restart_write_buffer_flush; -+ goto out; -+ } -+ -+ if (fsck_err_on(!k.k, c, -+ "backpointer for missing extent\n %s", -+ (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { -+ ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); -+ goto out; -+ } -+out: -+fsck_err: -+ bch2_trans_iter_exit(trans, &iter); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, -+ struct bbpos start, -+ struct bbpos end) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bpos last_flushed_pos = SPOS_MAX; -+ -+ return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, -+ POS_MIN, BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_one_backpointer(trans, start, end, -+ bkey_s_c_to_backpointer(k), -+ &last_flushed_pos)); -+} -+ -+int bch2_check_backpointers_to_extents(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ while (1) { -+ ret = bch2_get_btree_in_memory_pos(&trans, -+ (1U << BTREE_ID_extents)| -+ (1U << BTREE_ID_reflink), -+ ~0, -+ start, &end); -+ if (ret) -+ break; -+ -+ if (!bbpos_cmp(start, BBPOS_MIN) && -+ bbpos_cmp(end, BBPOS_MAX)) -+ bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", -+ __func__, btree_nodes_fit_in_ram(c)); -+ -+ if (bbpos_cmp(start, BBPOS_MIN) || -+ bbpos_cmp(end, BBPOS_MAX)) { -+ struct printbuf buf = PRINTBUF; -+ -+ prt_str(&buf, "check_backpointers_to_extents(): "); -+ bch2_bbpos_to_text(&buf, start); -+ prt_str(&buf, "-"); -+ bch2_bbpos_to_text(&buf, end); -+ -+ bch_verbose(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ } -+ -+ ret = bch2_check_backpointers_to_extents_pass(&trans, start, end); -+ if (ret || !bbpos_cmp(end, BBPOS_MAX)) -+ break; -+ -+ start = bbpos_successor(end); -+ } -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h -new file mode 100644 -index 000000000..547e06176 ---- /dev/null -+++ b/fs/bcachefs/backpointers.h -@@ -0,0 +1,131 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H -+#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H -+ -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "super.h" -+ -+int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); -+void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+void bch2_backpointer_swab(struct bkey_s); -+ -+#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ -+ .key_invalid = bch2_backpointer_invalid, \ -+ .val_to_text = bch2_backpointer_k_to_text, \ -+ .swab = bch2_backpointer_swab, \ -+ 
.min_val_size = 32, \ -+}) -+ -+#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 -+ -+/* -+ * Convert from pos in backpointer btree to pos of corresponding bucket in alloc -+ * btree: -+ */ -+static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, -+ struct bpos bp_pos) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); -+ u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; -+ -+ return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); -+} -+ -+/* -+ * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: -+ */ -+static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, -+ struct bpos bucket, -+ u64 bucket_offset) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); -+ struct bpos ret; -+ -+ ret = POS(bucket.inode, -+ (bucket_to_sector(ca, bucket.offset) << -+ MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); -+ -+ EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); -+ -+ return ret; -+} -+ -+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *, -+ struct bch_backpointer, struct bkey_s_c, bool); -+ -+static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, -+ struct bpos bucket, -+ struct bch_backpointer bp, -+ struct bkey_s_c orig_k, -+ bool insert) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i_backpointer *bp_k; -+ int ret; -+ -+ bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); -+ ret = PTR_ERR_OR_ZERO(bp_k); -+ if (ret) -+ return ret; -+ -+ bkey_backpointer_init(&bp_k->k_i); -+ bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); -+ bp_k->v = bp; -+ -+ if (!insert) { -+ bp_k->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&bp_k->k, 0); -+ } -+ -+ if (unlikely(bch2_backpointers_no_use_write_buffer)) -+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert); -+ -+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); -+} -+ -+static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, struct extent_ptr_decoded p) -+{ -+ return level ? BCH_DATA_btree : -+ p.has_ec ? BCH_DATA_stripe : -+ BCH_DATA_user; -+} -+ -+static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, struct extent_ptr_decoded p, -+ struct bpos *bucket_pos, struct bch_backpointer *bp) -+{ -+ enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); -+ s64 sectors = level ? 
btree_sectors(c) : k.k->size; -+ u32 bucket_offset; -+ -+ *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); -+ *bp = (struct bch_backpointer) { -+ .btree_id = btree_id, -+ .level = level, -+ .data_type = data_type, -+ .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + -+ p.crc.offset, -+ .bucket_len = ptr_disk_sectors(sectors, p), -+ .pos = k.k->p, -+ }; -+} -+ -+int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, -+ struct bpos *, struct bch_backpointer *, unsigned); -+struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, -+ struct bpos, struct bch_backpointer, -+ unsigned); -+struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, -+ struct bpos, struct bch_backpointer); -+ -+int bch2_check_btree_backpointers(struct bch_fs *); -+int bch2_check_extents_to_backpointers(struct bch_fs *); -+int bch2_check_backpointers_to_extents(struct bch_fs *); -+ -+#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ -diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h -new file mode 100644 -index 000000000..1fbed1f83 ---- /dev/null -+++ b/fs/bcachefs/bbpos.h -@@ -0,0 +1,48 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BBPOS_H -+#define _BCACHEFS_BBPOS_H -+ -+#include "bkey_methods.h" -+ -+struct bbpos { -+ enum btree_id btree; -+ struct bpos pos; -+}; -+ -+static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) -+{ -+ return (struct bbpos) { btree, pos }; -+} -+ -+#define BBPOS_MIN BBPOS(0, POS_MIN) -+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) -+ -+static inline int bbpos_cmp(struct bbpos l, struct bbpos r) -+{ -+ return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos); -+} -+ -+static inline struct bbpos bbpos_successor(struct bbpos pos) -+{ -+ if (bpos_cmp(pos.pos, SPOS_MAX)) { -+ pos.pos = bpos_successor(pos.pos); -+ return pos; -+ } -+ -+ if (pos.btree != BTREE_ID_NR) { -+ pos.btree++; -+ pos.pos = POS_MIN; -+ return pos; -+ } -+ -+ BUG(); -+} -+ -+static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) -+{ -+ prt_str(out, bch2_btree_ids[pos.btree]); -+ prt_char(out, ':'); -+ bch2_bpos_to_text(out, pos.pos); -+} -+ -+#endif /* _BCACHEFS_BBPOS_H */ -diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h -new file mode 100644 -index 000000000..30b3d7b9f ---- /dev/null -+++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,1146 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_H -+#define _BCACHEFS_H -+ -+/* -+ * SOME HIGH LEVEL CODE DOCUMENTATION: -+ * -+ * Bcache mostly works with cache sets, cache devices, and backing devices. -+ * -+ * Support for multiple cache devices hasn't quite been finished off yet, but -+ * it's about 95% plumbed through. A cache set and its cache devices is sort of -+ * like a md raid array and its component devices. Most of the code doesn't care -+ * about individual cache devices, the main abstraction is the cache set. -+ * -+ * Multiple cache devices is intended to give us the ability to mirror dirty -+ * cached data and metadata, without mirroring clean cached data. -+ * -+ * Backing devices are different, in that they have a lifetime independent of a -+ * cache set. When you register a newly formatted backing device it'll come up -+ * in passthrough mode, and then you can attach and detach a backing device from -+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly -+ * invalidates any cached data for that backing device. 
-+ * -+ * A cache set can have multiple (many) backing devices attached to it. -+ * -+ * There's also flash only volumes - this is the reason for the distinction -+ * between struct cached_dev and struct bcache_device. A flash only volume -+ * works much like a bcache device that has a backing device, except the -+ * "cached" data is always dirty. The end result is that we get thin -+ * provisioning with very little additional code. -+ * -+ * Flash only volumes work but they're not production ready because the moving -+ * garbage collector needs more work. More on that later. -+ * -+ * BUCKETS/ALLOCATION: -+ * -+ * Bcache is primarily designed for caching, which means that in normal -+ * operation all of our available space will be allocated. Thus, we need an -+ * efficient way of deleting things from the cache so we can write new things to -+ * it. -+ * -+ * To do this, we first divide the cache device up into buckets. A bucket is the -+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ -+ * works efficiently. -+ * -+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with -+ * it. The gens and priorities for all the buckets are stored contiguously and -+ * packed on disk (in a linked list of buckets - aside from the superblock, all -+ * of bcache's metadata is stored in buckets). -+ * -+ * The priority is used to implement an LRU. We reset a bucket's priority when -+ * we allocate it or on cache it, and every so often we decrement the priority -+ * of each bucket. It could be used to implement something more sophisticated, -+ * if anyone ever gets around to it. -+ * -+ * The generation is used for invalidating buckets. Each pointer also has an 8 -+ * bit generation embedded in it; for a pointer to be considered valid, its gen -+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all -+ * we have to do is increment its gen (and write its new gen to disk; we batch -+ * this up). -+ * -+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that -+ * contain metadata (including btree nodes). -+ * -+ * THE BTREE: -+ * -+ * Bcache is in large part design around the btree. -+ * -+ * At a high level, the btree is just an index of key -> ptr tuples. -+ * -+ * Keys represent extents, and thus have a size field. Keys also have a variable -+ * number of pointers attached to them (potentially zero, which is handy for -+ * invalidating the cache). -+ * -+ * The key itself is an inode:offset pair. The inode number corresponds to a -+ * backing device or a flash only volume. The offset is the ending offset of the -+ * extent within the inode - not the starting offset; this makes lookups -+ * slightly more convenient. -+ * -+ * Pointers contain the cache device id, the offset on that device, and an 8 bit -+ * generation number. More on the gen later. -+ * -+ * Index lookups are not fully abstracted - cache lookups in particular are -+ * still somewhat mixed in with the btree code, but things are headed in that -+ * direction. -+ * -+ * Updates are fairly well abstracted, though. There are two different ways of -+ * updating the btree; insert and replace. -+ * -+ * BTREE_INSERT will just take a list of keys and insert them into the btree - -+ * overwriting (possibly only partially) any extents they overlap with. This is -+ * used to update the index after a write. -+ * -+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is -+ * overwriting a key that matches another given key. 
This is used for inserting -+ * data into the cache after a cache miss, and for background writeback, and for -+ * the moving garbage collector. -+ * -+ * There is no "delete" operation; deleting things from the index is -+ * accomplished by either by invalidating pointers (by incrementing a bucket's -+ * gen) or by inserting a key with 0 pointers - which will overwrite anything -+ * previously present at that location in the index. -+ * -+ * This means that there are always stale/invalid keys in the btree. They're -+ * filtered out by the code that iterates through a btree node, and removed when -+ * a btree node is rewritten. -+ * -+ * BTREE NODES: -+ * -+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and -+ * free smaller than a bucket - so, that's how big our btree nodes are. -+ * -+ * (If buckets are really big we'll only use part of the bucket for a btree node -+ * - no less than 1/4th - but a bucket still contains no more than a single -+ * btree node. I'd actually like to change this, but for now we rely on the -+ * bucket's gen for deleting btree nodes when we rewrite/split a node.) -+ * -+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook -+ * btree implementation. -+ * -+ * The way this is solved is that btree nodes are internally log structured; we -+ * can append new keys to an existing btree node without rewriting it. This -+ * means each set of keys we write is sorted, but the node is not. -+ * -+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would -+ * be expensive, and we have to distinguish between the keys we have written and -+ * the keys we haven't. So to do a lookup in a btree node, we have to search -+ * each sorted set. But we do merge written sets together lazily, so the cost of -+ * these extra searches is quite low (normally most of the keys in a btree node -+ * will be in one big set, and then there'll be one or two sets that are much -+ * smaller). -+ * -+ * This log structure makes bcache's btree more of a hybrid between a -+ * conventional btree and a compacting data structure, with some of the -+ * advantages of both. -+ * -+ * GARBAGE COLLECTION: -+ * -+ * We can't just invalidate any bucket - it might contain dirty data or -+ * metadata. If it once contained dirty data, other writes might overwrite it -+ * later, leaving no valid pointers into that bucket in the index. -+ * -+ * Thus, the primary purpose of garbage collection is to find buckets to reuse. -+ * It also counts how much valid data it each bucket currently contains, so that -+ * allocation can reuse buckets sooner when they've been mostly overwritten. -+ * -+ * It also does some things that are really internal to the btree -+ * implementation. If a btree node contains pointers that are stale by more than -+ * some threshold, it rewrites the btree node to avoid the bucket's generation -+ * wrapping around. It also merges adjacent btree nodes if they're empty enough. -+ * -+ * THE JOURNAL: -+ * -+ * Bcache's journal is not necessary for consistency; we always strictly -+ * order metadata writes so that the btree and everything else is consistent on -+ * disk in the event of an unclean shutdown, and in fact bcache had writeback -+ * caching (with recovery from unclean shutdown) before journalling was -+ * implemented. 
-+ * -+ * Rather, the journal is purely a performance optimization; we can't complete a -+ * write until we've updated the index on disk, otherwise the cache would be -+ * inconsistent in the event of an unclean shutdown. This means that without the -+ * journal, on random write workloads we constantly have to update all the leaf -+ * nodes in the btree, and those writes will be mostly empty (appending at most -+ * a few keys each) - highly inefficient in terms of amount of metadata writes, -+ * and it puts more strain on the various btree resorting/compacting code. -+ * -+ * The journal is just a log of keys we've inserted; on startup we just reinsert -+ * all the keys in the open journal entries. That means that when we're updating -+ * a node in the btree, we can wait until a 4k block of keys fills up before -+ * writing them out. -+ * -+ * For simplicity, we only journal updates to leaf nodes; updates to parent -+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth -+ * the complexity to deal with journalling them (in particular, journal replay) -+ * - updates to non leaf nodes just happen synchronously (see btree_split()). -+ */ -+ -+#undef pr_fmt -+#ifdef __KERNEL__ -+#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ -+#else -+#define pr_fmt(fmt) "%s() " fmt "\n", __func__ -+#endif -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "bcachefs_format.h" -+#include "errcode.h" -+#include "fifo.h" -+#include "nocow_locking_types.h" -+#include "opts.h" -+#include "recovery_types.h" -+#include "seqmutex.h" -+#include "util.h" -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+#define BCH_WRITE_REF_DEBUG -+#endif -+ -+#ifndef dynamic_fault -+#define dynamic_fault(...) 0 -+#endif -+ -+#define race_fault(...) dynamic_fault("bcachefs:race") -+ -+#define trace_and_count(_c, _name, ...) \ -+do { \ -+ this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \ -+ trace_##_name(__VA_ARGS__); \ -+} while (0) -+ -+#define bch2_fs_init_fault(name) \ -+ dynamic_fault("bcachefs:bch_fs_init:" name) -+#define bch2_meta_read_fault(name) \ -+ dynamic_fault("bcachefs:meta:read:" name) -+#define bch2_meta_write_fault(name) \ -+ dynamic_fault("bcachefs:meta:write:" name) -+ -+#ifdef __KERNEL__ -+#define BCACHEFS_LOG_PREFIX -+#endif -+ -+#ifdef BCACHEFS_LOG_PREFIX -+ -+#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) -+#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name) -+#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset) -+#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) -+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ -+ "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset) -+ -+#else -+ -+#define bch2_log_msg(_c, fmt) fmt -+#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name) -+#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset) -+#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) -+#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ -+ "inum %llu offset %llu: " fmt "\n", (_inum), (_offset) -+ -+#endif -+ -+#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") -+ -+#define bch_info(c, fmt, ...) 
\ -+ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_notice(c, fmt, ...) \ -+ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_warn(c, fmt, ...) \ -+ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_warn_ratelimited(c, fmt, ...) \ -+ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -+ -+#define bch_err(c, fmt, ...) \ -+ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_err_dev(ca, fmt, ...) \ -+ printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) -+#define bch_err_dev_offset(ca, _offset, fmt, ...) \ -+ printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) -+#define bch_err_inum(c, _inum, fmt, ...) \ -+ printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) -+#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ -+ printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) -+ -+#define bch_err_ratelimited(c, fmt, ...) \ -+ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_err_dev_ratelimited(ca, fmt, ...) \ -+ printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) -+#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ -+ printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) -+#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ -+ printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) -+#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ -+ printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) -+ -+#define bch_err_fn(_c, _ret) \ -+ bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret)) -+#define bch_err_msg(_c, _ret, _msg, ...) \ -+ bch_err(_c, "%s(): error " _msg " %s", __func__, ##__VA_ARGS__, bch2_err_str(_ret)) -+ -+#define bch_verbose(c, fmt, ...) \ -+do { \ -+ if ((c)->opts.verbose) \ -+ bch_info(c, fmt, ##__VA_ARGS__); \ -+} while (0) -+ -+#define pr_verbose_init(opts, fmt, ...) 
\ -+do { \ -+ if (opt_get(opts, verbose)) \ -+ pr_info(fmt, ##__VA_ARGS__); \ -+} while (0) -+ -+/* Parameters that are useful for debugging, but should always be compiled in: */ -+#define BCH_DEBUG_PARAMS_ALWAYS() \ -+ BCH_DEBUG_PARAM(key_merging_disabled, \ -+ "Disables merging of extents") \ -+ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ -+ "Causes mark and sweep to compact and rewrite every " \ -+ "btree node it traverses") \ -+ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ -+ "Disables rewriting of btree nodes during mark and sweep")\ -+ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ -+ "Disables the shrinker callback for the btree node cache")\ -+ BCH_DEBUG_PARAM(verify_btree_ondisk, \ -+ "Reread btree nodes at various points to verify the " \ -+ "mergesort in the read path against modifications " \ -+ "done in memory") \ -+ BCH_DEBUG_PARAM(verify_all_btree_replicas, \ -+ "When reading btree nodes, read all replicas and " \ -+ "compare them") \ -+ BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ -+ "Don't use the write buffer for backpointers, enabling "\ -+ "extra runtime checks") -+ -+/* Parameters that should only be compiled in debug mode: */ -+#define BCH_DEBUG_PARAMS_DEBUG() \ -+ BCH_DEBUG_PARAM(expensive_debug_checks, \ -+ "Enables various runtime debugging checks that " \ -+ "significantly affect performance") \ -+ BCH_DEBUG_PARAM(debug_check_iterators, \ -+ "Enables extra verification for btree iterators") \ -+ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ -+ "Verify btree accounting for keys within a node") \ -+ BCH_DEBUG_PARAM(journal_seq_verify, \ -+ "Store the journal sequence number in the version " \ -+ "number of every btree key, and verify that btree " \ -+ "update ordering is preserved during recovery") \ -+ BCH_DEBUG_PARAM(inject_invalid_keys, \ -+ "Store the journal sequence number in the version " \ -+ "number of every btree key, and verify that btree " \ -+ "update ordering is preserved during recovery") \ -+ BCH_DEBUG_PARAM(test_alloc_startup, \ -+ "Force allocator startup to use the slowpath where it" \ -+ "can't find enough free buckets without invalidating" \ -+ "cached data") \ -+ BCH_DEBUG_PARAM(force_reconstruct_read, \ -+ "Force reads to use the reconstruct path, when reading" \ -+ "from erasure coded extents") \ -+ BCH_DEBUG_PARAM(test_restart_gc, \ -+ "Test restarting mark and sweep gc when bucket gens change") -+ -+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() -+#else -+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() -+#endif -+ -+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -+BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+#ifndef CONFIG_BCACHEFS_DEBUG -+#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; -+BCH_DEBUG_PARAMS_DEBUG() -+#undef BCH_DEBUG_PARAM -+#endif -+ -+#define BCH_TIME_STATS() \ -+ x(btree_node_mem_alloc) \ -+ x(btree_node_split) \ -+ x(btree_node_compact) \ -+ x(btree_node_merge) \ -+ x(btree_node_sort) \ -+ x(btree_node_read) \ -+ x(btree_interior_update_foreground) \ -+ x(btree_interior_update_total) \ -+ x(btree_gc) \ -+ x(data_write) \ -+ x(data_read) \ -+ x(data_promote) \ -+ x(journal_flush_write) \ -+ x(journal_noflush_write) \ -+ x(journal_flush_seq) \ -+ x(blocked_journal) \ -+ x(blocked_allocate) \ -+ x(blocked_allocate_open_bucket) \ -+ x(nocow_lock_contended) -+ -+enum bch_time_stats { -+#define x(name) BCH_TIME_##name, -+ BCH_TIME_STATS() -+#undef x 
-+ BCH_TIME_STAT_NR -+}; -+ -+#include "alloc_types.h" -+#include "btree_types.h" -+#include "btree_write_buffer_types.h" -+#include "buckets_types.h" -+#include "buckets_waiting_for_journal_types.h" -+#include "clock_types.h" -+#include "ec_types.h" -+#include "journal_types.h" -+#include "keylist_types.h" -+#include "quota_types.h" -+#include "rebalance_types.h" -+#include "replicas_types.h" -+#include "subvolume_types.h" -+#include "super_types.h" -+ -+/* Number of nodes btree coalesce will try to coalesce at once */ -+#define GC_MERGE_NODES 4U -+ -+/* Maximum number of nodes we might need to allocate atomically: */ -+#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) -+ -+/* Size of the freelist we allocate btree nodes from: */ -+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) -+ -+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) -+ -+struct btree; -+ -+enum gc_phase { -+ GC_PHASE_NOT_RUNNING, -+ GC_PHASE_START, -+ GC_PHASE_SB, -+ -+ GC_PHASE_BTREE_stripes, -+ GC_PHASE_BTREE_extents, -+ GC_PHASE_BTREE_inodes, -+ GC_PHASE_BTREE_dirents, -+ GC_PHASE_BTREE_xattrs, -+ GC_PHASE_BTREE_alloc, -+ GC_PHASE_BTREE_quotas, -+ GC_PHASE_BTREE_reflink, -+ GC_PHASE_BTREE_subvolumes, -+ GC_PHASE_BTREE_snapshots, -+ GC_PHASE_BTREE_lru, -+ GC_PHASE_BTREE_freespace, -+ GC_PHASE_BTREE_need_discard, -+ GC_PHASE_BTREE_backpointers, -+ GC_PHASE_BTREE_bucket_gens, -+ GC_PHASE_BTREE_snapshot_trees, -+ GC_PHASE_BTREE_deleted_inodes, -+ -+ GC_PHASE_PENDING_DELETE, -+}; -+ -+struct gc_pos { -+ enum gc_phase phase; -+ struct bpos pos; -+ unsigned level; -+}; -+ -+struct reflink_gc { -+ u64 offset; -+ u32 size; -+ u32 refcount; -+}; -+ -+typedef GENRADIX(struct reflink_gc) reflink_gc_table; -+ -+struct io_count { -+ u64 sectors[2][BCH_DATA_NR]; -+}; -+ -+struct bch_dev { -+ struct kobject kobj; -+ struct percpu_ref ref; -+ struct completion ref_completion; -+ struct percpu_ref io_ref; -+ struct completion io_ref_completion; -+ -+ struct bch_fs *fs; -+ -+ u8 dev_idx; -+ /* -+ * Cached version of this device's member info from superblock -+ * Committed by bch2_write_super() -> bch_fs_mi_update() -+ */ -+ struct bch_member_cpu mi; -+ __uuid_t uuid; -+ char name[BDEVNAME_SIZE]; -+ -+ struct bch_sb_handle disk_sb; -+ struct bch_sb *sb_read_scratch; -+ int sb_write_error; -+ dev_t dev; -+ atomic_t flush_seq; -+ -+ struct bch_devs_mask self; -+ -+ /* biosets used in cloned bios for writing multiple replicas */ -+ struct bio_set replica_set; -+ -+ /* -+ * Buckets: -+ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and -+ * gc_lock, for device resize - holding any is sufficient for access: -+ * Or rcu_read_lock(), but only for ptr_stale(): -+ */ -+ struct bucket_array __rcu *buckets_gc; -+ struct bucket_gens __rcu *bucket_gens; -+ u8 *oldest_gen; -+ unsigned long *buckets_nouse; -+ struct rw_semaphore bucket_lock; -+ -+ struct bch_dev_usage *usage_base; -+ struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; -+ struct bch_dev_usage __percpu *usage_gc; -+ -+ /* Allocator: */ -+ u64 new_fs_bucket_idx; -+ u64 alloc_cursor; -+ -+ unsigned nr_open_buckets; -+ unsigned nr_btree_reserve; -+ -+ size_t inc_gen_needs_gc; -+ size_t inc_gen_really_needs_gc; -+ size_t buckets_waiting_on_journal; -+ -+ atomic64_t rebalance_work; -+ -+ struct journal_device journal; -+ u64 prev_journal_sector; -+ -+ struct work_struct io_error_work; -+ -+ /* The rest of this all shows up in sysfs */ -+ atomic64_t cur_latency[2]; -+ struct bch2_time_stats io_latency[2]; -+ -+#define 
CONGESTED_MAX 1024 -+ atomic_t congested; -+ u64 congested_last; -+ -+ struct io_count __percpu *io_done; -+}; -+ -+enum { -+ /* startup: */ -+ BCH_FS_STARTED, -+ BCH_FS_MAY_GO_RW, -+ BCH_FS_RW, -+ BCH_FS_WAS_RW, -+ -+ /* shutdown: */ -+ BCH_FS_STOPPING, -+ BCH_FS_EMERGENCY_RO, -+ BCH_FS_GOING_RO, -+ BCH_FS_WRITE_DISABLE_COMPLETE, -+ BCH_FS_CLEAN_SHUTDOWN, -+ -+ /* fsck passes: */ -+ BCH_FS_FSCK_DONE, -+ BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ -+ BCH_FS_NEED_ANOTHER_GC, -+ -+ BCH_FS_HAVE_DELETED_SNAPSHOTS, -+ -+ /* errors: */ -+ BCH_FS_ERROR, -+ BCH_FS_TOPOLOGY_ERROR, -+ BCH_FS_ERRORS_FIXED, -+ BCH_FS_ERRORS_NOT_FIXED, -+}; -+ -+struct btree_debug { -+ unsigned id; -+}; -+ -+#define BCH_TRANSACTIONS_NR 128 -+ -+struct btree_transaction_stats { -+ struct bch2_time_stats lock_hold_times; -+ struct mutex lock; -+ unsigned nr_max_paths; -+ unsigned wb_updates_size; -+ unsigned max_mem; -+ char *max_paths_text; -+}; -+ -+struct bch_fs_pcpu { -+ u64 sectors_available; -+}; -+ -+struct journal_seq_blacklist_table { -+ size_t nr; -+ struct journal_seq_blacklist_table_entry { -+ u64 start; -+ u64 end; -+ bool dirty; -+ } entries[0]; -+}; -+ -+struct journal_keys { -+ struct journal_key { -+ u64 journal_seq; -+ u32 journal_offset; -+ enum btree_id btree_id:8; -+ unsigned level:8; -+ bool allocated; -+ bool overwritten; -+ struct bkey_i *k; -+ } *d; -+ /* -+ * Gap buffer: instead of all the empty space in the array being at the -+ * end of the buffer - from @nr to @size - the empty space is at @gap. -+ * This means that sequential insertions are O(n) instead of O(n^2). -+ */ -+ size_t gap; -+ size_t nr; -+ size_t size; -+}; -+ -+struct btree_path_buf { -+ struct btree_path *path; -+}; -+ -+#define REPLICAS_DELTA_LIST_MAX (1U << 16) -+ -+#define BCACHEFS_ROOT_SUBVOL_INUM \ -+ ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) -+ -+#define BCH_WRITE_REFS() \ -+ x(trans) \ -+ x(write) \ -+ x(promote) \ -+ x(node_rewrite) \ -+ x(stripe_create) \ -+ x(stripe_delete) \ -+ x(reflink) \ -+ x(fallocate) \ -+ x(discard) \ -+ x(invalidate) \ -+ x(delete_dead_snapshots) \ -+ x(snapshot_delete_pagecache) \ -+ x(sysfs) -+ -+enum bch_write_ref { -+#define x(n) BCH_WRITE_REF_##n, -+ BCH_WRITE_REFS() -+#undef x -+ BCH_WRITE_REF_NR, -+}; -+ -+struct bch_fs { -+ struct closure cl; -+ -+ struct list_head list; -+ struct kobject kobj; -+ struct kobject counters_kobj; -+ struct kobject internal; -+ struct kobject opts_dir; -+ struct kobject time_stats; -+ unsigned long flags; -+ -+ int minor; -+ struct device *chardev; -+ struct super_block *vfs_sb; -+ dev_t dev; -+ char name[40]; -+ -+ /* ro/rw, add/remove/resize devices: */ -+ struct rw_semaphore state_lock; -+ -+ /* Counts outstanding writes, for clean transition to read-only */ -+#ifdef BCH_WRITE_REF_DEBUG -+ atomic_long_t writes[BCH_WRITE_REF_NR]; -+#else -+ struct percpu_ref writes; -+#endif -+ struct work_struct read_only_work; -+ -+ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; -+ -+ struct bch_replicas_cpu replicas; -+ struct bch_replicas_cpu replicas_gc; -+ struct mutex replicas_gc_lock; -+ mempool_t replicas_delta_pool; -+ -+ struct journal_entry_res btree_root_journal_res; -+ struct journal_entry_res replicas_journal_res; -+ struct journal_entry_res clock_journal_res; -+ struct journal_entry_res dev_usage_journal_res; -+ -+ struct bch_disk_groups_cpu __rcu *disk_groups; -+ -+ struct bch_opts opts; -+ -+ /* Updated by bch2_sb_update():*/ -+ struct { -+ __uuid_t uuid; -+ __uuid_t user_uuid; -+ -+ u16 version; -+ u16 
version_min; -+ u16 version_upgrade_complete; -+ -+ u8 nr_devices; -+ u8 clean; -+ -+ u8 encryption_type; -+ -+ u64 time_base_lo; -+ u32 time_base_hi; -+ unsigned time_units_per_sec; -+ unsigned nsec_per_time_unit; -+ u64 features; -+ u64 compat; -+ } sb; -+ -+ -+ struct bch_sb_handle disk_sb; -+ -+ unsigned short block_bits; /* ilog2(block_size) */ -+ -+ u16 btree_foreground_merge_threshold; -+ -+ struct closure sb_write; -+ struct mutex sb_lock; -+ -+ /* snapshot.c: */ -+ struct snapshot_table __rcu *snapshots; -+ size_t snapshot_table_size; -+ struct mutex snapshot_table_lock; -+ -+ struct work_struct snapshot_delete_work; -+ struct work_struct snapshot_wait_for_pagecache_and_delete_work; -+ snapshot_id_list snapshots_unlinked; -+ struct mutex snapshots_unlinked_lock; -+ -+ /* BTREE CACHE */ -+ struct bio_set btree_bio; -+ struct workqueue_struct *io_complete_wq; -+ -+ struct btree_root btree_roots_known[BTREE_ID_NR]; -+ DARRAY(struct btree_root) btree_roots_extra; -+ struct mutex btree_root_lock; -+ -+ struct btree_cache btree_cache; -+ -+ /* -+ * Cache of allocated btree nodes - if we allocate a btree node and -+ * don't use it, if we free it that space can't be reused until going -+ * _all_ the way through the allocator (which exposes us to a livelock -+ * when allocating btree reserves fail halfway through) - instead, we -+ * can stick them here: -+ */ -+ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; -+ unsigned btree_reserve_cache_nr; -+ struct mutex btree_reserve_cache_lock; -+ -+ mempool_t btree_interior_update_pool; -+ struct list_head btree_interior_update_list; -+ struct list_head btree_interior_updates_unwritten; -+ struct mutex btree_interior_update_lock; -+ struct closure_waitlist btree_interior_update_wait; -+ -+ struct workqueue_struct *btree_interior_update_worker; -+ struct work_struct btree_interior_update_work; -+ -+ struct list_head pending_node_rewrites; -+ struct mutex pending_node_rewrites_lock; -+ -+ /* btree_io.c: */ -+ spinlock_t btree_write_error_lock; -+ struct btree_write_stats { -+ atomic64_t nr; -+ atomic64_t bytes; -+ } btree_write_stats[BTREE_WRITE_TYPE_NR]; -+ -+ /* btree_iter.c: */ -+ struct seqmutex btree_trans_lock; -+ struct list_head btree_trans_list; -+ mempool_t btree_paths_pool; -+ mempool_t btree_trans_mem_pool; -+ struct btree_path_buf __percpu *btree_paths_bufs; -+ -+ struct srcu_struct btree_trans_barrier; -+ bool btree_trans_barrier_initialized; -+ -+ struct btree_key_cache btree_key_cache; -+ unsigned btree_key_cache_btrees; -+ -+ struct btree_write_buffer btree_write_buffer; -+ -+ struct workqueue_struct *btree_update_wq; -+ struct workqueue_struct *btree_io_complete_wq; -+ /* copygc needs its own workqueue for index updates.. */ -+ struct workqueue_struct *copygc_wq; -+ /* -+ * Use a dedicated wq for write ref holder tasks. Required to avoid -+ * dependency problems with other wq tasks that can block on ref -+ * draining, such as read-only transition. 
-+ */ -+ struct workqueue_struct *write_ref_wq; -+ -+ /* ALLOCATION */ -+ struct bch_devs_mask rw_devs[BCH_DATA_NR]; -+ -+ u64 capacity; /* sectors */ -+ -+ /* -+ * When capacity _decreases_ (due to a disk being removed), we -+ * increment capacity_gen - this invalidates outstanding reservations -+ * and forces them to be revalidated -+ */ -+ u32 capacity_gen; -+ unsigned bucket_size_max; -+ -+ atomic64_t sectors_available; -+ struct mutex sectors_available_lock; -+ -+ struct bch_fs_pcpu __percpu *pcpu; -+ -+ struct percpu_rw_semaphore mark_lock; -+ -+ seqcount_t usage_lock; -+ struct bch_fs_usage *usage_base; -+ struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; -+ struct bch_fs_usage __percpu *usage_gc; -+ u64 __percpu *online_reserved; -+ -+ /* single element mempool: */ -+ struct mutex usage_scratch_lock; -+ struct bch_fs_usage_online *usage_scratch; -+ -+ struct io_clock io_clock[2]; -+ -+ /* JOURNAL SEQ BLACKLIST */ -+ struct journal_seq_blacklist_table * -+ journal_seq_blacklist_table; -+ struct work_struct journal_seq_blacklist_gc_work; -+ -+ /* ALLOCATOR */ -+ spinlock_t freelist_lock; -+ struct closure_waitlist freelist_wait; -+ u64 blocked_allocate; -+ u64 blocked_allocate_open_bucket; -+ -+ open_bucket_idx_t open_buckets_freelist; -+ open_bucket_idx_t open_buckets_nr_free; -+ struct closure_waitlist open_buckets_wait; -+ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; -+ open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; -+ -+ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; -+ open_bucket_idx_t open_buckets_partial_nr; -+ -+ struct write_point btree_write_point; -+ struct write_point rebalance_write_point; -+ -+ struct write_point write_points[WRITE_POINT_MAX]; -+ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; -+ struct mutex write_points_hash_lock; -+ unsigned write_points_nr; -+ -+ struct buckets_waiting_for_journal buckets_waiting_for_journal; -+ struct work_struct discard_work; -+ struct work_struct invalidate_work; -+ -+ /* GARBAGE COLLECTION */ -+ struct task_struct *gc_thread; -+ atomic_t kick_gc; -+ unsigned long gc_count; -+ -+ enum btree_id gc_gens_btree; -+ struct bpos gc_gens_pos; -+ -+ /* -+ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] -+ * has been marked by GC. -+ * -+ * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) -+ * -+ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread -+ * can read without a lock. -+ */ -+ seqcount_t gc_pos_lock; -+ struct gc_pos gc_pos; -+ -+ /* -+ * The allocation code needs gc_mark in struct bucket to be correct, but -+ * it's not while a gc is in progress. 
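-+ *
-+ * Roughly speaking, "has this bucket/key been marked yet?" is just an
-+ * ordered comparison against gc_pos. A toy version of that comparison
-+ * (illustrative only, not the real helper) looks like:
-+ *
-+ *	struct toy_gc_pos { unsigned phase; u64 inode, offset; };
-+ *
-+ *	static int toy_gc_pos_cmp(struct toy_gc_pos l, struct toy_gc_pos r)
-+ *	{
-+ *		if (l.phase  != r.phase)	return l.phase  < r.phase  ? -1 : 1;
-+ *		if (l.inode  != r.inode)	return l.inode  < r.inode  ? -1 : 1;
-+ *		if (l.offset != r.offset)	return l.offset < r.offset ? -1 : 1;
-+ *		return 0;
-+ *	}
-+ *
-+ * i.e. phases are ordered, positions within a phase are ordered by key, and
-+ * anything that compares <= the current gc_pos has already been marked.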
-+ */ -+ struct rw_semaphore gc_lock; -+ struct mutex gc_gens_lock; -+ -+ /* IO PATH */ -+ struct semaphore io_in_flight; -+ struct bio_set bio_read; -+ struct bio_set bio_read_split; -+ struct bio_set bio_write; -+ struct mutex bio_bounce_pages_lock; -+ mempool_t bio_bounce_pages; -+ struct bucket_nocow_lock_table -+ nocow_locks; -+ struct rhashtable promote_table; -+ -+ mempool_t compression_bounce[2]; -+ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; -+ mempool_t decompress_workspace; -+ ZSTD_parameters zstd_params; -+ -+ struct crypto_shash *sha256; -+ struct crypto_sync_skcipher *chacha20; -+ struct crypto_shash *poly1305; -+ -+ atomic64_t key_version; -+ -+ mempool_t large_bkey_pool; -+ -+ /* MOVE.C */ -+ struct list_head moving_context_list; -+ struct mutex moving_context_lock; -+ -+ struct list_head data_progress_list; -+ struct mutex data_progress_lock; -+ -+ /* REBALANCE */ -+ struct bch_fs_rebalance rebalance; -+ -+ /* COPYGC */ -+ struct task_struct *copygc_thread; -+ struct write_point copygc_write_point; -+ s64 copygc_wait_at; -+ s64 copygc_wait; -+ bool copygc_running; -+ wait_queue_head_t copygc_running_wq; -+ -+ /* STRIPES: */ -+ GENRADIX(struct stripe) stripes; -+ GENRADIX(struct gc_stripe) gc_stripes; -+ -+ struct hlist_head ec_stripes_new[32]; -+ spinlock_t ec_stripes_new_lock; -+ -+ ec_stripes_heap ec_stripes_heap; -+ struct mutex ec_stripes_heap_lock; -+ -+ /* ERASURE CODING */ -+ struct list_head ec_stripe_head_list; -+ struct mutex ec_stripe_head_lock; -+ -+ struct list_head ec_stripe_new_list; -+ struct mutex ec_stripe_new_lock; -+ wait_queue_head_t ec_stripe_new_wait; -+ -+ struct work_struct ec_stripe_create_work; -+ u64 ec_stripe_hint; -+ -+ struct work_struct ec_stripe_delete_work; -+ -+ struct bio_set ec_bioset; -+ -+ /* REFLINK */ -+ reflink_gc_table reflink_gc_table; -+ size_t reflink_gc_nr; -+ -+ /* fs.c */ -+ struct list_head vfs_inodes_list; -+ struct mutex vfs_inodes_lock; -+ -+ /* VFS IO PATH - fs-io.c */ -+ struct bio_set writepage_bioset; -+ struct bio_set dio_write_bioset; -+ struct bio_set dio_read_bioset; -+ struct bio_set nocow_flush_bioset; -+ -+ /* ERRORS */ -+ struct list_head fsck_errors; -+ struct mutex fsck_error_lock; -+ bool fsck_alloc_err; -+ -+ /* QUOTAS */ -+ struct bch_memquota_type quotas[QTYP_NR]; -+ -+ /* RECOVERY */ -+ u64 journal_replay_seq_start; -+ u64 journal_replay_seq_end; -+ enum bch_recovery_pass curr_recovery_pass; -+ /* bitmap of explicitly enabled recovery passes: */ -+ u64 recovery_passes_explicit; -+ u64 recovery_passes_complete; -+ -+ /* DEBUG JUNK */ -+ struct dentry *fs_debug_dir; -+ struct dentry *btree_debug_dir; -+ struct btree_debug btree_debug[BTREE_ID_NR]; -+ struct btree *verify_data; -+ struct btree_node *verify_ondisk; -+ struct mutex verify_lock; -+ -+ u64 *unused_inode_hints; -+ unsigned inode_shard_bits; -+ -+ /* -+ * A btree node on disk could have too many bsets for an iterator to fit -+ * on the stack - have to dynamically allocate them -+ */ -+ mempool_t fill_iter; -+ -+ mempool_t btree_bounce_pool; -+ -+ struct journal journal; -+ GENRADIX(struct journal_replay *) journal_entries; -+ u64 journal_entries_base_seq; -+ struct journal_keys journal_keys; -+ struct list_head journal_iters; -+ -+ u64 last_bucket_seq_cleanup; -+ -+ u64 counters_on_mount[BCH_COUNTER_NR]; -+ u64 __percpu *counters; -+ -+ unsigned btree_gc_periodic:1; -+ unsigned copy_gc_enabled:1; -+ bool promote_whole_extents; -+ -+ struct bch2_time_stats times[BCH_TIME_STAT_NR]; -+ -+ struct btree_transaction_stats 
btree_transaction_stats[BCH_TRANSACTIONS_NR]; -+}; -+ -+extern struct wait_queue_head bch2_read_only_wait; -+ -+static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) -+{ -+#ifdef BCH_WRITE_REF_DEBUG -+ atomic_long_inc(&c->writes[ref]); -+#else -+ percpu_ref_get(&c->writes); -+#endif -+} -+ -+static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) -+{ -+#ifdef BCH_WRITE_REF_DEBUG -+ return !test_bit(BCH_FS_GOING_RO, &c->flags) && -+ atomic_long_inc_not_zero(&c->writes[ref]); -+#else -+ return percpu_ref_tryget_live(&c->writes); -+#endif -+} -+ -+static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) -+{ -+#ifdef BCH_WRITE_REF_DEBUG -+ long v = atomic_long_dec_return(&c->writes[ref]); -+ -+ BUG_ON(v < 0); -+ if (v) -+ return; -+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) -+ if (atomic_long_read(&c->writes[i])) -+ return; -+ -+ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); -+ wake_up(&bch2_read_only_wait); -+#else -+ percpu_ref_put(&c->writes); -+#endif -+} -+ -+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) -+{ -+#ifndef NO_BCACHEFS_FS -+ if (c->vfs_sb) -+ c->vfs_sb->s_bdi->ra_pages = ra_pages; -+#endif -+} -+ -+static inline unsigned bucket_bytes(const struct bch_dev *ca) -+{ -+ return ca->mi.bucket_size << 9; -+} -+ -+static inline unsigned block_bytes(const struct bch_fs *c) -+{ -+ return c->opts.block_size; -+} -+ -+static inline unsigned block_sectors(const struct bch_fs *c) -+{ -+ return c->opts.block_size >> 9; -+} -+ -+static inline size_t btree_sectors(const struct bch_fs *c) -+{ -+ return c->opts.btree_node_size >> 9; -+} -+ -+static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) -+{ -+ return c->btree_key_cache_btrees & (1U << btree); -+} -+ -+static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) -+{ -+ struct timespec64 t; -+ s32 rem; -+ -+ time += c->sb.time_base_lo; -+ -+ t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); -+ t.tv_nsec = rem * c->sb.nsec_per_time_unit; -+ return t; -+} -+ -+static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) -+{ -+ return (ts.tv_sec * c->sb.time_units_per_sec + -+ (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; -+} -+ -+static inline s64 bch2_current_time(const struct bch_fs *c) -+{ -+ struct timespec64 now; -+ -+ ktime_get_coarse_real_ts64(&now); -+ return timespec_to_bch2_time(c, now); -+} -+ -+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) -+{ -+ return dev < c->sb.nr_devices && c->devs[dev]; -+} -+ -+#define BKEY_PADDED_ONSTACK(key, pad) \ -+ struct { struct bkey_i key; __u64 key ## _pad[pad]; } -+ -+#endif /* _BCACHEFS_H */ -diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h -new file mode 100644 -index 000000000..f17238be4 ---- /dev/null -+++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,2368 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FORMAT_H -+#define _BCACHEFS_FORMAT_H -+ -+/* -+ * bcachefs on disk data structures -+ * -+ * OVERVIEW: -+ * -+ * There are three main types of on disk data structures in bcachefs (this is -+ * reduced from 5 in bcache) -+ * -+ * - superblock -+ * - journal -+ * - btree -+ * -+ * The btree is the primary structure; most metadata exists as keys in the -+ * various btrees. There are only a small number of btrees, they're not -+ * sharded - we have one btree for extents, another for inodes, et cetera. 
-+ * -+ * SUPERBLOCK: -+ * -+ * The superblock contains the location of the journal, the list of devices in -+ * the filesystem, and in general any metadata we need in order to decide -+ * whether we can start a filesystem or prior to reading the journal/btree -+ * roots. -+ * -+ * The superblock is extensible, and most of the contents of the superblock are -+ * in variable length, type tagged fields; see struct bch_sb_field. -+ * -+ * Backup superblocks do not reside in a fixed location; also, superblocks do -+ * not have a fixed size. To locate backup superblocks we have struct -+ * bch_sb_layout; we store a copy of this inside every superblock, and also -+ * before the first superblock. -+ * -+ * JOURNAL: -+ * -+ * The journal primarily records btree updates in the order they occurred; -+ * journal replay consists of just iterating over all the keys in the open -+ * journal entries and re-inserting them into the btrees. -+ * -+ * The journal also contains entry types for the btree roots, and blacklisted -+ * journal sequence numbers (see journal_seq_blacklist.c). -+ * -+ * BTREE: -+ * -+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically -+ * 128k-256k) and log structured. We use struct btree_node for writing the first -+ * entry in a given node (offset 0), and struct btree_node_entry for all -+ * subsequent writes. -+ * -+ * After the header, btree node entries contain a list of keys in sorted order. -+ * Values are stored inline with the keys; since values are variable length (and -+ * keys effectively are variable length too, due to packing) we can't do random -+ * access without building up additional in memory tables in the btree node read -+ * path. -+ * -+ * BTREE KEYS (struct bkey): -+ * -+ * The various btrees share a common format for the key - so as to avoid -+ * switching in fastpath lookup/comparison code - but define their own -+ * structures for the key values. -+ * -+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max -+ * size is just under 2k. The common part also contains a type tag for the -+ * value, and a format field indicating whether the key is packed or not (and -+ * also meant to allow adding new key fields in the future, if desired). -+ * -+ * bkeys, when stored within a btree node, may also be packed. In that case, the -+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can -+ * be generous with field sizes in the common part of the key format (64 bit -+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. 
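-+ *
-+ * To put some numbers on that: the unpacked key header (struct bkey, below)
-+ * works out to 40 bytes, i.e. five u64s, and since u64s is an 8 bit count of
-+ * u64s the combined key + value is limited to 255 * 8 = 2040 bytes - the
-+ * "just under 2k" above. A node-local bkey_format might then record that,
-+ * say, every inode in this node fits in 24 bits and every offset in 32 bits,
-+ * letting the same key pack into a small fraction of those five u64s.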
-+ */ -+ -+#include -+#include -+#include -+#include -+#include "vstructs.h" -+ -+#ifdef __KERNEL__ -+typedef uuid_t __uuid_t; -+#endif -+ -+#define BITMASK(name, type, field, offset, end) \ -+static const unsigned name##_OFFSET = offset; \ -+static const unsigned name##_BITS = (end - offset); \ -+ \ -+static inline __u64 name(const type *k) \ -+{ \ -+ return (k->field >> offset) & ~(~0ULL << (end - offset)); \ -+} \ -+ \ -+static inline void SET_##name(type *k, __u64 v) \ -+{ \ -+ k->field &= ~(~(~0ULL << (end - offset)) << offset); \ -+ k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ -+} -+ -+#define LE_BITMASK(_bits, name, type, field, offset, end) \ -+static const unsigned name##_OFFSET = offset; \ -+static const unsigned name##_BITS = (end - offset); \ -+static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ -+ \ -+static inline __u64 name(const type *k) \ -+{ \ -+ return (__le##_bits##_to_cpu(k->field) >> offset) & \ -+ ~(~0ULL << (end - offset)); \ -+} \ -+ \ -+static inline void SET_##name(type *k, __u64 v) \ -+{ \ -+ __u##_bits new = __le##_bits##_to_cpu(k->field); \ -+ \ -+ new &= ~(~(~0ULL << (end - offset)) << offset); \ -+ new |= (v & ~(~0ULL << (end - offset))) << offset; \ -+ k->field = __cpu_to_le##_bits(new); \ -+} -+ -+#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) -+#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) -+#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) -+ -+struct bkey_format { -+ __u8 key_u64s; -+ __u8 nr_fields; -+ /* One unused slot for now: */ -+ __u8 bits_per_field[6]; -+ __le64 field_offset[6]; -+}; -+ -+/* Btree keys - all units are in sectors */ -+ -+struct bpos { -+ /* -+ * Word order matches machine byte order - btree code treats a bpos as a -+ * single large integer, for search/comparison purposes -+ * -+ * Note that wherever a bpos is embedded in another on disk data -+ * structure, it has to be byte swabbed when reading in metadata that -+ * wasn't written in native endian order: -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u32 snapshot; -+ __u64 offset; -+ __u64 inode; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ __u64 inode; -+ __u64 offset; /* Points to end of extent - sectors */ -+ __u32 snapshot; -+#else -+#error edit for your odd byteorder. 
-+#endif -+} __packed __aligned(4); -+ -+#define KEY_INODE_MAX ((__u64)~0ULL) -+#define KEY_OFFSET_MAX ((__u64)~0ULL) -+#define KEY_SNAPSHOT_MAX ((__u32)~0U) -+#define KEY_SIZE_MAX ((__u32)~0U) -+ -+static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) -+{ -+ return (struct bpos) { -+ .inode = inode, -+ .offset = offset, -+ .snapshot = snapshot, -+ }; -+} -+ -+#define POS_MIN SPOS(0, 0, 0) -+#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0) -+#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) -+#define POS(_inode, _offset) SPOS(_inode, _offset, 0) -+ -+/* Empty placeholder struct, for container_of() */ -+struct bch_val { -+ __u64 __nothing[0]; -+}; -+ -+struct bversion { -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u64 lo; -+ __u32 hi; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ __u32 hi; -+ __u64 lo; -+#endif -+} __packed __aligned(4); -+ -+struct bkey { -+ /* Size of combined key and value, in u64s */ -+ __u8 u64s; -+ -+ /* Format of key (0 for format local to btree node) */ -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 format:7, -+ needs_whiteout:1; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u8 needs_whiteout:1, -+ format:7; -+#else -+#error edit for your odd byteorder. -+#endif -+ -+ /* Type of the value */ -+ __u8 type; -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u8 pad[1]; -+ -+ struct bversion version; -+ __u32 size; /* extent size, in sectors */ -+ struct bpos p; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ struct bpos p; -+ __u32 size; /* extent size, in sectors */ -+ struct bversion version; -+ -+ __u8 pad[1]; -+#endif -+} __packed __aligned(8); -+ -+struct bkey_packed { -+ __u64 _data[0]; -+ -+ /* Size of combined key and value, in u64s */ -+ __u8 u64s; -+ -+ /* Format of key (0 for format local to btree node) */ -+ -+ /* -+ * XXX: next incompat on disk format change, switch format and -+ * needs_whiteout - bkey_packed() will be cheaper if format is the high -+ * bits of the bitfield -+ */ -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 format:7, -+ needs_whiteout:1; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u8 needs_whiteout:1, -+ format:7; -+#endif -+ -+ /* Type of the value */ -+ __u8 type; -+ __u8 key_start[0]; -+ -+ /* -+ * We copy bkeys with struct assignment in various places, and while -+ * that shouldn't be done with packed bkeys we can't disallow it in C, -+ * and it's legal to cast a bkey to a bkey_packed - so padding it out -+ * to the same size as struct bkey should hopefully be safest. 
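-+ *
-+ * Purely as an illustration, the kind of copy the padding has to accommodate
-+ * is roughly:
-+ *
-+ *	struct bkey k;
-+ *	struct bkey_packed *p = (struct bkey_packed *) &k;	// legal cast
-+ *	struct bkey_packed copy = *p;				// struct assignment
-+ *
-+ * The assignment copies sizeof(struct bkey_packed) bytes; without the pad[]
-+ * below that would silently truncate the key, with it the copy is exactly a
-+ * full struct bkey's worth.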
-+ */ -+ __u8 pad[sizeof(struct bkey) - 3]; -+} __packed __aligned(8); -+ -+typedef struct { -+ __le64 lo; -+ __le64 hi; -+} bch_le128; -+ -+#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) -+#define BKEY_U64s_MAX U8_MAX -+#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) -+ -+#define KEY_PACKED_BITS_START 24 -+ -+#define KEY_FORMAT_LOCAL_BTREE 0 -+#define KEY_FORMAT_CURRENT 1 -+ -+enum bch_bkey_fields { -+ BKEY_FIELD_INODE, -+ BKEY_FIELD_OFFSET, -+ BKEY_FIELD_SNAPSHOT, -+ BKEY_FIELD_SIZE, -+ BKEY_FIELD_VERSION_HI, -+ BKEY_FIELD_VERSION_LO, -+ BKEY_NR_FIELDS, -+}; -+ -+#define bkey_format_field(name, field) \ -+ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) -+ -+#define BKEY_FORMAT_CURRENT \ -+((struct bkey_format) { \ -+ .key_u64s = BKEY_U64s, \ -+ .nr_fields = BKEY_NR_FIELDS, \ -+ .bits_per_field = { \ -+ bkey_format_field(INODE, p.inode), \ -+ bkey_format_field(OFFSET, p.offset), \ -+ bkey_format_field(SNAPSHOT, p.snapshot), \ -+ bkey_format_field(SIZE, size), \ -+ bkey_format_field(VERSION_HI, version.hi), \ -+ bkey_format_field(VERSION_LO, version.lo), \ -+ }, \ -+}) -+ -+/* bkey with inline value */ -+struct bkey_i { -+ __u64 _data[0]; -+ -+ struct bkey k; -+ struct bch_val v; -+}; -+ -+#define KEY(_inode, _offset, _size) \ -+((struct bkey) { \ -+ .u64s = BKEY_U64s, \ -+ .format = KEY_FORMAT_CURRENT, \ -+ .p = POS(_inode, _offset), \ -+ .size = _size, \ -+}) -+ -+static inline void bkey_init(struct bkey *k) -+{ -+ *k = KEY(0, 0, 0); -+} -+ -+#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) -+ -+#define __BKEY_PADDED(key, pad) \ -+ struct bkey_i key; __u64 key ## _pad[pad] -+ -+/* -+ * - DELETED keys are used internally to mark keys that should be ignored but -+ * override keys in composition order. Their version number is ignored. -+ * -+ * - DISCARDED keys indicate that the data is all 0s because it has been -+ * discarded. DISCARDs may have a version; if the version is nonzero the key -+ * will be persistent, otherwise the key will be dropped whenever the btree -+ * node is rewritten (like DELETED keys). -+ * -+ * - ERROR: any read of the data returns a read error, as the data was lost due -+ * to a failing device. Like DISCARDED keys, they can be removed (overridden) -+ * by new writes or cluster-wide GC. Node repair can also overwrite them with -+ * the same or a more recent version number, but not with an older version -+ * number. 
-+ * -+ * - WHITEOUT: for hash table btrees -+ */ -+#define BCH_BKEY_TYPES() \ -+ x(deleted, 0) \ -+ x(whiteout, 1) \ -+ x(error, 2) \ -+ x(cookie, 3) \ -+ x(hash_whiteout, 4) \ -+ x(btree_ptr, 5) \ -+ x(extent, 6) \ -+ x(reservation, 7) \ -+ x(inode, 8) \ -+ x(inode_generation, 9) \ -+ x(dirent, 10) \ -+ x(xattr, 11) \ -+ x(alloc, 12) \ -+ x(quota, 13) \ -+ x(stripe, 14) \ -+ x(reflink_p, 15) \ -+ x(reflink_v, 16) \ -+ x(inline_data, 17) \ -+ x(btree_ptr_v2, 18) \ -+ x(indirect_inline_data, 19) \ -+ x(alloc_v2, 20) \ -+ x(subvolume, 21) \ -+ x(snapshot, 22) \ -+ x(inode_v2, 23) \ -+ x(alloc_v3, 24) \ -+ x(set, 25) \ -+ x(lru, 26) \ -+ x(alloc_v4, 27) \ -+ x(backpointer, 28) \ -+ x(inode_v3, 29) \ -+ x(bucket_gens, 30) \ -+ x(snapshot_tree, 31) -+ -+enum bch_bkey_type { -+#define x(name, nr) KEY_TYPE_##name = nr, -+ BCH_BKEY_TYPES() -+#undef x -+ KEY_TYPE_MAX, -+}; -+ -+struct bch_deleted { -+ struct bch_val v; -+}; -+ -+struct bch_whiteout { -+ struct bch_val v; -+}; -+ -+struct bch_error { -+ struct bch_val v; -+}; -+ -+struct bch_cookie { -+ struct bch_val v; -+ __le64 cookie; -+}; -+ -+struct bch_hash_whiteout { -+ struct bch_val v; -+}; -+ -+struct bch_set { -+ struct bch_val v; -+}; -+ -+/* Extents */ -+ -+/* -+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally -+ * preceded by checksum/compression information (bch_extent_crc32 or -+ * bch_extent_crc64). -+ * -+ * One major determining factor in the format of extents is how we handle and -+ * represent extents that have been partially overwritten and thus trimmed: -+ * -+ * If an extent is not checksummed or compressed, when the extent is trimmed we -+ * don't have to remember the extent we originally allocated and wrote: we can -+ * merely adjust ptr->offset to point to the start of the data that is currently -+ * live. The size field in struct bkey records the current (live) size of the -+ * extent, and is also used to mean "size of region on disk that we point to" in -+ * this case. -+ * -+ * Thus an extent that is not checksummed or compressed will consist only of a -+ * list of bch_extent_ptrs, with none of the fields in -+ * bch_extent_crc32/bch_extent_crc64. -+ * -+ * When an extent is checksummed or compressed, it's not possible to read only -+ * the data that is currently live: we have to read the entire extent that was -+ * originally written, and then return only the part of the extent that is -+ * currently live. -+ * -+ * Thus, in addition to the current size of the extent in struct bkey, we need -+ * to store the size of the originally allocated space - this is the -+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, -+ * when the extent is trimmed, instead of modifying the offset field of the -+ * pointer, we keep a second smaller offset field - "offset into the original -+ * extent of the currently live region". -+ * -+ * The other major determining factor is replication and data migration: -+ * -+ * Each pointer may have its own bch_extent_crc32/64. 
When doing a replicated -+ * write, we will initially write all the replicas in the same format, with the -+ * same checksum type and compression format - however, when copygc runs later (or -+ * tiering/cache promotion, anything that moves data), it is not in general -+ * going to rewrite all the pointers at once - one of the replicas may be in a -+ * bucket on one device that has very little fragmentation while another lives -+ * in a bucket that has become heavily fragmented, and thus is being rewritten -+ * sooner than the rest. -+ * -+ * Thus it will only move a subset of the pointers (or in the case of -+ * tiering/cache promotion perhaps add a single pointer without dropping any -+ * current pointers), and if the extent has been partially overwritten it must -+ * write only the currently live portion (or copygc would not be able to reduce -+ * fragmentation!) - which necessitates a different bch_extent_crc format for -+ * the new pointer. -+ * -+ * But in the interests of space efficiency, we don't want to store one -+ * bch_extent_crc for each pointer if we don't have to. -+ * -+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and -+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the -+ * type of a given entry with a scheme similar to utf8 (except we're encoding a -+ * type, not a size), encoding the type in the position of the first set bit: -+ * -+ * bch_extent_crc32 - 0b1 -+ * bch_extent_ptr - 0b10 -+ * bch_extent_crc64 - 0b100 -+ * -+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and -+ * bch_extent_crc64 is the least constrained). -+ * -+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, -+ * until the next bch_extent_crc32/64. -+ * -+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer -+ * is neither checksummed nor compressed. 
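-+ *
-+ * In code, that decode is just "find the position of the lowest set bit of
-+ * the entry's first word" - a minimal sketch (illustrative only; the extent
-+ * code does the equivalent on union bch_extent_entry, defined below):
-+ *
-+ *	static unsigned toy_extent_entry_type(unsigned long first_word)
-+ *	{
-+ *		// 0b1 -> 0, 0b10 -> 1, 0b100 -> 2, ...
-+ *		return __builtin_ctzl(first_word);
-+ *	}
-+ *
-+ * The shorter the encoding, the more bits are left for the entry's payload,
-+ * which is why the most bit-starved entry type gets the shortest one.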
-+ */ -+ -+/* 128 bits, sufficient for cryptographic MACs: */ -+struct bch_csum { -+ __le64 lo; -+ __le64 hi; -+} __packed __aligned(8); -+ -+#define BCH_EXTENT_ENTRY_TYPES() \ -+ x(ptr, 0) \ -+ x(crc32, 1) \ -+ x(crc64, 2) \ -+ x(crc128, 3) \ -+ x(stripe_ptr, 4) \ -+ x(rebalance, 5) -+#define BCH_EXTENT_ENTRY_MAX 6 -+ -+enum bch_extent_entry_type { -+#define x(f, n) BCH_EXTENT_ENTRY_##f = n, -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+}; -+ -+/* Compressed/uncompressed size are stored biased by 1: */ -+struct bch_extent_crc32 { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u32 type:2, -+ _compressed_size:7, -+ _uncompressed_size:7, -+ offset:7, -+ _unused:1, -+ csum_type:4, -+ compression_type:4; -+ __u32 csum; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u32 csum; -+ __u32 compression_type:4, -+ csum_type:4, -+ _unused:1, -+ offset:7, -+ _uncompressed_size:7, -+ _compressed_size:7, -+ type:2; -+#endif -+} __packed __aligned(8); -+ -+#define CRC32_SIZE_MAX (1U << 7) -+#define CRC32_NONCE_MAX 0 -+ -+struct bch_extent_crc64 { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:3, -+ _compressed_size:9, -+ _uncompressed_size:9, -+ offset:9, -+ nonce:10, -+ csum_type:4, -+ compression_type:4, -+ csum_hi:16; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 csum_hi:16, -+ compression_type:4, -+ csum_type:4, -+ nonce:10, -+ offset:9, -+ _uncompressed_size:9, -+ _compressed_size:9, -+ type:3; -+#endif -+ __u64 csum_lo; -+} __packed __aligned(8); -+ -+#define CRC64_SIZE_MAX (1U << 9) -+#define CRC64_NONCE_MAX ((1U << 10) - 1) -+ -+struct bch_extent_crc128 { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:4, -+ _compressed_size:13, -+ _uncompressed_size:13, -+ offset:13, -+ nonce:13, -+ csum_type:4, -+ compression_type:4; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 compression_type:4, -+ csum_type:4, -+ nonce:13, -+ offset:13, -+ _uncompressed_size:13, -+ _compressed_size:13, -+ type:4; -+#endif -+ struct bch_csum csum; -+} __packed __aligned(8); -+ -+#define CRC128_SIZE_MAX (1U << 13) -+#define CRC128_NONCE_MAX ((1U << 13) - 1) -+ -+/* -+ * @reservation - pointer hasn't been written to, just reserved -+ */ -+struct bch_extent_ptr { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:1, -+ cached:1, -+ unused:1, -+ unwritten:1, -+ offset:44, /* 8 petabytes */ -+ dev:8, -+ gen:8; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 gen:8, -+ dev:8, -+ offset:44, -+ unwritten:1, -+ unused:1, -+ cached:1, -+ type:1; -+#endif -+} __packed __aligned(8); -+ -+struct bch_extent_stripe_ptr { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:5, -+ block:8, -+ redundancy:4, -+ idx:47; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 idx:47, -+ redundancy:4, -+ block:8, -+ type:5; -+#endif -+}; -+ -+struct bch_extent_reservation { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:6, -+ unused:22, -+ replicas:4, -+ generation:32; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 generation:32, -+ replicas:4, -+ unused:22, -+ type:6; -+#endif -+}; -+ -+struct bch_extent_rebalance { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:7, -+ unused:33, -+ compression:8, -+ target:16; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 target:16, -+ compression:8, -+ unused:33, -+ type:7; -+#endif -+}; -+ -+union bch_extent_entry { -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 -+ unsigned long type; -+#elif __BITS_PER_LONG == 32 -+ struct { -+ unsigned long pad; -+ unsigned long type; -+ }; -+#else -+#error edit for your odd byteorder. 
-+#endif -+ -+#define x(f, n) struct bch_extent_##f f; -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+}; -+ -+struct bch_btree_ptr { -+ struct bch_val v; -+ -+ __u64 _data[0]; -+ struct bch_extent_ptr start[]; -+} __packed __aligned(8); -+ -+struct bch_btree_ptr_v2 { -+ struct bch_val v; -+ -+ __u64 mem_ptr; -+ __le64 seq; -+ __le16 sectors_written; -+ __le16 flags; -+ struct bpos min_key; -+ __u64 _data[0]; -+ struct bch_extent_ptr start[]; -+} __packed __aligned(8); -+ -+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); -+ -+struct bch_extent { -+ struct bch_val v; -+ -+ __u64 _data[0]; -+ union bch_extent_entry start[]; -+} __packed __aligned(8); -+ -+struct bch_reservation { -+ struct bch_val v; -+ -+ __le32 generation; -+ __u8 nr_replicas; -+ __u8 pad[3]; -+} __packed __aligned(8); -+ -+/* Maximum size (in u64s) a single pointer could be: */ -+#define BKEY_EXTENT_PTR_U64s_MAX\ -+ ((sizeof(struct bch_extent_crc128) + \ -+ sizeof(struct bch_extent_ptr)) / sizeof(__u64)) -+ -+/* Maximum possible size of an entire extent value: */ -+#define BKEY_EXTENT_VAL_U64s_MAX \ -+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) -+ -+/* * Maximum possible size of an entire extent, key + value: */ -+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) -+ -+/* Btree pointers don't carry around checksums: */ -+#define BKEY_BTREE_PTR_VAL_U64s_MAX \ -+ ((sizeof(struct bch_btree_ptr_v2) + \ -+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) -+#define BKEY_BTREE_PTR_U64s_MAX \ -+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) -+ -+/* Inodes */ -+ -+#define BLOCKDEV_INODE_MAX 4096 -+ -+#define BCACHEFS_ROOT_INO 4096 -+ -+struct bch_inode { -+ struct bch_val v; -+ -+ __le64 bi_hash_seed; -+ __le32 bi_flags; -+ __le16 bi_mode; -+ __u8 fields[0]; -+} __packed __aligned(8); -+ -+struct bch_inode_v2 { -+ struct bch_val v; -+ -+ __le64 bi_journal_seq; -+ __le64 bi_hash_seed; -+ __le64 bi_flags; -+ __le16 bi_mode; -+ __u8 fields[0]; -+} __packed __aligned(8); -+ -+struct bch_inode_v3 { -+ struct bch_val v; -+ -+ __le64 bi_journal_seq; -+ __le64 bi_hash_seed; -+ __le64 bi_flags; -+ __le64 bi_sectors; -+ __le64 bi_size; -+ __le64 bi_version; -+ __u8 fields[0]; -+} __packed __aligned(8); -+ -+#define INODEv3_FIELDS_START_INITIAL 6 -+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) -+ -+struct bch_inode_generation { -+ struct bch_val v; -+ -+ __le32 bi_generation; -+ __le32 pad; -+} __packed __aligned(8); -+ -+/* -+ * bi_subvol and bi_parent_subvol are only set for subvolume roots: -+ */ -+ -+#define BCH_INODE_FIELDS_v2() \ -+ x(bi_atime, 96) \ -+ x(bi_ctime, 96) \ -+ x(bi_mtime, 96) \ -+ x(bi_otime, 96) \ -+ x(bi_size, 64) \ -+ x(bi_sectors, 64) \ -+ x(bi_uid, 32) \ -+ x(bi_gid, 32) \ -+ x(bi_nlink, 32) \ -+ x(bi_generation, 32) \ -+ x(bi_dev, 32) \ -+ x(bi_data_checksum, 8) \ -+ x(bi_compression, 8) \ -+ x(bi_project, 32) \ -+ x(bi_background_compression, 8) \ -+ x(bi_data_replicas, 8) \ -+ x(bi_promote_target, 16) \ -+ x(bi_foreground_target, 16) \ -+ x(bi_background_target, 16) \ -+ x(bi_erasure_code, 16) \ -+ x(bi_fields_set, 16) \ -+ x(bi_dir, 64) \ -+ x(bi_dir_offset, 64) \ -+ x(bi_subvol, 32) \ -+ x(bi_parent_subvol, 32) -+ -+#define BCH_INODE_FIELDS_v3() \ -+ x(bi_atime, 96) \ -+ x(bi_ctime, 96) \ -+ x(bi_mtime, 96) \ -+ x(bi_otime, 96) \ -+ x(bi_uid, 32) \ -+ x(bi_gid, 32) \ -+ x(bi_nlink, 32) \ -+ x(bi_generation, 32) \ -+ x(bi_dev, 32) \ -+ x(bi_data_checksum, 8) \ -+ x(bi_compression, 8) \ -+ x(bi_project, 32) 
\ -+ x(bi_background_compression, 8) \ -+ x(bi_data_replicas, 8) \ -+ x(bi_promote_target, 16) \ -+ x(bi_foreground_target, 16) \ -+ x(bi_background_target, 16) \ -+ x(bi_erasure_code, 16) \ -+ x(bi_fields_set, 16) \ -+ x(bi_dir, 64) \ -+ x(bi_dir_offset, 64) \ -+ x(bi_subvol, 32) \ -+ x(bi_parent_subvol, 32) \ -+ x(bi_nocow, 8) -+ -+/* subset of BCH_INODE_FIELDS */ -+#define BCH_INODE_OPTS() \ -+ x(data_checksum, 8) \ -+ x(compression, 8) \ -+ x(project, 32) \ -+ x(background_compression, 8) \ -+ x(data_replicas, 8) \ -+ x(promote_target, 16) \ -+ x(foreground_target, 16) \ -+ x(background_target, 16) \ -+ x(erasure_code, 16) \ -+ x(nocow, 8) -+ -+enum inode_opt_id { -+#define x(name, ...) \ -+ Inode_opt_##name, -+ BCH_INODE_OPTS() -+#undef x -+ Inode_opt_nr, -+}; -+ -+enum { -+ /* -+ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL -+ * flags) -+ */ -+ __BCH_INODE_SYNC = 0, -+ __BCH_INODE_IMMUTABLE = 1, -+ __BCH_INODE_APPEND = 2, -+ __BCH_INODE_NODUMP = 3, -+ __BCH_INODE_NOATIME = 4, -+ -+ __BCH_INODE_I_SIZE_DIRTY = 5, -+ __BCH_INODE_I_SECTORS_DIRTY = 6, -+ __BCH_INODE_UNLINKED = 7, -+ __BCH_INODE_BACKPTR_UNTRUSTED = 8, -+ -+ /* bits 20+ reserved for packed fields below: */ -+}; -+ -+#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) -+#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) -+#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) -+#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) -+#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) -+#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) -+#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -+#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) -+#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) -+ -+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); -+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); -+ -+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); -+LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); -+ -+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); -+LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); -+ -+LE64_BITMASK(INODEv3_FIELDS_START, -+ struct bch_inode_v3, bi_flags, 31, 36); -+LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); -+ -+/* Dirents */ -+ -+/* -+ * Dirents (and xattrs) have to implement string lookups; since our b-tree -+ * doesn't support arbitrary length strings for the key, we instead index by a -+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset -+ * field of the key - using linear probing to resolve hash collisions. This also -+ * provides us with the readdir cookie posix requires. 
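-+ *
-+ * A lookup is then, in toy form (illustrative only - hash(), btree_lookup()
-+ * and names_match() are stand-ins, and snapshots are ignored):
-+ *
-+ *	u64 slot = hash(name);		// 64 bit hash -> starting offset
-+ *
-+ *	for (;; slot++) {
-+ *		k = btree_lookup(dirents_btree, POS(dir_inum, slot));
-+ *		if (!k)
-+ *			break;		// hole: no such name
-+ *		if (names_match(k, name))
-+ *			break;		// found it
-+ *		// otherwise: hash collision, keep probing at the next offset
-+ *	}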
-+ * -+ * Linear probing requires us to use whiteouts for deletions, in the event of a -+ * collision: -+ */ -+ -+struct bch_dirent { -+ struct bch_val v; -+ -+ /* Target inode number: */ -+ union { -+ __le64 d_inum; -+ struct { /* DT_SUBVOL */ -+ __le32 d_child_subvol; -+ __le32 d_parent_subvol; -+ }; -+ }; -+ -+ /* -+ * Copy of mode bits 12-15 from the target inode - so userspace can get -+ * the filetype without having to do a stat() -+ */ -+ __u8 d_type; -+ -+ __u8 d_name[]; -+} __packed __aligned(8); -+ -+#define DT_SUBVOL 16 -+#define BCH_DT_MAX 17 -+ -+#define BCH_NAME_MAX 512 -+ -+/* Xattrs */ -+ -+#define KEY_TYPE_XATTR_INDEX_USER 0 -+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -+#define KEY_TYPE_XATTR_INDEX_SECURITY 4 -+ -+struct bch_xattr { -+ struct bch_val v; -+ __u8 x_type; -+ __u8 x_name_len; -+ __le16 x_val_len; -+ __u8 x_name[]; -+} __packed __aligned(8); -+ -+/* Bucket/allocation information: */ -+ -+struct bch_alloc { -+ struct bch_val v; -+ __u8 fields; -+ __u8 gen; -+ __u8 data[]; -+} __packed __aligned(8); -+ -+#define BCH_ALLOC_FIELDS_V1() \ -+ x(read_time, 16) \ -+ x(write_time, 16) \ -+ x(data_type, 8) \ -+ x(dirty_sectors, 16) \ -+ x(cached_sectors, 16) \ -+ x(oldest_gen, 8) \ -+ x(stripe, 32) \ -+ x(stripe_redundancy, 8) -+ -+enum { -+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, -+ BCH_ALLOC_FIELDS_V1() -+#undef x -+}; -+ -+struct bch_alloc_v2 { -+ struct bch_val v; -+ __u8 nr_fields; -+ __u8 gen; -+ __u8 oldest_gen; -+ __u8 data_type; -+ __u8 data[]; -+} __packed __aligned(8); -+ -+#define BCH_ALLOC_FIELDS_V2() \ -+ x(read_time, 64) \ -+ x(write_time, 64) \ -+ x(dirty_sectors, 32) \ -+ x(cached_sectors, 32) \ -+ x(stripe, 32) \ -+ x(stripe_redundancy, 8) -+ -+struct bch_alloc_v3 { -+ struct bch_val v; -+ __le64 journal_seq; -+ __le32 flags; -+ __u8 nr_fields; -+ __u8 gen; -+ __u8 oldest_gen; -+ __u8 data_type; -+ __u8 data[]; -+} __packed __aligned(8); -+ -+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) -+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) -+ -+struct bch_alloc_v4 { -+ struct bch_val v; -+ __u64 journal_seq; -+ __u32 flags; -+ __u8 gen; -+ __u8 oldest_gen; -+ __u8 data_type; -+ __u8 stripe_redundancy; -+ __u32 dirty_sectors; -+ __u32 cached_sectors; -+ __u64 io_time[2]; -+ __u32 stripe; -+ __u32 nr_external_backpointers; -+ __u64 fragmentation_lru; -+} __packed __aligned(8); -+ -+#define BCH_ALLOC_V4_U64s_V0 6 -+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) -+ -+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) -+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) -+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) -+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) -+ -+#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 -+ -+struct bch_backpointer { -+ struct bch_val v; -+ __u8 btree_id; -+ __u8 level; -+ __u8 data_type; -+ __u64 bucket_offset:40; -+ __u32 bucket_len; -+ struct bpos pos; -+} __packed __aligned(8); -+ -+#define KEY_TYPE_BUCKET_GENS_BITS 8 -+#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) -+#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) -+ -+struct bch_bucket_gens { -+ struct bch_val v; -+ u8 gens[KEY_TYPE_BUCKET_GENS_NR]; -+} __packed __aligned(8); -+ -+/* Quotas: */ -+ -+enum quota_types { -+ QTYP_USR = 0, -+ QTYP_GRP = 1, -+ QTYP_PRJ = 2, -+ 
QTYP_NR = 3, -+}; -+ -+enum quota_counters { -+ Q_SPC = 0, -+ Q_INO = 1, -+ Q_COUNTERS = 2, -+}; -+ -+struct bch_quota_counter { -+ __le64 hardlimit; -+ __le64 softlimit; -+}; -+ -+struct bch_quota { -+ struct bch_val v; -+ struct bch_quota_counter c[Q_COUNTERS]; -+} __packed __aligned(8); -+ -+/* Erasure coding */ -+ -+struct bch_stripe { -+ struct bch_val v; -+ __le16 sectors; -+ __u8 algorithm; -+ __u8 nr_blocks; -+ __u8 nr_redundant; -+ -+ __u8 csum_granularity_bits; -+ __u8 csum_type; -+ __u8 pad; -+ -+ struct bch_extent_ptr ptrs[]; -+} __packed __aligned(8); -+ -+/* Reflink: */ -+ -+struct bch_reflink_p { -+ struct bch_val v; -+ __le64 idx; -+ /* -+ * A reflink pointer might point to an indirect extent which is then -+ * later split (by copygc or rebalance). If we only pointed to part of -+ * the original indirect extent, and then one of the fragments is -+ * outside the range we point to, we'd leak a refcount: so when creating -+ * reflink pointers, we need to store pad values to remember the full -+ * range we were taking a reference on. -+ */ -+ __le32 front_pad; -+ __le32 back_pad; -+} __packed __aligned(8); -+ -+struct bch_reflink_v { -+ struct bch_val v; -+ __le64 refcount; -+ union bch_extent_entry start[0]; -+ __u64 _data[0]; -+} __packed __aligned(8); -+ -+struct bch_indirect_inline_data { -+ struct bch_val v; -+ __le64 refcount; -+ u8 data[0]; -+}; -+ -+/* Inline data */ -+ -+struct bch_inline_data { -+ struct bch_val v; -+ u8 data[0]; -+}; -+ -+/* Subvolumes: */ -+ -+#define SUBVOL_POS_MIN POS(0, 1) -+#define SUBVOL_POS_MAX POS(0, S32_MAX) -+#define BCACHEFS_ROOT_SUBVOL 1 -+ -+struct bch_subvolume { -+ struct bch_val v; -+ __le32 flags; -+ __le32 snapshot; -+ __le64 inode; -+ /* -+ * Snapshot subvolumes form a tree, separate from the snapshot nodes -+ * tree - if this subvolume is a snapshot, this is the ID of the -+ * subvolume it was created from: -+ */ -+ __le32 parent; -+ __le32 pad; -+ bch_le128 otime; -+}; -+ -+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) -+/* -+ * We need to know whether a subvolume is a snapshot so we can know whether we -+ * can delete it (or whether it should just be rm -rf'd) -+ */ -+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) -+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) -+ -+/* Snapshots */ -+ -+struct bch_snapshot { -+ struct bch_val v; -+ __le32 flags; -+ __le32 parent; -+ __le32 children[2]; -+ __le32 subvol; -+ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */ -+ __le32 tree; -+ __le32 depth; -+ __le32 skip[3]; -+}; -+ -+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) -+ -+/* True if a subvolume points to this snapshot node: */ -+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) -+ -+/* -+ * Snapshot trees: -+ * -+ * The snapshot_trees btree gives us persistent indentifier for each tree of -+ * bch_snapshot nodes, and allow us to record and easily find the root/master -+ * subvolume that other snapshots were created from: -+ */ -+struct bch_snapshot_tree { -+ struct bch_val v; -+ __le32 master_subvol; -+ __le32 root_snapshot; -+}; -+ -+/* LRU btree: */ -+ -+struct bch_lru { -+ struct bch_val v; -+ __le64 idx; -+} __packed __aligned(8); -+ -+#define LRU_ID_STRIPES (1U << 16) -+ -+/* Optional/variable size superblock sections: */ -+ -+struct bch_sb_field { -+ __u64 _data[0]; -+ __le32 u64s; -+ __le32 type; -+}; -+ -+#define BCH_SB_FIELDS() \ -+ x(journal, 0) \ -+ x(members, 1) \ -+ x(crypt, 2) \ -+ x(replicas_v0, 
3) \ -+ x(quota, 4) \ -+ x(disk_groups, 5) \ -+ x(clean, 6) \ -+ x(replicas, 7) \ -+ x(journal_seq_blacklist, 8) \ -+ x(journal_v2, 9) \ -+ x(counters, 10) -+ -+enum bch_sb_field_type { -+#define x(f, nr) BCH_SB_FIELD_##f = nr, -+ BCH_SB_FIELDS() -+#undef x -+ BCH_SB_FIELD_NR -+}; -+ -+/* -+ * Most superblock fields are replicated in all device's superblocks - a few are -+ * not: -+ */ -+#define BCH_SINGLE_DEVICE_SB_FIELDS \ -+ ((1U << BCH_SB_FIELD_journal)| \ -+ (1U << BCH_SB_FIELD_journal_v2)) -+ -+/* BCH_SB_FIELD_journal: */ -+ -+struct bch_sb_field_journal { -+ struct bch_sb_field field; -+ __le64 buckets[0]; -+}; -+ -+struct bch_sb_field_journal_v2 { -+ struct bch_sb_field field; -+ -+ struct bch_sb_field_journal_v2_entry { -+ __le64 start; -+ __le64 nr; -+ } d[0]; -+}; -+ -+/* BCH_SB_FIELD_members: */ -+ -+#define BCH_MIN_NR_NBUCKETS (1 << 6) -+ -+struct bch_member { -+ __uuid_t uuid; -+ __le64 nbuckets; /* device size */ -+ __le16 first_bucket; /* index of first bucket used */ -+ __le16 bucket_size; /* sectors */ -+ __le32 pad; -+ __le64 last_mount; /* time_t */ -+ -+ __le64 flags[2]; -+}; -+ -+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -+/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ -+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) -+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) -+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) -+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) -+LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, -+ struct bch_member, flags[0], 30, 31) -+ -+#if 0 -+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -+#endif -+ -+#define BCH_MEMBER_STATES() \ -+ x(rw, 0) \ -+ x(ro, 1) \ -+ x(failed, 2) \ -+ x(spare, 3) -+ -+enum bch_member_state { -+#define x(t, n) BCH_MEMBER_STATE_##t = n, -+ BCH_MEMBER_STATES() -+#undef x -+ BCH_MEMBER_STATE_NR -+}; -+ -+struct bch_sb_field_members { -+ struct bch_sb_field field; -+ struct bch_member members[0]; -+}; -+ -+/* BCH_SB_FIELD_crypt: */ -+ -+struct nonce { -+ __le32 d[4]; -+}; -+ -+struct bch_key { -+ __le64 key[4]; -+}; -+ -+#define BCH_KEY_MAGIC \ -+ (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \ -+ ((__u64) 'h' << 16)|((__u64) '*' << 24)| \ -+ ((__u64) '*' << 32)|((__u64) 'k' << 40)| \ -+ ((__u64) 'e' << 48)|((__u64) 'y' << 56)) -+ -+struct bch_encrypted_key { -+ __le64 magic; -+ struct bch_key key; -+}; -+ -+/* -+ * If this field is present in the superblock, it stores an encryption key which -+ * is used encrypt all other data/metadata. The key will normally be encrypted -+ * with the key userspace provides, but if encryption has been turned off we'll -+ * just store the master key unencrypted in the superblock so we can access the -+ * previously encrypted data. 
-+ */ -+struct bch_sb_field_crypt { -+ struct bch_sb_field field; -+ -+ __le64 flags; -+ __le64 kdf_flags; -+ struct bch_encrypted_key key; -+}; -+ -+LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); -+ -+enum bch_kdf_types { -+ BCH_KDF_SCRYPT = 0, -+ BCH_KDF_NR = 1, -+}; -+ -+/* stored as base 2 log of scrypt params: */ -+LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); -+LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); -+LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -+ -+/* BCH_SB_FIELD_replicas: */ -+ -+#define BCH_DATA_TYPES() \ -+ x(free, 0) \ -+ x(sb, 1) \ -+ x(journal, 2) \ -+ x(btree, 3) \ -+ x(user, 4) \ -+ x(cached, 5) \ -+ x(parity, 6) \ -+ x(stripe, 7) \ -+ x(need_gc_gens, 8) \ -+ x(need_discard, 9) -+ -+enum bch_data_type { -+#define x(t, n) BCH_DATA_##t, -+ BCH_DATA_TYPES() -+#undef x -+ BCH_DATA_NR -+}; -+ -+static inline bool data_type_is_empty(enum bch_data_type type) -+{ -+ switch (type) { -+ case BCH_DATA_free: -+ case BCH_DATA_need_gc_gens: -+ case BCH_DATA_need_discard: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool data_type_is_hidden(enum bch_data_type type) -+{ -+ switch (type) { -+ case BCH_DATA_sb: -+ case BCH_DATA_journal: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+struct bch_replicas_entry_v0 { -+ __u8 data_type; -+ __u8 nr_devs; -+ __u8 devs[0]; -+} __packed; -+ -+struct bch_sb_field_replicas_v0 { -+ struct bch_sb_field field; -+ struct bch_replicas_entry_v0 entries[0]; -+} __packed __aligned(8); -+ -+struct bch_replicas_entry { -+ __u8 data_type; -+ __u8 nr_devs; -+ __u8 nr_required; -+ __u8 devs[0]; -+} __packed; -+ -+#define replicas_entry_bytes(_i) \ -+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) -+ -+struct bch_sb_field_replicas { -+ struct bch_sb_field field; -+ struct bch_replicas_entry entries[0]; -+} __packed __aligned(8); -+ -+/* BCH_SB_FIELD_quota: */ -+ -+struct bch_sb_quota_counter { -+ __le32 timelimit; -+ __le32 warnlimit; -+}; -+ -+struct bch_sb_quota_type { -+ __le64 flags; -+ struct bch_sb_quota_counter c[Q_COUNTERS]; -+}; -+ -+struct bch_sb_field_quota { -+ struct bch_sb_field field; -+ struct bch_sb_quota_type q[QTYP_NR]; -+} __packed __aligned(8); -+ -+/* BCH_SB_FIELD_disk_groups: */ -+ -+#define BCH_SB_LABEL_SIZE 32 -+ -+struct bch_disk_group { -+ __u8 label[BCH_SB_LABEL_SIZE]; -+ __le64 flags[2]; -+} __packed __aligned(8); -+ -+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) -+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) -+ -+struct bch_sb_field_disk_groups { -+ struct bch_sb_field field; -+ struct bch_disk_group entries[0]; -+} __packed __aligned(8); -+ -+/* BCH_SB_FIELD_counters */ -+ -+#define BCH_PERSISTENT_COUNTERS() \ -+ x(io_read, 0) \ -+ x(io_write, 1) \ -+ x(io_move, 2) \ -+ x(bucket_invalidate, 3) \ -+ x(bucket_discard, 4) \ -+ x(bucket_alloc, 5) \ -+ x(bucket_alloc_fail, 6) \ -+ x(btree_cache_scan, 7) \ -+ x(btree_cache_reap, 8) \ -+ x(btree_cache_cannibalize, 9) \ -+ x(btree_cache_cannibalize_lock, 10) \ -+ x(btree_cache_cannibalize_lock_fail, 11) \ -+ x(btree_cache_cannibalize_unlock, 12) \ -+ x(btree_node_write, 13) \ -+ x(btree_node_read, 14) \ -+ x(btree_node_compact, 15) \ -+ x(btree_node_merge, 16) \ -+ x(btree_node_split, 17) \ -+ x(btree_node_rewrite, 18) \ -+ x(btree_node_alloc, 19) \ -+ x(btree_node_free, 20) \ -+ 
x(btree_node_set_root, 21) \ -+ x(btree_path_relock_fail, 22) \ -+ x(btree_path_upgrade_fail, 23) \ -+ x(btree_reserve_get_fail, 24) \ -+ x(journal_entry_full, 25) \ -+ x(journal_full, 26) \ -+ x(journal_reclaim_finish, 27) \ -+ x(journal_reclaim_start, 28) \ -+ x(journal_write, 29) \ -+ x(read_promote, 30) \ -+ x(read_bounce, 31) \ -+ x(read_split, 33) \ -+ x(read_retry, 32) \ -+ x(read_reuse_race, 34) \ -+ x(move_extent_read, 35) \ -+ x(move_extent_write, 36) \ -+ x(move_extent_finish, 37) \ -+ x(move_extent_fail, 38) \ -+ x(move_extent_alloc_mem_fail, 39) \ -+ x(copygc, 40) \ -+ x(copygc_wait, 41) \ -+ x(gc_gens_end, 42) \ -+ x(gc_gens_start, 43) \ -+ x(trans_blocked_journal_reclaim, 44) \ -+ x(trans_restart_btree_node_reused, 45) \ -+ x(trans_restart_btree_node_split, 46) \ -+ x(trans_restart_fault_inject, 47) \ -+ x(trans_restart_iter_upgrade, 48) \ -+ x(trans_restart_journal_preres_get, 49) \ -+ x(trans_restart_journal_reclaim, 50) \ -+ x(trans_restart_journal_res_get, 51) \ -+ x(trans_restart_key_cache_key_realloced, 52) \ -+ x(trans_restart_key_cache_raced, 53) \ -+ x(trans_restart_mark_replicas, 54) \ -+ x(trans_restart_mem_realloced, 55) \ -+ x(trans_restart_memory_allocation_failure, 56) \ -+ x(trans_restart_relock, 57) \ -+ x(trans_restart_relock_after_fill, 58) \ -+ x(trans_restart_relock_key_cache_fill, 59) \ -+ x(trans_restart_relock_next_node, 60) \ -+ x(trans_restart_relock_parent_for_fill, 61) \ -+ x(trans_restart_relock_path, 62) \ -+ x(trans_restart_relock_path_intent, 63) \ -+ x(trans_restart_too_many_iters, 64) \ -+ x(trans_restart_traverse, 65) \ -+ x(trans_restart_upgrade, 66) \ -+ x(trans_restart_would_deadlock, 67) \ -+ x(trans_restart_would_deadlock_write, 68) \ -+ x(trans_restart_injected, 69) \ -+ x(trans_restart_key_cache_upgrade, 70) \ -+ x(trans_traverse_all, 71) \ -+ x(transaction_commit, 72) \ -+ x(write_super, 73) \ -+ x(trans_restart_would_deadlock_recursion_limit, 74) \ -+ x(trans_restart_write_buffer_flush, 75) \ -+ x(trans_restart_split_race, 76) -+ -+enum bch_persistent_counters { -+#define x(t, n, ...) 
BCH_COUNTER_##t, -+ BCH_PERSISTENT_COUNTERS() -+#undef x -+ BCH_COUNTER_NR -+}; -+ -+struct bch_sb_field_counters { -+ struct bch_sb_field field; -+ __le64 d[0]; -+}; -+ -+/* -+ * On clean shutdown, store btree roots and current journal sequence number in -+ * the superblock: -+ */ -+struct jset_entry { -+ __le16 u64s; -+ __u8 btree_id; -+ __u8 level; -+ __u8 type; /* designates what this jset holds */ -+ __u8 pad[3]; -+ -+ union { -+ struct bkey_i start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+struct bch_sb_field_clean { -+ struct bch_sb_field field; -+ -+ __le32 flags; -+ __le16 _read_clock; /* no longer used */ -+ __le16 _write_clock; -+ __le64 journal_seq; -+ -+ union { -+ struct jset_entry start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+struct journal_seq_blacklist_entry { -+ __le64 start; -+ __le64 end; -+}; -+ -+struct bch_sb_field_journal_seq_blacklist { -+ struct bch_sb_field field; -+ -+ union { -+ struct journal_seq_blacklist_entry start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+/* Superblock: */ -+ -+/* -+ * New versioning scheme: -+ * One common version number for all on disk data structures - superblock, btree -+ * nodes, journal entries -+ */ -+#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10)) -+#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) -+#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0) -+ -+#define RECOVERY_PASS_ALL_FSCK (1ULL << 63) -+ -+#define BCH_METADATA_VERSIONS() \ -+ x(bkey_renumber, BCH_VERSION(0, 10), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(inode_btree_change, BCH_VERSION(0, 11), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(snapshot, BCH_VERSION(0, 12), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(inode_backpointers, BCH_VERSION(0, 13), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(snapshot_2, BCH_VERSION(0, 15), \ -+ BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \ -+ BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(reflink_p_fix, BCH_VERSION(0, 16), \ -+ BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \ -+ x(subvol_dirent, BCH_VERSION(0, 17), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(inode_v2, BCH_VERSION(0, 18), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(freespace, BCH_VERSION(0, 19), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(alloc_v4, BCH_VERSION(0, 20), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(new_data_types, BCH_VERSION(0, 21), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(backpointers, BCH_VERSION(0, 22), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(inode_v3, BCH_VERSION(0, 23), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(unwritten_extents, BCH_VERSION(0, 24), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(bucket_gens, BCH_VERSION(0, 25), \ -+ BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(lru_v2, BCH_VERSION(0, 26), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(fragmentation_lru, BCH_VERSION(0, 27), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(snapshot_trees, BCH_VERSION(0, 29), \ -+ RECOVERY_PASS_ALL_FSCK) \ -+ x(major_minor, BCH_VERSION(1, 0), \ -+ 0) \ -+ x(snapshot_skiplists, BCH_VERSION(1, 1), \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \ -+ x(deleted_inodes, BCH_VERSION(1, 2), \ -+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) -+ -+enum bcachefs_metadata_version { -+ bcachefs_metadata_version_min = 9, -+#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n, -+ BCH_METADATA_VERSIONS() -+#undef x -+ bcachefs_metadata_version_max -+}; -+ -+static const unsigned 
bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; -+ -+#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) -+ -+#define BCH_SB_SECTOR 8 -+#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ -+ -+struct bch_sb_layout { -+ __uuid_t magic; /* bcachefs superblock UUID */ -+ __u8 layout_type; -+ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ -+ __u8 nr_superblocks; -+ __u8 pad[5]; -+ __le64 sb_offset[61]; -+} __packed __aligned(8); -+ -+#define BCH_SB_LAYOUT_SECTOR 7 -+ -+/* -+ * @offset - sector where this sb was written -+ * @version - on disk format version -+ * @version_min - Oldest metadata version this filesystem contains; so we can -+ * safely drop compatibility code and refuse to mount filesystems -+ * we'd need it for -+ * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC) -+ * @seq - incremented each time superblock is written -+ * @uuid - used for generating various magic numbers and identifying -+ * member devices, never changes -+ * @user_uuid - user visible UUID, may be changed -+ * @label - filesystem label -+ * @seq - identifies most recent superblock, incremented each time -+ * superblock is written -+ * @features - enabled incompatible features -+ */ -+struct bch_sb { -+ struct bch_csum csum; -+ __le16 version; -+ __le16 version_min; -+ __le16 pad[2]; -+ __uuid_t magic; -+ __uuid_t uuid; -+ __uuid_t user_uuid; -+ __u8 label[BCH_SB_LABEL_SIZE]; -+ __le64 offset; -+ __le64 seq; -+ -+ __le16 block_size; -+ __u8 dev_idx; -+ __u8 nr_devices; -+ __le32 u64s; -+ -+ __le64 time_base_lo; -+ __le32 time_base_hi; -+ __le32 time_precision; -+ -+ __le64 flags[8]; -+ __le64 features[2]; -+ __le64 compat[2]; -+ -+ struct bch_sb_layout layout; -+ -+ union { -+ struct bch_sb_field start[0]; -+ __le64 _data[0]; -+ }; -+} __packed __aligned(8); -+ -+/* -+ * Flags: -+ * BCH_SB_INITALIZED - set on first mount -+ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect -+ * behaviour of mount/recovery path: -+ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits -+ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 -+ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides -+ * DATA/META_CSUM_TYPE. 
Also indicates encryption -+ * algorithm in use, if/when we get more than one -+ */ -+ -+LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); -+ -+LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); -+LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); -+LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); -+LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); -+ -+LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); -+ -+LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); -+LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); -+ -+LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); -+LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); -+ -+LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); -+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); -+ -+LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); -+LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); -+LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); -+LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); -+ -+LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); -+LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); -+ -+LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); -+ -+LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); -+LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); -+ -+LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); -+LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); -+ -+/* -+ * Max size of an extent that may require bouncing to read or write -+ * (checksummed, compressed): 64k -+ */ -+LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, -+ struct bch_sb, flags[1], 14, 20); -+ -+LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); -+LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); -+ -+LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); -+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); -+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); -+ -+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO, -+ struct bch_sb, flags[2], 0, 4); -+LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); -+ -+LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); -+LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); -+LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); -+LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); -+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); -+LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); -+LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); -+LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); -+LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); -+LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); -+LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56); -+ -+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60); -+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, -+ struct bch_sb, flags[4], 60, 64); -+ 
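[Editor's note, not part of the patch hunk above or below.] The superblock options in this hunk are all declared through LE64_BITMASK(), whose definition sits earlier in bcachefs_format.h and is not shown in this excerpt. As a reading aid, the following is a minimal hand-expanded sketch of what one such invocation amounts to — a getter/setter pair over a sub-range of bits in a little-endian 64-bit flags word — modelled on BCH_SB_CSUM_TYPE (bits 2..8 of flags[0]) declared above. The EXAMPLE_* names are invented for illustration only.

/*
 * Illustrative approximation of LE64_BITMASK(BCH_SB_CSUM_TYPE,
 * struct bch_sb, flags[0], 2, 8); not the real macro expansion.
 */
static inline __u64 EXAMPLE_SB_CSUM_TYPE(const struct bch_sb *sb)
{
	/* extract bits [2, 8) of the little-endian flags word */
	return (__le64_to_cpu(sb->flags[0]) >> 2) & ~(~0ULL << (8 - 2));
}

static inline void SET_EXAMPLE_SB_CSUM_TYPE(struct bch_sb *sb, __u64 v)
{
	__u64 f = __le64_to_cpu(sb->flags[0]);

	f &= ~(~(~0ULL << (8 - 2)) << 2);	/* clear bits [2, 8) */
	f |= (v & ~(~0ULL << (8 - 2))) << 2;	/* insert the new value */
	sb->flags[0] = __cpu_to_le64(f);
}

Fields too wide for the remaining space in one flags word — compression type, for instance — are split into _LO/_HI halves (flags[1] and flags[4] above); the BCH_SB_COMPRESSION_TYPE() helpers that follow just below recombine them.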
-+LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, -+ struct bch_sb, flags[5], 0, 16); -+ -+static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) -+{ -+ return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4); -+} -+ -+static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) -+{ -+ SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v); -+ SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4); -+} -+ -+static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb) -+{ -+ return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) | -+ (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4); -+} -+ -+static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) -+{ -+ SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v); -+ SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4); -+} -+ -+/* -+ * Features: -+ * -+ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist -+ * reflink: gates KEY_TYPE_reflink -+ * inline_data: gates KEY_TYPE_inline_data -+ * new_siphash: gates BCH_STR_HASH_siphash -+ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE -+ */ -+#define BCH_SB_FEATURES() \ -+ x(lz4, 0) \ -+ x(gzip, 1) \ -+ x(zstd, 2) \ -+ x(atomic_nlink, 3) \ -+ x(ec, 4) \ -+ x(journal_seq_blacklist_v3, 5) \ -+ x(reflink, 6) \ -+ x(new_siphash, 7) \ -+ x(inline_data, 8) \ -+ x(new_extent_overwrite, 9) \ -+ x(incompressible, 10) \ -+ x(btree_ptr_v2, 11) \ -+ x(extents_above_btree_updates, 12) \ -+ x(btree_updates_journalled, 13) \ -+ x(reflink_inline_data, 14) \ -+ x(new_varint, 15) \ -+ x(journal_no_flush, 16) \ -+ x(alloc_v2, 17) \ -+ x(extents_across_btree_nodes, 18) -+ -+#define BCH_SB_FEATURES_ALWAYS \ -+ ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ -+ (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ -+ (1ULL << BCH_FEATURE_btree_updates_journalled)|\ -+ (1ULL << BCH_FEATURE_alloc_v2)|\ -+ (1ULL << BCH_FEATURE_extents_across_btree_nodes)) -+ -+#define BCH_SB_FEATURES_ALL \ -+ (BCH_SB_FEATURES_ALWAYS| \ -+ (1ULL << BCH_FEATURE_new_siphash)| \ -+ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ -+ (1ULL << BCH_FEATURE_new_varint)| \ -+ (1ULL << BCH_FEATURE_journal_no_flush)) -+ -+enum bch_sb_feature { -+#define x(f, n) BCH_FEATURE_##f, -+ BCH_SB_FEATURES() -+#undef x -+ BCH_FEATURE_NR, -+}; -+ -+#define BCH_SB_COMPAT() \ -+ x(alloc_info, 0) \ -+ x(alloc_metadata, 1) \ -+ x(extents_above_btree_updates_done, 2) \ -+ x(bformat_overflow_done, 3) -+ -+enum bch_sb_compat { -+#define x(f, n) BCH_COMPAT_##f, -+ BCH_SB_COMPAT() -+#undef x -+ BCH_COMPAT_NR, -+}; -+ -+/* options: */ -+ -+#define BCH_VERSION_UPGRADE_OPTS() \ -+ x(compatible, 0) \ -+ x(incompatible, 1) \ -+ x(none, 2) -+ -+enum bch_version_upgrade_opts { -+#define x(t, n) BCH_VERSION_UPGRADE_##t = n, -+ BCH_VERSION_UPGRADE_OPTS() -+#undef x -+}; -+ -+#define BCH_REPLICAS_MAX 4U -+ -+#define BCH_BKEY_PTRS_MAX 16U -+ -+#define BCH_ERROR_ACTIONS() \ -+ x(continue, 0) \ -+ x(ro, 1) \ -+ x(panic, 2) -+ -+enum bch_error_actions { -+#define x(t, n) BCH_ON_ERROR_##t = n, -+ BCH_ERROR_ACTIONS() -+#undef x -+ BCH_ON_ERROR_NR -+}; -+ -+#define BCH_STR_HASH_TYPES() \ -+ x(crc32c, 0) \ -+ x(crc64, 1) \ -+ x(siphash_old, 2) \ -+ x(siphash, 3) -+ -+enum bch_str_hash_type { -+#define x(t, n) BCH_STR_HASH_##t = n, -+ BCH_STR_HASH_TYPES() -+#undef x -+ BCH_STR_HASH_NR -+}; -+ -+#define BCH_STR_HASH_OPTS() \ -+ x(crc32c, 0) \ -+ x(crc64, 1) \ -+ x(siphash, 2) -+ -+enum bch_str_hash_opts { -+#define x(t, n) BCH_STR_HASH_OPT_##t = n, -+ BCH_STR_HASH_OPTS() -+#undef x -+ BCH_STR_HASH_OPT_NR -+}; -+ 
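[Editor's note, not part of the patch hunk above or below.] The BCH_*_TYPES()/BCH_*_OPTS() lists in this header all use the same x()-macro idiom: one list is expanded several times with different definitions of x(), so the explicit on-disk numbers stay in a single place. The sketch below is a self-contained illustration of that idiom, not code from the patch; the EXAMPLE_* identifiers and the name table are invented here (bcachefs keeps its option-name tables elsewhere, e.g. in opts.c).

/* One list, expanded twice: once for the enum, once for a name table. */
#define EXAMPLE_OPTS()		\
	x(none,		0)	\
	x(crc32c,	1)	\
	x(crc64,	2)

enum example_opt {
#define x(t, n) EXAMPLE_OPT_##t = n,
	EXAMPLE_OPTS()
#undef x
	EXAMPLE_OPT_NR
};

static const char * const example_opt_names[] = {
#define x(t, n) [n] = #t,
	EXAMPLE_OPTS()
#undef x
	NULL
};

The explicit numbers in each x() entry are the on-disk encoding; keeping them in the list (rather than relying on implicit enum ordering) is what lets entries be appended or listed out of order without changing the format.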
-+#define BCH_CSUM_TYPES() \ -+ x(none, 0) \ -+ x(crc32c_nonzero, 1) \ -+ x(crc64_nonzero, 2) \ -+ x(chacha20_poly1305_80, 3) \ -+ x(chacha20_poly1305_128, 4) \ -+ x(crc32c, 5) \ -+ x(crc64, 6) \ -+ x(xxhash, 7) -+ -+enum bch_csum_type { -+#define x(t, n) BCH_CSUM_##t = n, -+ BCH_CSUM_TYPES() -+#undef x -+ BCH_CSUM_NR -+}; -+ -+static const unsigned bch_crc_bytes[] = { -+ [BCH_CSUM_none] = 0, -+ [BCH_CSUM_crc32c_nonzero] = 4, -+ [BCH_CSUM_crc32c] = 4, -+ [BCH_CSUM_crc64_nonzero] = 8, -+ [BCH_CSUM_crc64] = 8, -+ [BCH_CSUM_xxhash] = 8, -+ [BCH_CSUM_chacha20_poly1305_80] = 10, -+ [BCH_CSUM_chacha20_poly1305_128] = 16, -+}; -+ -+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) -+{ -+ switch (type) { -+ case BCH_CSUM_chacha20_poly1305_80: -+ case BCH_CSUM_chacha20_poly1305_128: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+#define BCH_CSUM_OPTS() \ -+ x(none, 0) \ -+ x(crc32c, 1) \ -+ x(crc64, 2) \ -+ x(xxhash, 3) -+ -+enum bch_csum_opts { -+#define x(t, n) BCH_CSUM_OPT_##t = n, -+ BCH_CSUM_OPTS() -+#undef x -+ BCH_CSUM_OPT_NR -+}; -+ -+#define BCH_COMPRESSION_TYPES() \ -+ x(none, 0) \ -+ x(lz4_old, 1) \ -+ x(gzip, 2) \ -+ x(lz4, 3) \ -+ x(zstd, 4) \ -+ x(incompressible, 5) -+ -+enum bch_compression_type { -+#define x(t, n) BCH_COMPRESSION_TYPE_##t = n, -+ BCH_COMPRESSION_TYPES() -+#undef x -+ BCH_COMPRESSION_TYPE_NR -+}; -+ -+#define BCH_COMPRESSION_OPTS() \ -+ x(none, 0) \ -+ x(lz4, 1) \ -+ x(gzip, 2) \ -+ x(zstd, 3) -+ -+enum bch_compression_opts { -+#define x(t, n) BCH_COMPRESSION_OPT_##t = n, -+ BCH_COMPRESSION_OPTS() -+#undef x -+ BCH_COMPRESSION_OPT_NR -+}; -+ -+/* -+ * Magic numbers -+ * -+ * The various other data structures have their own magic numbers, which are -+ * xored with the first part of the cache set's UUID -+ */ -+ -+#define BCACHE_MAGIC \ -+ UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \ -+ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) -+#define BCHFS_MAGIC \ -+ UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ -+ 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) -+ -+#define BCACHEFS_STATFS_MAGIC 0xca451a4e -+ -+#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) -+#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) -+ -+static inline __le64 __bch2_sb_magic(struct bch_sb *sb) -+{ -+ __le64 ret; -+ -+ memcpy(&ret, &sb->uuid, sizeof(ret)); -+ return ret; -+} -+ -+static inline __u64 __jset_magic(struct bch_sb *sb) -+{ -+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); -+} -+ -+static inline __u64 __bset_magic(struct bch_sb *sb) -+{ -+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); -+} -+ -+/* Journal */ -+ -+#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) -+ -+#define BCH_JSET_ENTRY_TYPES() \ -+ x(btree_keys, 0) \ -+ x(btree_root, 1) \ -+ x(prio_ptrs, 2) \ -+ x(blacklist, 3) \ -+ x(blacklist_v2, 4) \ -+ x(usage, 5) \ -+ x(data_usage, 6) \ -+ x(clock, 7) \ -+ x(dev_usage, 8) \ -+ x(log, 9) \ -+ x(overwrite, 10) -+ -+enum { -+#define x(f, nr) BCH_JSET_ENTRY_##f = nr, -+ BCH_JSET_ENTRY_TYPES() -+#undef x -+ BCH_JSET_ENTRY_NR -+}; -+ -+/* -+ * Journal sequence numbers can be blacklisted: bsets record the max sequence -+ * number of all the journal entries they contain updates for, so that on -+ * recovery we can ignore those bsets that contain index updates newer that what -+ * made it into the journal. 
-+ * -+ * This means that we can't reuse that journal_seq - we have to skip it, and -+ * then record that we skipped it so that the next time we crash and recover we -+ * don't think there was a missing journal entry. -+ */ -+struct jset_entry_blacklist { -+ struct jset_entry entry; -+ __le64 seq; -+}; -+ -+struct jset_entry_blacklist_v2 { -+ struct jset_entry entry; -+ __le64 start; -+ __le64 end; -+}; -+ -+#define BCH_FS_USAGE_TYPES() \ -+ x(reserved, 0) \ -+ x(inodes, 1) \ -+ x(key_version, 2) -+ -+enum { -+#define x(f, nr) BCH_FS_USAGE_##f = nr, -+ BCH_FS_USAGE_TYPES() -+#undef x -+ BCH_FS_USAGE_NR -+}; -+ -+struct jset_entry_usage { -+ struct jset_entry entry; -+ __le64 v; -+} __packed; -+ -+struct jset_entry_data_usage { -+ struct jset_entry entry; -+ __le64 v; -+ struct bch_replicas_entry r; -+} __packed; -+ -+struct jset_entry_clock { -+ struct jset_entry entry; -+ __u8 rw; -+ __u8 pad[7]; -+ __le64 time; -+} __packed; -+ -+struct jset_entry_dev_usage_type { -+ __le64 buckets; -+ __le64 sectors; -+ __le64 fragmented; -+} __packed; -+ -+struct jset_entry_dev_usage { -+ struct jset_entry entry; -+ __le32 dev; -+ __u32 pad; -+ -+ __le64 buckets_ec; -+ __le64 _buckets_unavailable; /* No longer used */ -+ -+ struct jset_entry_dev_usage_type d[]; -+}; -+ -+static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) -+{ -+ return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / -+ sizeof(struct jset_entry_dev_usage_type); -+} -+ -+struct jset_entry_log { -+ struct jset_entry entry; -+ u8 d[]; -+} __packed; -+ -+/* -+ * On disk format for a journal entry: -+ * seq is monotonically increasing; every journal entry has its own unique -+ * sequence number. -+ * -+ * last_seq is the oldest journal entry that still has keys the btree hasn't -+ * flushed to disk yet. -+ * -+ * version is for on disk format changes. 
-+ */ -+struct jset { -+ struct bch_csum csum; -+ -+ __le64 magic; -+ __le64 seq; -+ __le32 version; -+ __le32 flags; -+ -+ __le32 u64s; /* size of d[] in u64s */ -+ -+ __u8 encrypted_start[0]; -+ -+ __le16 _read_clock; /* no longer used */ -+ __le16 _write_clock; -+ -+ /* Sequence number of oldest dirty journal entry */ -+ __le64 last_seq; -+ -+ -+ union { -+ struct jset_entry start[0]; -+ __u64 _data[0]; -+ }; -+} __packed __aligned(8); -+ -+LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); -+LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); -+LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); -+ -+#define BCH_JOURNAL_BUCKETS_MIN 8 -+ -+/* Btree: */ -+ -+enum btree_id_flags { -+ BTREE_ID_EXTENTS = BIT(0), -+ BTREE_ID_SNAPSHOTS = BIT(1), -+ BTREE_ID_DATA = BIT(2), -+}; -+ -+#define BCH_BTREE_IDS() \ -+ x(extents, 0, BTREE_ID_EXTENTS|BTREE_ID_SNAPSHOTS|BTREE_ID_DATA,\ -+ BIT_ULL(KEY_TYPE_whiteout)| \ -+ BIT_ULL(KEY_TYPE_error)| \ -+ BIT_ULL(KEY_TYPE_cookie)| \ -+ BIT_ULL(KEY_TYPE_extent)| \ -+ BIT_ULL(KEY_TYPE_reservation)| \ -+ BIT_ULL(KEY_TYPE_reflink_p)| \ -+ BIT_ULL(KEY_TYPE_inline_data)) \ -+ x(inodes, 1, BTREE_ID_SNAPSHOTS, \ -+ BIT_ULL(KEY_TYPE_whiteout)| \ -+ BIT_ULL(KEY_TYPE_inode)| \ -+ BIT_ULL(KEY_TYPE_inode_v2)| \ -+ BIT_ULL(KEY_TYPE_inode_v3)| \ -+ BIT_ULL(KEY_TYPE_inode_generation)) \ -+ x(dirents, 2, BTREE_ID_SNAPSHOTS, \ -+ BIT_ULL(KEY_TYPE_whiteout)| \ -+ BIT_ULL(KEY_TYPE_hash_whiteout)| \ -+ BIT_ULL(KEY_TYPE_dirent)) \ -+ x(xattrs, 3, BTREE_ID_SNAPSHOTS, \ -+ BIT_ULL(KEY_TYPE_whiteout)| \ -+ BIT_ULL(KEY_TYPE_cookie)| \ -+ BIT_ULL(KEY_TYPE_hash_whiteout)| \ -+ BIT_ULL(KEY_TYPE_xattr)) \ -+ x(alloc, 4, 0, \ -+ BIT_ULL(KEY_TYPE_alloc)| \ -+ BIT_ULL(KEY_TYPE_alloc_v2)| \ -+ BIT_ULL(KEY_TYPE_alloc_v3)| \ -+ BIT_ULL(KEY_TYPE_alloc_v4)) \ -+ x(quotas, 5, 0, \ -+ BIT_ULL(KEY_TYPE_quota)) \ -+ x(stripes, 6, 0, \ -+ BIT_ULL(KEY_TYPE_stripe)) \ -+ x(reflink, 7, BTREE_ID_EXTENTS|BTREE_ID_DATA, \ -+ BIT_ULL(KEY_TYPE_reflink_v)| \ -+ BIT_ULL(KEY_TYPE_indirect_inline_data)) \ -+ x(subvolumes, 8, 0, \ -+ BIT_ULL(KEY_TYPE_subvolume)) \ -+ x(snapshots, 9, 0, \ -+ BIT_ULL(KEY_TYPE_snapshot)) \ -+ x(lru, 10, 0, \ -+ BIT_ULL(KEY_TYPE_set)) \ -+ x(freespace, 11, BTREE_ID_EXTENTS, \ -+ BIT_ULL(KEY_TYPE_set)) \ -+ x(need_discard, 12, 0, \ -+ BIT_ULL(KEY_TYPE_set)) \ -+ x(backpointers, 13, 0, \ -+ BIT_ULL(KEY_TYPE_backpointer)) \ -+ x(bucket_gens, 14, 0, \ -+ BIT_ULL(KEY_TYPE_bucket_gens)) \ -+ x(snapshot_trees, 15, 0, \ -+ BIT_ULL(KEY_TYPE_snapshot_tree)) \ -+ x(deleted_inodes, 16, BTREE_ID_SNAPSHOTS, \ -+ BIT_ULL(KEY_TYPE_set)) -+ -+enum btree_id { -+#define x(name, nr, ...) BTREE_ID_##name = nr, -+ BCH_BTREE_IDS() -+#undef x -+ BTREE_ID_NR -+}; -+ -+#define BTREE_MAX_DEPTH 4U -+ -+/* Btree nodes */ -+ -+/* -+ * Btree nodes -+ * -+ * On disk a btree node is a list/log of these; within each set the keys are -+ * sorted -+ */ -+struct bset { -+ __le64 seq; -+ -+ /* -+ * Highest journal entry this bset contains keys for. -+ * If on recovery we don't see that journal entry, this bset is ignored: -+ * this allows us to preserve the order of all index updates after a -+ * crash, since the journal records a total order of all index updates -+ * and anything that didn't make it to the journal doesn't get used. 
-+ */ -+ __le64 journal_seq; -+ -+ __le32 flags; -+ __le16 version; -+ __le16 u64s; /* count of d[] in u64s */ -+ -+ union { -+ struct bkey_packed start[0]; -+ __u64 _data[0]; -+ }; -+} __packed __aligned(8); -+ -+LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); -+ -+LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); -+LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, -+ struct bset, flags, 5, 6); -+ -+/* Sector offset within the btree node: */ -+LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32); -+ -+struct btree_node { -+ struct bch_csum csum; -+ __le64 magic; -+ -+ /* this flags field is encrypted, unlike bset->flags: */ -+ __le64 flags; -+ -+ /* Closed interval: */ -+ struct bpos min_key; -+ struct bpos max_key; -+ struct bch_extent_ptr _ptr; /* not used anymore */ -+ struct bkey_format format; -+ -+ union { -+ struct bset keys; -+ struct { -+ __u8 pad[22]; -+ __le16 u64s; -+ __u64 _data[0]; -+ -+ }; -+ }; -+} __packed __aligned(8); -+ -+LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4); -+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); -+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, -+ struct btree_node, flags, 8, 9); -+LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25); -+/* 25-32 unused */ -+LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); -+ -+static inline __u64 BTREE_NODE_ID(struct btree_node *n) -+{ -+ return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4); -+} -+ -+static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v) -+{ -+ SET_BTREE_NODE_ID_LO(n, v); -+ SET_BTREE_NODE_ID_HI(n, v >> 4); -+} -+ -+struct btree_node_entry { -+ struct bch_csum csum; -+ -+ union { -+ struct bset keys; -+ struct { -+ __u8 pad[22]; -+ __le16 u64s; -+ __u64 _data[0]; -+ }; -+ }; -+} __packed __aligned(8); -+ -+#endif /* _BCACHEFS_FORMAT_H */ -diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h -new file mode 100644 -index 000000000..f05881f7e ---- /dev/null -+++ b/fs/bcachefs/bcachefs_ioctl.h -@@ -0,0 +1,368 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IOCTL_H -+#define _BCACHEFS_IOCTL_H -+ -+#include -+#include -+#include "bcachefs_format.h" -+ -+/* -+ * Flags common to multiple ioctls: -+ */ -+#define BCH_FORCE_IF_DATA_LOST (1 << 0) -+#define BCH_FORCE_IF_METADATA_LOST (1 << 1) -+#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) -+#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) -+ -+#define BCH_FORCE_IF_LOST \ -+ (BCH_FORCE_IF_DATA_LOST| \ -+ BCH_FORCE_IF_METADATA_LOST) -+#define BCH_FORCE_IF_DEGRADED \ -+ (BCH_FORCE_IF_DATA_DEGRADED| \ -+ BCH_FORCE_IF_METADATA_DEGRADED) -+ -+/* -+ * If cleared, ioctl that refer to a device pass it as a pointer to a pathname -+ * (e.g. 
/dev/sda1); if set, the dev field is the device's index within the -+ * filesystem: -+ */ -+#define BCH_BY_INDEX (1 << 4) -+ -+/* -+ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem -+ * wide superblock: -+ */ -+#define BCH_READ_DEV (1 << 5) -+ -+/* global control dev: */ -+ -+/* These are currently broken, and probably unnecessary: */ -+#if 0 -+#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) -+#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) -+ -+struct bch_ioctl_assemble { -+ __u32 flags; -+ __u32 nr_devs; -+ __u64 pad; -+ __u64 devs[]; -+}; -+ -+struct bch_ioctl_incremental { -+ __u32 flags; -+ __u64 pad; -+ __u64 dev; -+}; -+#endif -+ -+/* filesystem ioctls: */ -+ -+#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) -+ -+/* These only make sense when we also have incremental assembly */ -+#if 0 -+#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) -+#define BCH_IOCTL_STOP _IO(0xbc, 3) -+#endif -+ -+#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) -+#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) -+#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) -+#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) -+#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) -+#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) -+#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) -+#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) -+ -+#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) -+#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) -+ -+/* ioctl below act on a particular file, not the filesystem as a whole: */ -+ -+#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) -+ -+/* -+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID -+ * -+ * Returns user visible UUID, not internal UUID (which may not ever be changed); -+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with -+ * this UUID. -+ */ -+struct bch_ioctl_query_uuid { -+ __uuid_t uuid; -+}; -+ -+#if 0 -+struct bch_ioctl_start { -+ __u32 flags; -+ __u32 pad; -+}; -+#endif -+ -+/* -+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem -+ * -+ * The specified device must not be open or in use. On success, the new device -+ * will be an online member of the filesystem just like any other member. -+ * -+ * The device must first be prepared by userspace by formatting with a bcachefs -+ * superblock, which is only used for passing in superblock options/parameters -+ * for that device (in struct bch_member). The new device's superblock should -+ * not claim to be a member of any existing filesystem - UUIDs on it will be -+ * ignored. -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem -+ * -+ * Any data present on @dev will be permanently deleted, and @dev will be -+ * removed from its slot in the filesystem's list of member devices. The device -+ * may be either offline or offline. 
-+ * -+ * Will fail removing @dev would leave us with insufficient read write devices -+ * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are -+ * set. -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem -+ * but is not open (e.g. because we started in degraded mode), bring it online -+ * -+ * all existing data on @dev will be available once the device is online, -+ * exactly as if @dev was present when the filesystem was first mounted -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that -+ * block device, without removing it from the filesystem (so it can be brought -+ * back online later) -+ * -+ * Data present on @dev will be unavailable while @dev is offline (unless -+ * replicated), but will still be intact and untouched if @dev is brought back -+ * online -+ * -+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would -+ * leave us with insufficient read write devices or degraded/unavailable data, -+ * unless the approprate BCH_FORCE_IF_* flags are set. -+ */ -+ -+struct bch_ioctl_disk { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem -+ * -+ * @new_state - one of the bch_member_state states (rw, ro, failed, -+ * spare) -+ * -+ * Will refuse to change member state if we would then have insufficient devices -+ * to write to, or if it would result in degraded data (when @new_state is -+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. -+ */ -+struct bch_ioctl_disk_set_state { -+ __u32 flags; -+ __u8 new_state; -+ __u8 pad[3]; -+ __u64 dev; -+}; -+ -+enum bch_data_ops { -+ BCH_DATA_OP_SCRUB = 0, -+ BCH_DATA_OP_REREPLICATE = 1, -+ BCH_DATA_OP_MIGRATE = 2, -+ BCH_DATA_OP_REWRITE_OLD_NODES = 3, -+ BCH_DATA_OP_NR = 4, -+}; -+ -+/* -+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. -+ * scrub, rereplicate, migrate). -+ * -+ * This ioctl kicks off a job in the background, and returns a file descriptor. -+ * Reading from the file descriptor returns a struct bch_ioctl_data_event, -+ * indicating current progress, and closing the file descriptor will stop the -+ * job. The file descriptor is O_CLOEXEC. 
-+ */ -+struct bch_ioctl_data { -+ __u16 op; -+ __u8 start_btree; -+ __u8 end_btree; -+ __u32 flags; -+ -+ struct bpos start_pos; -+ struct bpos end_pos; -+ -+ union { -+ struct { -+ __u32 dev; -+ __u32 pad; -+ } migrate; -+ struct { -+ __u64 pad[8]; -+ }; -+ }; -+} __packed __aligned(8); -+ -+enum bch_data_event { -+ BCH_DATA_EVENT_PROGRESS = 0, -+ /* XXX: add an event for reporting errors */ -+ BCH_DATA_EVENT_NR = 1, -+}; -+ -+struct bch_ioctl_data_progress { -+ __u8 data_type; -+ __u8 btree_id; -+ __u8 pad[2]; -+ struct bpos pos; -+ -+ __u64 sectors_done; -+ __u64 sectors_total; -+} __packed __aligned(8); -+ -+struct bch_ioctl_data_event { -+ __u8 type; -+ __u8 pad[7]; -+ union { -+ struct bch_ioctl_data_progress p; -+ __u64 pad2[15]; -+ }; -+} __packed __aligned(8); -+ -+struct bch_replicas_usage { -+ __u64 sectors; -+ struct bch_replicas_entry r; -+} __packed; -+ -+static inline struct bch_replicas_usage * -+replicas_usage_next(struct bch_replicas_usage *u) -+{ -+ return (void *) u + replicas_entry_bytes(&u->r) + 8; -+} -+ -+/* -+ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage -+ * -+ * Returns disk space usage broken out by data type, number of replicas, and -+ * by component device -+ * -+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries -+ * -+ * On success, @replica_entries_bytes will be changed to indicate the number of -+ * bytes actually used. -+ * -+ * Returns -ERANGE if @replica_entries_bytes was too small -+ */ -+struct bch_ioctl_fs_usage { -+ __u64 capacity; -+ __u64 used; -+ __u64 online_reserved; -+ __u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ -+ __u32 replica_entries_bytes; -+ __u32 pad; -+ -+ struct bch_replicas_usage replicas[0]; -+}; -+ -+/* -+ * BCH_IOCTL_DEV_USAGE: query device disk space usage -+ * -+ * Returns disk space usage broken out by data type - both by buckets and -+ * sectors. 
-+ */ -+struct bch_ioctl_dev_usage { -+ __u64 dev; -+ __u32 flags; -+ __u8 state; -+ __u8 pad[7]; -+ -+ __u32 bucket_size; -+ __u64 nr_buckets; -+ -+ __u64 buckets_ec; -+ -+ struct bch_ioctl_dev_usage_type { -+ __u64 buckets; -+ __u64 sectors; -+ __u64 fragmented; -+ } d[BCH_DATA_NR]; -+}; -+ -+/* -+ * BCH_IOCTL_READ_SUPER: read filesystem superblock -+ * -+ * Equivalent to reading the superblock directly from the block device, except -+ * avoids racing with the kernel writing the superblock or having to figure out -+ * which block device to read -+ * -+ * @sb - buffer to read into -+ * @size - size of userspace allocated buffer -+ * @dev - device to read superblock for, if BCH_READ_DEV flag is -+ * specified -+ * -+ * Returns -ERANGE if buffer provided is too small -+ */ -+struct bch_ioctl_read_super { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+ __u64 size; -+ __u64 sb; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to -+ * determine if disk is a (online) member - if so, returns device's index -+ * -+ * Returns -ENOENT if not found -+ */ -+struct bch_ioctl_disk_get_idx { -+ __u64 dev; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device -+ * -+ * @dev - member to resize -+ * @nbuckets - new number of buckets -+ */ -+struct bch_ioctl_disk_resize { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+ __u64 nbuckets; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device -+ * -+ * @dev - member to resize -+ * @nbuckets - new number of buckets -+ */ -+struct bch_ioctl_disk_resize_journal { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+ __u64 nbuckets; -+}; -+ -+struct bch_ioctl_subvolume { -+ __u32 flags; -+ __u32 dirfd; -+ __u16 mode; -+ __u16 pad[3]; -+ __u64 dst_ptr; -+ __u64 src_ptr; -+}; -+ -+#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) -+#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) -+ -+#endif /* _BCACHEFS_IOCTL_H */ -diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c -new file mode 100644 -index 000000000..0a5bfe6e9 ---- /dev/null -+++ b/fs/bcachefs/bkey.c -@@ -0,0 +1,1107 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey.h" -+#include "bkey_cmp.h" -+#include "bkey_methods.h" -+#include "bset.h" -+#include "util.h" -+ -+const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; -+ -+void bch2_bkey_packed_to_binary_text(struct printbuf *out, -+ const struct bkey_format *f, -+ const struct bkey_packed *k) -+{ -+ const u64 *p = high_word(f, k); -+ unsigned word_bits = 64 - high_bit_offset; -+ unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset; -+ u64 v = *p & (~0ULL >> high_bit_offset); -+ -+ if (!nr_key_bits) { -+ prt_str(out, "(empty)"); -+ return; -+ } -+ -+ while (1) { -+ unsigned next_key_bits = nr_key_bits; -+ -+ if (nr_key_bits < 64) { -+ v >>= 64 - nr_key_bits; -+ next_key_bits = 0; -+ } else { -+ next_key_bits -= 64; -+ } -+ -+ bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits)); -+ -+ if (!next_key_bits) -+ break; -+ -+ prt_char(out, ' '); -+ -+ p = next_word(p); -+ v = *p; -+ word_bits = 64; -+ nr_key_bits = next_key_bits; -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+static void bch2_bkey_pack_verify(const struct bkey_packed *packed, -+ const struct bkey *unpacked, -+ const struct bkey_format *format) -+{ -+ struct bkey tmp; -+ -+ BUG_ON(bkeyp_val_u64s(format, packed) != -+ bkey_val_u64s(unpacked)); -+ -+ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); -+ -+ tmp = __bch2_bkey_unpack_key(format, packed); -+ -+ if (memcmp(&tmp, 
unpacked, sizeof(struct bkey))) { -+ struct printbuf buf = PRINTBUF; -+ -+ prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n", -+ format->key_u64s, -+ format->bits_per_field[0], -+ format->bits_per_field[1], -+ format->bits_per_field[2], -+ format->bits_per_field[3], -+ format->bits_per_field[4]); -+ -+ prt_printf(&buf, "compiled unpack: "); -+ bch2_bkey_to_text(&buf, unpacked); -+ prt_newline(&buf); -+ -+ prt_printf(&buf, "c unpack: "); -+ bch2_bkey_to_text(&buf, &tmp); -+ prt_newline(&buf); -+ -+ prt_printf(&buf, "compiled unpack: "); -+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, -+ (struct bkey_packed *) unpacked); -+ prt_newline(&buf); -+ -+ prt_printf(&buf, "c unpack: "); -+ bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, -+ (struct bkey_packed *) &tmp); -+ prt_newline(&buf); -+ -+ panic("%s", buf.buf); -+ } -+} -+ -+#else -+static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, -+ const struct bkey *unpacked, -+ const struct bkey_format *format) {} -+#endif -+ -+struct pack_state { -+ const struct bkey_format *format; -+ unsigned bits; /* bits remaining in current word */ -+ u64 w; /* current word */ -+ u64 *p; /* pointer to next word */ -+}; -+ -+__always_inline -+static struct pack_state pack_state_init(const struct bkey_format *format, -+ struct bkey_packed *k) -+{ -+ u64 *p = high_word(format, k); -+ -+ return (struct pack_state) { -+ .format = format, -+ .bits = 64 - high_bit_offset, -+ .w = 0, -+ .p = p, -+ }; -+} -+ -+__always_inline -+static void pack_state_finish(struct pack_state *state, -+ struct bkey_packed *k) -+{ -+ EBUG_ON(state->p < k->_data); -+ EBUG_ON(state->p >= k->_data + state->format->key_u64s); -+ -+ *state->p = state->w; -+} -+ -+struct unpack_state { -+ const struct bkey_format *format; -+ unsigned bits; /* bits remaining in current word */ -+ u64 w; /* current word */ -+ const u64 *p; /* pointer to next word */ -+}; -+ -+__always_inline -+static struct unpack_state unpack_state_init(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ const u64 *p = high_word(format, k); -+ -+ return (struct unpack_state) { -+ .format = format, -+ .bits = 64 - high_bit_offset, -+ .w = *p << high_bit_offset, -+ .p = p, -+ }; -+} -+ -+__always_inline -+static u64 get_inc_field(struct unpack_state *state, unsigned field) -+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); -+ -+ if (bits >= state->bits) { -+ v = state->w >> (64 - bits); -+ bits -= state->bits; -+ -+ state->p = next_word(state->p); -+ state->w = *state->p; -+ state->bits = 64; -+ } -+ -+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ -+ v |= (state->w >> 1) >> (63 - bits); -+ state->w <<= bits; -+ state->bits -= bits; -+ -+ return v + offset; -+} -+ -+__always_inline -+static void __set_inc_field(struct pack_state *state, unsigned field, u64 v) -+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ -+ if (bits) { -+ if (bits > state->bits) { -+ bits -= state->bits; -+ /* avoid shift by 64 if bits is 64 - bits is never 0 here: */ -+ state->w |= (v >> 1) >> (bits - 1); -+ -+ *state->p = state->w; -+ state->p = next_word(state->p); -+ state->w = 0; -+ state->bits = 64; -+ } -+ -+ state->bits -= bits; -+ state->w |= v << state->bits; -+ } -+} -+ -+__always_inline -+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) -+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 offset = 
le64_to_cpu(state->format->field_offset[field]); -+ -+ if (v < offset) -+ return false; -+ -+ v -= offset; -+ -+ if (fls64(v) > bits) -+ return false; -+ -+ __set_inc_field(state, field, v); -+ return true; -+} -+ -+/* -+ * Note: does NOT set out->format (we don't know what it should be here!) -+ * -+ * Also: doesn't work on extents - it doesn't preserve the invariant that -+ * if k is packed bkey_start_pos(k) will successfully pack -+ */ -+static bool bch2_bkey_transform_key(const struct bkey_format *out_f, -+ struct bkey_packed *out, -+ const struct bkey_format *in_f, -+ const struct bkey_packed *in) -+{ -+ struct pack_state out_s = pack_state_init(out_f, out); -+ struct unpack_state in_s = unpack_state_init(in_f, in); -+ u64 *w = out->_data; -+ unsigned i; -+ -+ *w = 0; -+ -+ for (i = 0; i < BKEY_NR_FIELDS; i++) -+ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) -+ return false; -+ -+ /* Can't happen because the val would be too big to unpack: */ -+ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); -+ -+ pack_state_finish(&out_s, out); -+ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; -+ out->needs_whiteout = in->needs_whiteout; -+ out->type = in->type; -+ -+ return true; -+} -+ -+bool bch2_bkey_transform(const struct bkey_format *out_f, -+ struct bkey_packed *out, -+ const struct bkey_format *in_f, -+ const struct bkey_packed *in) -+{ -+ if (!bch2_bkey_transform_key(out_f, out, in_f, in)) -+ return false; -+ -+ memcpy_u64s((u64 *) out + out_f->key_u64s, -+ (u64 *) in + in_f->key_u64s, -+ (in->u64s - in_f->key_u64s)); -+ return true; -+} -+ -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, -+ const struct bkey_packed *in) -+{ -+ struct unpack_state state = unpack_state_init(format, in); -+ struct bkey out; -+ -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->u64s < format->key_u64s); -+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); -+ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); -+ -+ out.u64s = BKEY_U64s + in->u64s - format->key_u64s; -+ out.format = KEY_FORMAT_CURRENT; -+ out.needs_whiteout = in->needs_whiteout; -+ out.type = in->type; -+ out.pad[0] = 0; -+ -+#define x(id, field) out.field = get_inc_field(&state, id); -+ bkey_fields() -+#undef x -+ -+ return out; -+} -+ -+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -+struct bpos __bkey_unpack_pos(const struct bkey_format *format, -+ const struct bkey_packed *in) -+{ -+ struct unpack_state state = unpack_state_init(format, in); -+ struct bpos out; -+ -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->u64s < format->key_u64s); -+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); -+ -+ out.inode = get_inc_field(&state, BKEY_FIELD_INODE); -+ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); -+ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); -+ -+ return out; -+} -+#endif -+ -+/** -+ * bch2_bkey_pack_key -- pack just the key, not the value -+ */ -+bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, -+ const struct bkey_format *format) -+{ -+ struct pack_state state = pack_state_init(format, out); -+ u64 *w = out->_data; -+ -+ EBUG_ON((void *) in == (void *) out); -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->format != KEY_FORMAT_CURRENT); -+ -+ *w = 0; -+ -+#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; -+ bkey_fields() -+#undef x -+ pack_state_finish(&state, out); -+ out->u64s = format->key_u64s + in->u64s - BKEY_U64s; -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ 
out->needs_whiteout = in->needs_whiteout; -+ out->type = in->type; -+ -+ bch2_bkey_pack_verify(out, in, format); -+ return true; -+} -+ -+/** -+ * bch2_bkey_unpack -- unpack the key and the value -+ */ -+void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, -+ const struct bkey_packed *src) -+{ -+ __bkey_unpack_key(b, &dst->k, src); -+ -+ memcpy_u64s(&dst->v, -+ bkeyp_val(&b->format, src), -+ bkeyp_val_u64s(&b->format, src)); -+} -+ -+/** -+ * bch2_bkey_pack -- pack the key and the value -+ */ -+bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, -+ const struct bkey_format *format) -+{ -+ struct bkey_packed tmp; -+ -+ if (!bch2_bkey_pack_key(&tmp, &in->k, format)) -+ return false; -+ -+ memmove_u64s((u64 *) out + format->key_u64s, -+ &in->v, -+ bkey_val_u64s(&in->k)); -+ memcpy_u64s_small(out, &tmp, format->key_u64s); -+ -+ return true; -+} -+ -+__always_inline -+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) -+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 offset = le64_to_cpu(state->format->field_offset[field]); -+ bool ret = true; -+ -+ EBUG_ON(v < offset); -+ v -= offset; -+ -+ if (fls64(v) > bits) { -+ v = ~(~0ULL << bits); -+ ret = false; -+ } -+ -+ __set_inc_field(state, field, v); -+ return ret; -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+static bool bkey_packed_successor(struct bkey_packed *out, -+ const struct btree *b, -+ struct bkey_packed k) -+{ -+ const struct bkey_format *f = &b->format; -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned first_bit, offset; -+ u64 *p; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); -+ -+ if (!nr_key_bits) -+ return false; -+ -+ *out = k; -+ -+ first_bit = high_bit_offset + nr_key_bits - 1; -+ p = nth_word(high_word(f, out), first_bit >> 6); -+ offset = 63 - (first_bit & 63); -+ -+ while (nr_key_bits) { -+ unsigned bits = min(64 - offset, nr_key_bits); -+ u64 mask = (~0ULL >> (64 - bits)) << offset; -+ -+ if ((*p & mask) != mask) { -+ *p += 1ULL << offset; -+ EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); -+ return true; -+ } -+ -+ *p &= ~mask; -+ p = prev_word(p); -+ nr_key_bits -= bits; -+ offset = 0; -+ } -+ -+ return false; -+} -+ -+static bool bkey_format_has_too_big_fields(const struct bkey_format *f) -+{ -+ for (unsigned i = 0; i < f->nr_fields; i++) { -+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; -+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); -+ u64 packed_max = f->bits_per_field[i] -+ ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) -+ : 0; -+ u64 field_offset = le64_to_cpu(f->field_offset[i]); -+ -+ if (packed_max + field_offset < packed_max || -+ packed_max + field_offset > unpacked_max) -+ return true; -+ } -+ -+ return false; -+} -+#endif -+ -+/* -+ * Returns a packed key that compares <= in -+ * -+ * This is used in bset_search_tree(), where we need a packed pos in order to be -+ * able to compare against the keys in the auxiliary search tree - and it's -+ * legal to use a packed pos that isn't equivalent to the original pos, -+ * _provided_ it compares <= to the original pos. 
-+ */ -+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, -+ struct bpos in, -+ const struct btree *b) -+{ -+ const struct bkey_format *f = &b->format; -+ struct pack_state state = pack_state_init(f, out); -+ u64 *w = out->_data; -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bpos orig = in; -+#endif -+ bool exact = true; -+ unsigned i; -+ -+ /* -+ * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3 -+ * byte header, but pack_pos() won't if the len/version fields are big -+ * enough - we need to make sure to zero them out: -+ */ -+ for (i = 0; i < f->key_u64s; i++) -+ w[i] = 0; -+ -+ if (unlikely(in.snapshot < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { -+ if (!in.offset-- && -+ !in.inode--) -+ return BKEY_PACK_POS_FAIL; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (unlikely(in.offset < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { -+ if (!in.inode--) -+ return BKEY_PACK_POS_FAIL; -+ in.offset = KEY_OFFSET_MAX; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (unlikely(in.inode < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) -+ return BKEY_PACK_POS_FAIL; -+ -+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) { -+ in.offset = KEY_OFFSET_MAX; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) { -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))) -+ exact = false; -+ -+ pack_state_finish(&state, out); -+ out->u64s = f->key_u64s; -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ out->type = KEY_TYPE_deleted; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ if (exact) { -+ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); -+ } else { -+ struct bkey_packed successor; -+ -+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); -+ BUG_ON(bkey_packed_successor(&successor, b, *out) && -+ bkey_cmp_left_packed(b, &successor, &orig) < 0 && -+ !bkey_format_has_too_big_fields(f)); -+ } -+#endif -+ -+ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; -+} -+ -+void bch2_bkey_format_init(struct bkey_format_state *s) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) -+ s->field_min[i] = U64_MAX; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_max); i++) -+ s->field_max[i] = 0; -+ -+ /* Make sure we can store a size of 0: */ -+ s->field_min[BKEY_FIELD_SIZE] = 0; -+} -+ -+void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) -+{ -+ unsigned field = 0; -+ -+ __bkey_format_add(s, field++, p.inode); -+ __bkey_format_add(s, field++, p.offset); -+ __bkey_format_add(s, field++, p.snapshot); -+} -+ -+/* -+ * We don't want it to be possible for the packed format to represent fields -+ * bigger than a u64... that will cause confusion and issues (like with -+ * bkey_packed_successor()) -+ */ -+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, -+ unsigned bits, u64 offset) -+{ -+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; -+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); -+ -+ bits = min(bits, unpacked_bits); -+ -+ offset = bits == unpacked_bits ? 
0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); -+ -+ f->bits_per_field[i] = bits; -+ f->field_offset[i] = cpu_to_le64(offset); -+} -+ -+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) -+{ -+ unsigned i, bits = KEY_PACKED_BITS_START; -+ struct bkey_format ret = { -+ .nr_fields = BKEY_NR_FIELDS, -+ }; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { -+ s->field_min[i] = min(s->field_min[i], s->field_max[i]); -+ -+ set_format_field(&ret, i, -+ fls64(s->field_max[i] - s->field_min[i]), -+ s->field_min[i]); -+ -+ bits += ret.bits_per_field[i]; -+ } -+ -+ /* allow for extent merging: */ -+ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { -+ unsigned b = min(4U, 32U - ret.bits_per_field[BKEY_FIELD_SIZE]); -+ -+ ret.bits_per_field[BKEY_FIELD_SIZE] += b; -+ bits += b; -+ } -+ -+ ret.key_u64s = DIV_ROUND_UP(bits, 64); -+ -+ /* if we have enough spare bits, round fields up to nearest byte */ -+ bits = ret.key_u64s * 64 - bits; -+ -+ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { -+ unsigned r = round_up(ret.bits_per_field[i], 8) - -+ ret.bits_per_field[i]; -+ -+ if (r <= bits) { -+ set_format_field(&ret, i, -+ ret.bits_per_field[i] + r, -+ le64_to_cpu(ret.field_offset[i])); -+ bits -= r; -+ } -+ } -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ { -+ struct printbuf buf = PRINTBUF; -+ -+ BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); -+ printbuf_exit(&buf); -+ } -+#endif -+ return ret; -+} -+ -+int bch2_bkey_format_invalid(struct bch_fs *c, -+ struct bkey_format *f, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ unsigned i, bits = KEY_PACKED_BITS_START; -+ -+ if (f->nr_fields != BKEY_NR_FIELDS) { -+ prt_printf(err, "incorrect number of fields: got %u, should be %u", -+ f->nr_fields, BKEY_NR_FIELDS); -+ return -BCH_ERR_invalid; -+ } -+ -+ /* -+ * Verify that the packed format can't represent fields larger than the -+ * unpacked format: -+ */ -+ for (i = 0; i < f->nr_fields; i++) { -+ if (!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) { -+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; -+ u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); -+ u64 packed_max = f->bits_per_field[i] -+ ? 
~((~0ULL << 1) << (f->bits_per_field[i] - 1)) -+ : 0; -+ u64 field_offset = le64_to_cpu(f->field_offset[i]); -+ -+ if (packed_max + field_offset < packed_max || -+ packed_max + field_offset > unpacked_max) { -+ prt_printf(err, "field %u too large: %llu + %llu > %llu", -+ i, packed_max, field_offset, unpacked_max); -+ return -BCH_ERR_invalid; -+ } -+ } -+ -+ bits += f->bits_per_field[i]; -+ } -+ -+ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) { -+ prt_printf(err, "incorrect key_u64s: got %u, should be %u", -+ f->key_u64s, DIV_ROUND_UP(bits, 64)); -+ return -BCH_ERR_invalid; -+ } -+ -+ return 0; -+} -+ -+void bch2_bkey_format_to_text(struct printbuf *out, const struct bkey_format *f) -+{ -+ prt_printf(out, "u64s %u fields ", f->key_u64s); -+ -+ for (unsigned i = 0; i < ARRAY_SIZE(f->bits_per_field); i++) { -+ if (i) -+ prt_str(out, ", "); -+ prt_printf(out, "%u:%llu", -+ f->bits_per_field[i], -+ le64_to_cpu(f->field_offset[i])); -+ } -+} -+ -+/* -+ * Most significant differing bit -+ * Bits are indexed from 0 - return is [0, nr_key_bits) -+ */ -+__pure -+unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, -+ const struct bkey_packed *l_k, -+ const struct bkey_packed *r_k) -+{ -+ const u64 *l = high_word(&b->format, l_k); -+ const u64 *r = high_word(&b->format, r_k); -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned word_bits = 64 - high_bit_offset; -+ u64 l_v, r_v; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); -+ -+ /* for big endian, skip past header */ -+ l_v = *l & (~0ULL >> high_bit_offset); -+ r_v = *r & (~0ULL >> high_bit_offset); -+ -+ while (nr_key_bits) { -+ if (nr_key_bits < word_bits) { -+ l_v >>= word_bits - nr_key_bits; -+ r_v >>= word_bits - nr_key_bits; -+ nr_key_bits = 0; -+ } else { -+ nr_key_bits -= word_bits; -+ } -+ -+ if (l_v != r_v) -+ return fls64(l_v ^ r_v) - 1 + nr_key_bits; -+ -+ l = next_word(l); -+ r = next_word(r); -+ -+ l_v = *l; -+ r_v = *r; -+ word_bits = 64; -+ } -+ -+ return 0; -+} -+ -+/* -+ * First set bit -+ * Bits are indexed from 0 - return is [0, nr_key_bits) -+ */ -+__pure -+unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) -+{ -+ const u64 *p = high_word(&b->format, k); -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned ret = 0, offset; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); -+ -+ offset = nr_key_bits; -+ while (offset > 64) { -+ p = next_word(p); -+ offset -= 64; -+ } -+ -+ offset = 64 - offset; -+ -+ while (nr_key_bits) { -+ unsigned bits = nr_key_bits + offset < 64 -+ ? 
nr_key_bits -+ : 64 - offset; -+ -+ u64 mask = (~0ULL >> (64 - bits)) << offset; -+ -+ if (*p & mask) -+ return ret + __ffs64(*p & mask) - offset; -+ -+ p = prev_word(p); -+ nr_key_bits -= bits; -+ ret += bits; -+ offset = 0; -+ } -+ -+ return 0; -+} -+ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ -+#define I(_x) (*(out)++ = (_x)) -+#define I1(i0) I(i0) -+#define I2(i0, i1) (I1(i0), I(i1)) -+#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) -+#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) -+#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) -+ -+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, -+ enum bch_bkey_fields field, -+ unsigned dst_offset, unsigned dst_size, -+ bool *eax_zeroed) -+{ -+ unsigned bits = format->bits_per_field[field]; -+ u64 offset = le64_to_cpu(format->field_offset[field]); -+ unsigned i, byte, bit_offset, align, shl, shr; -+ -+ if (!bits && !offset) { -+ if (!*eax_zeroed) { -+ /* xor eax, eax */ -+ I2(0x31, 0xc0); -+ } -+ -+ *eax_zeroed = true; -+ goto set_field; -+ } -+ -+ if (!bits) { -+ /* just return offset: */ -+ -+ switch (dst_size) { -+ case 8: -+ if (offset > S32_MAX) { -+ /* mov [rdi + dst_offset], offset */ -+ I3(0xc7, 0x47, dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ -+ I3(0xc7, 0x47, dst_offset + 4); -+ memcpy(out, (void *) &offset + 4, 4); -+ out += 4; -+ } else { -+ /* mov [rdi + dst_offset], offset */ -+ /* sign extended */ -+ I4(0x48, 0xc7, 0x47, dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } -+ break; -+ case 4: -+ /* mov [rdi + dst_offset], offset */ -+ I3(0xc7, 0x47, dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ break; -+ default: -+ BUG(); -+ } -+ -+ return out; -+ } -+ -+ bit_offset = format->key_u64s * 64; -+ for (i = 0; i <= field; i++) -+ bit_offset -= format->bits_per_field[i]; -+ -+ byte = bit_offset / 8; -+ bit_offset -= byte * 8; -+ -+ *eax_zeroed = false; -+ -+ if (bit_offset == 0 && bits == 8) { -+ /* movzx eax, BYTE PTR [rsi + imm8] */ -+ I4(0x0f, 0xb6, 0x46, byte); -+ } else if (bit_offset == 0 && bits == 16) { -+ /* movzx eax, WORD PTR [rsi + imm8] */ -+ I4(0x0f, 0xb7, 0x46, byte); -+ } else if (bit_offset + bits <= 32) { -+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); -+ byte -= align; -+ bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 32); -+ -+ /* mov eax, [rsi + imm8] */ -+ I3(0x8b, 0x46, byte); -+ -+ if (bit_offset) { -+ /* shr eax, imm8 */ -+ I3(0xc1, 0xe8, bit_offset); -+ } -+ -+ if (bit_offset + bits < 32) { -+ unsigned mask = ~0U >> (32 - bits); -+ -+ /* and eax, imm32 */ -+ I1(0x25); -+ memcpy(out, &mask, 4); -+ out += 4; -+ } -+ } else if (bit_offset + bits <= 64) { -+ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); -+ byte -= align; -+ bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 64); -+ -+ /* mov rax, [rsi + imm8] */ -+ I4(0x48, 0x8b, 0x46, byte); -+ -+ shl = 64 - bit_offset - bits; -+ shr = bit_offset + shl; -+ -+ if (shl) { -+ /* shl rax, imm8 */ -+ I4(0x48, 0xc1, 0xe0, shl); -+ } -+ -+ if (shr) { -+ /* shr rax, imm8 */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ } -+ } else { -+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); -+ byte -= align; -+ bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 96); -+ -+ /* mov rax, [rsi + byte] */ -+ I4(0x48, 0x8b, 0x46, byte); -+ -+ /* mov edx, [rsi + byte + 8] */ -+ I3(0x8b, 0x56, byte + 8); -+ -+ /* bits from next word: */ -+ shr = bit_offset + bits - 64; -+ BUG_ON(shr > bit_offset); -+ -+ /* shr rax, bit_offset */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ -+ 
/* shl rdx, imm8 */ -+ I4(0x48, 0xc1, 0xe2, 64 - shr); -+ -+ /* or rax, rdx */ -+ I3(0x48, 0x09, 0xd0); -+ -+ shr = bit_offset - shr; -+ -+ if (shr) { -+ /* shr rax, imm8 */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ } -+ } -+ -+ /* rax += offset: */ -+ if (offset > S32_MAX) { -+ /* mov rdx, imm64 */ -+ I2(0x48, 0xba); -+ memcpy(out, &offset, 8); -+ out += 8; -+ /* add %rdx, %rax */ -+ I3(0x48, 0x01, 0xd0); -+ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { -+ /* add rax, imm32 */ -+ I2(0x48, 0x05); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } else if (offset) { -+ /* add eax, imm32 */ -+ I1(0x05); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } -+set_field: -+ switch (dst_size) { -+ case 8: -+ /* mov [rdi + dst_offset], rax */ -+ I4(0x48, 0x89, 0x47, dst_offset); -+ break; -+ case 4: -+ /* mov [rdi + dst_offset], eax */ -+ I3(0x89, 0x47, dst_offset); -+ break; -+ default: -+ BUG(); -+ } -+ -+ return out; -+} -+ -+int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) -+{ -+ bool eax_zeroed = false; -+ u8 *out = _out; -+ -+ /* -+ * rdi: dst - unpacked key -+ * rsi: src - packed key -+ */ -+ -+ /* k->u64s, k->format, k->type */ -+ -+ /* mov eax, [rsi] */ -+ I2(0x8b, 0x06); -+ -+ /* add eax, BKEY_U64s - format->key_u64s */ -+ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); -+ -+ /* and eax, imm32: mask out k->pad: */ -+ I5(0x25, 0xff, 0xff, 0xff, 0); -+ -+ /* mov [rdi], eax */ -+ I2(0x89, 0x07); -+ -+#define x(id, field) \ -+ out = compile_bkey_field(format, out, id, \ -+ offsetof(struct bkey, field), \ -+ sizeof(((struct bkey *) NULL)->field), \ -+ &eax_zeroed); -+ bkey_fields() -+#undef x -+ -+ /* retq */ -+ I1(0xc3); -+ -+ return (void *) out - _out; -+} -+ -+#else -+#endif -+ -+__pure -+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, -+ const struct bkey_packed *r, -+ const struct btree *b) -+{ -+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); -+} -+ -+__pure __flatten -+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); -+} -+ -+__pure __flatten -+int bch2_bkey_cmp_packed(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r) -+{ -+ return bch2_bkey_cmp_packed_inlined(b, l, r); -+} -+ -+__pure __flatten -+int __bch2_bkey_cmp_left_packed(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ const struct bkey *l_unpacked; -+ -+ return unlikely(l_unpacked = packed_to_bkey_c(l)) -+ ? bpos_cmp(l_unpacked->p, *r) -+ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -+} -+ -+void bch2_bpos_swab(struct bpos *p) -+{ -+ u8 *l = (u8 *) p; -+ u8 *h = ((u8 *) &p[1]) - 1; -+ -+ while (l < h) { -+ swap(*l, *h); -+ l++; -+ --h; -+ } -+} -+ -+void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) -+{ -+ const struct bkey_format *f = bkey_packed(k) ? 
_f : &bch2_bkey_format_current; -+ u8 *l = k->key_start; -+ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; -+ -+ while (l < h) { -+ swap(*l, *h); -+ l++; -+ --h; -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_bkey_pack_test(void) -+{ -+ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); -+ struct bkey_packed p; -+ -+ struct bkey_format test_format = { -+ .key_u64s = 3, -+ .nr_fields = BKEY_NR_FIELDS, -+ .bits_per_field = { -+ 13, -+ 64, -+ 32, -+ }, -+ }; -+ -+ struct unpack_state in_s = -+ unpack_state_init(&bch2_bkey_format_current, (void *) &t); -+ struct pack_state out_s = pack_state_init(&test_format, &p); -+ unsigned i; -+ -+ for (i = 0; i < out_s.format->nr_fields; i++) { -+ u64 a, v = get_inc_field(&in_s, i); -+ -+ switch (i) { -+#define x(id, field) case id: a = t.field; break; -+ bkey_fields() -+#undef x -+ default: -+ BUG(); -+ } -+ -+ if (a != v) -+ panic("got %llu actual %llu i %u\n", v, a, i); -+ -+ if (!set_inc_field(&out_s, i, v)) -+ panic("failed at %u\n", i); -+ } -+ -+ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); -+} -+#endif -diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h -new file mode 100644 -index 000000000..51969a462 ---- /dev/null -+++ b/fs/bcachefs/bkey.h -@@ -0,0 +1,782 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_H -+#define _BCACHEFS_BKEY_H -+ -+#include -+#include "bcachefs_format.h" -+ -+#include "btree_types.h" -+#include "util.h" -+#include "vstructs.h" -+ -+enum bkey_invalid_flags { -+ BKEY_INVALID_WRITE = (1U << 0), -+ BKEY_INVALID_COMMIT = (1U << 1), -+ BKEY_INVALID_JOURNAL = (1U << 2), -+}; -+ -+#if 0 -+ -+/* -+ * compiled unpack functions are disabled, pending a new interface for -+ * dynamically allocating executable memory: -+ */ -+ -+#ifdef CONFIG_X86_64 -+#define HAVE_BCACHEFS_COMPILED_UNPACK 1 -+#endif -+#endif -+ -+void bch2_bkey_packed_to_binary_text(struct printbuf *, -+ const struct bkey_format *, -+ const struct bkey_packed *); -+ -+/* bkey with split value, const */ -+struct bkey_s_c { -+ const struct bkey *k; -+ const struct bch_val *v; -+}; -+ -+/* bkey with split value */ -+struct bkey_s { -+ union { -+ struct { -+ struct bkey *k; -+ struct bch_val *v; -+ }; -+ struct bkey_s_c s_c; -+ }; -+}; -+ -+#define bkey_p_next(_k) vstruct_next(_k) -+ -+static inline struct bkey_i *bkey_next(struct bkey_i *k) -+{ -+ return (struct bkey_i *) (k->_data + k->k.u64s); -+} -+ -+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) -+ -+static inline size_t bkey_val_bytes(const struct bkey *k) -+{ -+ return bkey_val_u64s(k) * sizeof(u64); -+} -+ -+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -+{ -+ unsigned u64s = BKEY_U64s + val_u64s; -+ -+ BUG_ON(u64s > U8_MAX); -+ k->u64s = u64s; -+} -+ -+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -+{ -+ set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); -+} -+ -+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) -+ -+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) -+ -+#define bkey_whiteout(_k) \ -+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) -+ -+enum bkey_lr_packed { -+ BKEY_PACKED_BOTH, -+ BKEY_PACKED_RIGHT, -+ BKEY_PACKED_LEFT, -+ BKEY_PACKED_NONE, -+}; -+ -+#define bkey_lr_packed(_l, _r) \ -+ ((_l)->format + ((_r)->format << 1)) -+ -+#define bkey_copy(_dst, _src) \ -+do { \ -+ BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ -+ !type_is(_dst, struct bkey_packed *)); \ -+ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ -+ !type_is(_src, struct 
bkey_packed *)); \ -+ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ -+ (u64 *) (_dst) < (u64 *) (_src) + \ -+ ((struct bkey *) (_src))->u64s); \ -+ \ -+ memcpy_u64s_small((_dst), (_src), \ -+ ((struct bkey *) (_src))->u64s); \ -+} while (0) -+ -+struct btree; -+ -+__pure -+unsigned bch2_bkey_greatest_differing_bit(const struct btree *, -+ const struct bkey_packed *, -+ const struct bkey_packed *); -+__pure -+unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); -+ -+__pure -+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, -+ const struct bkey_packed *, -+ const struct btree *); -+ -+__pure -+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, -+ const struct bkey_packed *, -+ const struct bpos *); -+ -+__pure -+int bch2_bkey_cmp_packed(const struct btree *, -+ const struct bkey_packed *, -+ const struct bkey_packed *); -+ -+__pure -+int __bch2_bkey_cmp_left_packed(const struct btree *, -+ const struct bkey_packed *, -+ const struct bpos *); -+ -+static inline __pure -+int bkey_cmp_left_packed(const struct btree *b, -+ const struct bkey_packed *l, const struct bpos *r) -+{ -+ return __bch2_bkey_cmp_left_packed(b, l, r); -+} -+ -+/* -+ * The compiler generates better code when we pass bpos by ref, but it's often -+ * enough terribly convenient to pass it by val... as much as I hate c++, const -+ * ref would be nice here: -+ */ -+__pure __flatten -+static inline int bkey_cmp_left_packed_byval(const struct btree *b, -+ const struct bkey_packed *l, -+ struct bpos r) -+{ -+ return bkey_cmp_left_packed(b, l, &r); -+} -+ -+static __always_inline bool bpos_eq(struct bpos l, struct bpos r) -+{ -+ return !((l.inode ^ r.inode) | -+ (l.offset ^ r.offset) | -+ (l.snapshot ^ r.snapshot)); -+} -+ -+static __always_inline bool bpos_lt(struct bpos l, struct bpos r) -+{ -+ return l.inode != r.inode ? l.inode < r.inode : -+ l.offset != r.offset ? l.offset < r.offset : -+ l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false; -+} -+ -+static __always_inline bool bpos_le(struct bpos l, struct bpos r) -+{ -+ return l.inode != r.inode ? l.inode < r.inode : -+ l.offset != r.offset ? l.offset < r.offset : -+ l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true; -+} -+ -+static __always_inline bool bpos_gt(struct bpos l, struct bpos r) -+{ -+ return bpos_lt(r, l); -+} -+ -+static __always_inline bool bpos_ge(struct bpos l, struct bpos r) -+{ -+ return bpos_le(r, l); -+} -+ -+static __always_inline int bpos_cmp(struct bpos l, struct bpos r) -+{ -+ return cmp_int(l.inode, r.inode) ?: -+ cmp_int(l.offset, r.offset) ?: -+ cmp_int(l.snapshot, r.snapshot); -+} -+ -+static inline struct bpos bpos_min(struct bpos l, struct bpos r) -+{ -+ return bpos_lt(l, r) ? l : r; -+} -+ -+static inline struct bpos bpos_max(struct bpos l, struct bpos r) -+{ -+ return bpos_gt(l, r) ? l : r; -+} -+ -+static __always_inline bool bkey_eq(struct bpos l, struct bpos r) -+{ -+ return !((l.inode ^ r.inode) | -+ (l.offset ^ r.offset)); -+} -+ -+static __always_inline bool bkey_lt(struct bpos l, struct bpos r) -+{ -+ return l.inode != r.inode -+ ? l.inode < r.inode -+ : l.offset < r.offset; -+} -+ -+static __always_inline bool bkey_le(struct bpos l, struct bpos r) -+{ -+ return l.inode != r.inode -+ ? 
l.inode < r.inode -+ : l.offset <= r.offset; -+} -+ -+static __always_inline bool bkey_gt(struct bpos l, struct bpos r) -+{ -+ return bkey_lt(r, l); -+} -+ -+static __always_inline bool bkey_ge(struct bpos l, struct bpos r) -+{ -+ return bkey_le(r, l); -+} -+ -+static __always_inline int bkey_cmp(struct bpos l, struct bpos r) -+{ -+ return cmp_int(l.inode, r.inode) ?: -+ cmp_int(l.offset, r.offset); -+} -+ -+static inline struct bpos bkey_min(struct bpos l, struct bpos r) -+{ -+ return bkey_lt(l, r) ? l : r; -+} -+ -+static inline struct bpos bkey_max(struct bpos l, struct bpos r) -+{ -+ return bkey_gt(l, r) ? l : r; -+} -+ -+void bch2_bpos_swab(struct bpos *); -+void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); -+ -+static __always_inline int bversion_cmp(struct bversion l, struct bversion r) -+{ -+ return cmp_int(l.hi, r.hi) ?: -+ cmp_int(l.lo, r.lo); -+} -+ -+#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) -+#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) -+ -+static __always_inline int bversion_zero(struct bversion v) -+{ -+ return !bversion_cmp(v, ZERO_VERSION); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+/* statement expressions confusing unlikely()? */ -+#define bkey_packed(_k) \ -+ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ -+ (_k)->format != KEY_FORMAT_CURRENT; }) -+#else -+#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) -+#endif -+ -+/* -+ * It's safe to treat an unpacked bkey as a packed one, but not the reverse -+ */ -+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) -+{ -+ return (struct bkey_packed *) k; -+} -+ -+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) -+{ -+ return (const struct bkey_packed *) k; -+} -+ -+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) -+{ -+ return bkey_packed(k) ? NULL : (struct bkey_i *) k; -+} -+ -+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) -+{ -+ return bkey_packed(k) ? NULL : (const struct bkey *) k; -+} -+ -+static inline unsigned bkey_format_key_bits(const struct bkey_format *format) -+{ -+ return format->bits_per_field[BKEY_FIELD_INODE] + -+ format->bits_per_field[BKEY_FIELD_OFFSET] + -+ format->bits_per_field[BKEY_FIELD_SNAPSHOT]; -+} -+ -+static inline struct bpos bpos_successor(struct bpos p) -+{ -+ if (!++p.snapshot && -+ !++p.offset && -+ !++p.inode) -+ BUG(); -+ -+ return p; -+} -+ -+static inline struct bpos bpos_predecessor(struct bpos p) -+{ -+ if (!p.snapshot-- && -+ !p.offset-- && -+ !p.inode--) -+ BUG(); -+ -+ return p; -+} -+ -+static inline struct bpos bpos_nosnap_successor(struct bpos p) -+{ -+ p.snapshot = 0; -+ -+ if (!++p.offset && -+ !++p.inode) -+ BUG(); -+ -+ return p; -+} -+ -+static inline struct bpos bpos_nosnap_predecessor(struct bpos p) -+{ -+ p.snapshot = 0; -+ -+ if (!p.offset-- && -+ !p.inode--) -+ BUG(); -+ -+ return p; -+} -+ -+static inline u64 bkey_start_offset(const struct bkey *k) -+{ -+ return k->p.offset - k->size; -+} -+ -+static inline struct bpos bkey_start_pos(const struct bkey *k) -+{ -+ return (struct bpos) { -+ .inode = k->p.inode, -+ .offset = bkey_start_offset(k), -+ .snapshot = k->p.snapshot, -+ }; -+} -+ -+/* Packed helpers */ -+ -+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ unsigned ret = bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; -+ -+ EBUG_ON(k->u64s < ret); -+ return ret; -+} -+ -+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return bkeyp_key_u64s(format, k) * sizeof(u64); -+} -+ -+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return k->u64s - bkeyp_key_u64s(format, k); -+} -+ -+static inline size_t bkeyp_val_bytes(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return bkeyp_val_u64s(format, k) * sizeof(u64); -+} -+ -+static inline void set_bkeyp_val_u64s(const struct bkey_format *format, -+ struct bkey_packed *k, unsigned val_u64s) -+{ -+ k->u64s = bkeyp_key_u64s(format, k) + val_u64s; -+} -+ -+#define bkeyp_val(_format, _k) \ -+ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) -+ -+extern const struct bkey_format bch2_bkey_format_current; -+ -+bool bch2_bkey_transform(const struct bkey_format *, -+ struct bkey_packed *, -+ const struct bkey_format *, -+ const struct bkey_packed *); -+ -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, -+ const struct bkey_packed *); -+ -+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -+struct bpos __bkey_unpack_pos(const struct bkey_format *, -+ const struct bkey_packed *); -+#endif -+ -+bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, -+ const struct bkey_format *); -+ -+enum bkey_pack_pos_ret { -+ BKEY_PACK_POS_EXACT, -+ BKEY_PACK_POS_SMALLER, -+ BKEY_PACK_POS_FAIL, -+}; -+ -+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, -+ const struct btree *); -+ -+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, -+ const struct btree *b) -+{ -+ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; -+} -+ -+void bch2_bkey_unpack(const struct btree *, struct bkey_i *, -+ const struct bkey_packed *); -+bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, -+ const struct bkey_format *); -+ -+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); -+ -+static inline void -+__bkey_unpack_key_format_checked(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+ if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { -+ compiled_unpack_fn unpack_fn = b->aux_data; -+ unpack_fn(dst, src); -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && -+ bch2_expensive_debug_checks) { -+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); -+ -+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); -+ } -+ } else { -+ *dst = __bch2_bkey_unpack_key(&b->format, src); -+ } -+} -+ -+static inline struct bkey -+bkey_unpack_key_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ struct bkey dst; -+ -+ __bkey_unpack_key_format_checked(b, &dst, src); -+ return dst; -+} -+ -+static inline void __bkey_unpack_key(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+ if (likely(bkey_packed(src))) -+ __bkey_unpack_key_format_checked(b, dst, src); -+ else -+ *dst = *packed_to_bkey_c(src); -+} -+ -+/** -+ * bkey_unpack_key -- unpack just the key, not the value -+ */ -+static inline struct bkey bkey_unpack_key(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? 
bkey_unpack_key_format_checked(b, src) -+ : *packed_to_bkey_c(src); -+} -+ -+static inline struct bpos -+bkey_unpack_pos_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ return bkey_unpack_key_format_checked(b, src).p; -+#else -+ return __bkey_unpack_pos(&b->format, src); -+#endif -+} -+ -+static inline struct bpos bkey_unpack_pos(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? bkey_unpack_pos_format_checked(b, src) -+ : packed_to_bkey_c(src)->p; -+} -+ -+/* Disassembled bkeys */ -+ -+static inline struct bkey_s_c bkey_disassemble(const struct btree *b, -+ const struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -+} -+ -+/* non const version: */ -+static inline struct bkey_s __bkey_disassemble(const struct btree *b, -+ struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -+} -+ -+static inline u64 bkey_field_max(const struct bkey_format *f, -+ enum bch_bkey_fields nr) -+{ -+ return f->bits_per_field[nr] < 64 -+ ? (le64_to_cpu(f->field_offset[nr]) + -+ ~(~0ULL << f->bits_per_field[nr])) -+ : U64_MAX; -+} -+ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ -+int bch2_compile_bkey_format(const struct bkey_format *, void *); -+ -+#else -+ -+static inline int bch2_compile_bkey_format(const struct bkey_format *format, -+ void *out) { return 0; } -+ -+#endif -+ -+static inline void bkey_reassemble(struct bkey_i *dst, -+ struct bkey_s_c src) -+{ -+ dst->k = *src.k; -+ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); -+} -+ -+#define bkey_s_null ((struct bkey_s) { .k = NULL }) -+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) -+ -+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) -+ -+static inline struct bkey_s bkey_to_s(struct bkey *k) -+{ -+ return (struct bkey_s) { .k = k, .v = NULL }; -+} -+ -+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -+{ -+ return (struct bkey_s_c) { .k = k, .v = NULL }; -+} -+ -+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -+{ -+ return (struct bkey_s) { .k = &k->k, .v = &k->v }; -+} -+ -+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -+{ -+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -+} -+ -+/* -+ * For a given type of value (e.g. struct bch_extent), generates the types for -+ * bkey + bch_extent - inline, split, split const - and also all the conversion -+ * functions, which also check that the value is of the correct type. -+ * -+ * We use anonymous unions for upcasting - e.g. converting from e.g. a -+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion -+ * functions. -+ */ -+#define x(name, ...) 
\ -+struct bkey_i_##name { \ -+ union { \ -+ struct bkey k; \ -+ struct bkey_i k_i; \ -+ }; \ -+ struct bch_##name v; \ -+}; \ -+ \ -+struct bkey_s_c_##name { \ -+ union { \ -+ struct { \ -+ const struct bkey *k; \ -+ const struct bch_##name *v; \ -+ }; \ -+ struct bkey_s_c s_c; \ -+ }; \ -+}; \ -+ \ -+struct bkey_s_##name { \ -+ union { \ -+ struct { \ -+ struct bkey *k; \ -+ struct bch_##name *v; \ -+ }; \ -+ struct bkey_s_c_##name c; \ -+ struct bkey_s s; \ -+ struct bkey_s_c s_c; \ -+ }; \ -+}; \ -+ \ -+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -+{ \ -+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ -+ return container_of(&k->k, struct bkey_i_##name, k); \ -+} \ -+ \ -+static inline const struct bkey_i_##name * \ -+bkey_i_to_##name##_c(const struct bkey_i *k) \ -+{ \ -+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ -+ return container_of(&k->k, struct bkey_i_##name, k); \ -+} \ -+ \ -+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -+{ \ -+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ -+ return (struct bkey_s_##name) { \ -+ .k = k.k, \ -+ .v = container_of(k.v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -+{ \ -+ EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ -+ return (struct bkey_s_c_##name) { \ -+ .k = k.k, \ -+ .v = container_of(k.v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -+{ \ -+ return (struct bkey_s_##name) { \ -+ .k = &k->k, \ -+ .v = &k->v, \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name \ -+name##_i_to_s_c(const struct bkey_i_##name *k) \ -+{ \ -+ return (struct bkey_s_c_##name) { \ -+ .k = &k->k, \ -+ .v = &k->v, \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -+{ \ -+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ -+ return (struct bkey_s_##name) { \ -+ .k = &k->k, \ -+ .v = container_of(&k->v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name \ -+bkey_i_to_s_c_##name(const struct bkey_i *k) \ -+{ \ -+ EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ -+ return (struct bkey_s_c_##name) { \ -+ .k = &k->k, \ -+ .v = container_of(&k->v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -+{ \ -+ struct bkey_i_##name *k = \ -+ container_of(&_k->k, struct bkey_i_##name, k); \ -+ \ -+ bkey_init(&k->k); \ -+ memset(&k->v, 0, sizeof(k->v)); \ -+ k->k.type = KEY_TYPE_##name; \ -+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ -+ \ -+ return k; \ -+} -+ -+BCH_BKEY_TYPES(); -+#undef x -+ -+/* byte order helpers */ -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ -+static inline unsigned high_word_offset(const struct bkey_format *f) -+{ -+ return f->key_u64s - 1; -+} -+ -+#define high_bit_offset 0 -+#define nth_word(p, n) ((p) - (n)) -+ -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ -+static inline unsigned high_word_offset(const struct bkey_format *f) -+{ -+ return 0; -+} -+ -+#define high_bit_offset KEY_PACKED_BITS_START -+#define nth_word(p, n) ((p) + (n)) -+ -+#else -+#error edit for your odd byteorder. 
-+#endif -+ -+#define high_word(f, k) ((k)->_data + high_word_offset(f)) -+#define next_word(p) nth_word(p, 1) -+#define prev_word(p) nth_word(p, -1) -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_bkey_pack_test(void); -+#else -+static inline void bch2_bkey_pack_test(void) {} -+#endif -+ -+#define bkey_fields() \ -+ x(BKEY_FIELD_INODE, p.inode) \ -+ x(BKEY_FIELD_OFFSET, p.offset) \ -+ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ -+ x(BKEY_FIELD_SIZE, size) \ -+ x(BKEY_FIELD_VERSION_HI, version.hi) \ -+ x(BKEY_FIELD_VERSION_LO, version.lo) -+ -+struct bkey_format_state { -+ u64 field_min[BKEY_NR_FIELDS]; -+ u64 field_max[BKEY_NR_FIELDS]; -+}; -+ -+void bch2_bkey_format_init(struct bkey_format_state *); -+ -+static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v) -+{ -+ s->field_min[field] = min(s->field_min[field], v); -+ s->field_max[field] = max(s->field_max[field], v); -+} -+ -+/* -+ * Changes @format so that @k can be successfully packed with @format -+ */ -+static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) -+{ -+#define x(id, field) __bkey_format_add(s, id, k->field); -+ bkey_fields() -+#undef x -+} -+ -+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); -+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -+int bch2_bkey_format_invalid(struct bch_fs *, struct bkey_format *, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_bkey_format_to_text(struct printbuf *, const struct bkey_format *); -+ -+#endif /* _BCACHEFS_BKEY_H */ -diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h -new file mode 100644 -index 000000000..a30c4ae8e ---- /dev/null -+++ b/fs/bcachefs/bkey_buf.h -@@ -0,0 +1,61 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_BUF_H -+#define _BCACHEFS_BKEY_BUF_H -+ -+#include "bcachefs.h" -+#include "bkey.h" -+ -+struct bkey_buf { -+ struct bkey_i *k; -+ u64 onstack[12]; -+}; -+ -+static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, -+ struct bch_fs *c, unsigned u64s) -+{ -+ if (s->k == (void *) s->onstack && -+ u64s > ARRAY_SIZE(s->onstack)) { -+ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); -+ memcpy(s->k, s->onstack, sizeof(s->onstack)); -+ } -+} -+ -+static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, -+ struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_buf_realloc(s, c, k.k->u64s); -+ bkey_reassemble(s->k, k); -+} -+ -+static inline void bch2_bkey_buf_copy(struct bkey_buf *s, -+ struct bch_fs *c, -+ struct bkey_i *src) -+{ -+ bch2_bkey_buf_realloc(s, c, src->k.u64s); -+ bkey_copy(s->k, src); -+} -+ -+static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, -+ struct bch_fs *c, -+ struct btree *b, -+ struct bkey_packed *src) -+{ -+ bch2_bkey_buf_realloc(s, c, BKEY_U64s + -+ bkeyp_val_u64s(&b->format, src)); -+ bch2_bkey_unpack(b, s->k, src); -+} -+ -+static inline void bch2_bkey_buf_init(struct bkey_buf *s) -+{ -+ s->k = (void *) s->onstack; -+} -+ -+static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) -+{ -+ if (s->k != (void *) s->onstack) -+ mempool_free(s->k, &c->large_bkey_pool); -+ s->k = NULL; -+} -+ -+#endif /* _BCACHEFS_BKEY_BUF_H */ -diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h -new file mode 100644 -index 000000000..5f42a6e69 ---- /dev/null -+++ b/fs/bcachefs/bkey_cmp.h -@@ -0,0 +1,129 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_CMP_H -+#define _BCACHEFS_BKEY_CMP_H -+ -+#include "bkey.h" -+ -+#ifdef CONFIG_X86_64 
-+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ long d0, d1, d2, d3; -+ int cmp; -+ -+ /* we shouldn't need asm for this, but gcc is being retarded: */ -+ -+ asm(".intel_syntax noprefix;" -+ "xor eax, eax;" -+ "xor edx, edx;" -+ "1:;" -+ "mov r8, [rdi];" -+ "mov r9, [rsi];" -+ "sub ecx, 64;" -+ "jl 2f;" -+ -+ "cmp r8, r9;" -+ "jnz 3f;" -+ -+ "lea rdi, [rdi - 8];" -+ "lea rsi, [rsi - 8];" -+ "jmp 1b;" -+ -+ "2:;" -+ "not ecx;" -+ "shr r8, 1;" -+ "shr r9, 1;" -+ "shr r8, cl;" -+ "shr r9, cl;" -+ "cmp r8, r9;" -+ -+ "3:\n" -+ "seta al;" -+ "setb dl;" -+ "sub eax, edx;" -+ ".att_syntax prefix;" -+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) -+ : "0" (l), "1" (r), "3" (nr_key_bits) -+ : "r8", "r9", "cc", "memory"); -+ -+ return cmp; -+} -+#else -+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ u64 l_v, r_v; -+ -+ if (!nr_key_bits) -+ return 0; -+ -+ /* for big endian, skip past header */ -+ nr_key_bits += high_bit_offset; -+ l_v = *l & (~0ULL >> high_bit_offset); -+ r_v = *r & (~0ULL >> high_bit_offset); -+ -+ while (1) { -+ if (nr_key_bits < 64) { -+ l_v >>= 64 - nr_key_bits; -+ r_v >>= 64 - nr_key_bits; -+ nr_key_bits = 0; -+ } else { -+ nr_key_bits -= 64; -+ } -+ -+ if (!nr_key_bits || l_v != r_v) -+ break; -+ -+ l = next_word(l); -+ r = next_word(r); -+ -+ l_v = *l; -+ r_v = *r; -+ } -+ -+ return cmp_int(l_v, r_v); -+} -+#endif -+ -+static inline __pure __flatten -+int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l, -+ const struct bkey_packed *r, -+ const struct btree *b) -+{ -+ const struct bkey_format *f = &b->format; -+ int ret; -+ -+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); -+ -+ ret = __bkey_cmp_bits(high_word(f, l), -+ high_word(f, r), -+ b->nr_key_bits); -+ -+ EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), -+ bkey_unpack_pos(b, r))); -+ return ret; -+} -+ -+static inline __pure __flatten -+int bch2_bkey_cmp_packed_inlined(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r) -+{ -+ struct bkey unpacked; -+ -+ if (likely(bkey_packed(l) && bkey_packed(r))) -+ return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); -+ -+ if (bkey_packed(l)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, l); -+ l = (void *) &unpacked; -+ } else if (bkey_packed(r)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, r); -+ r = (void *) &unpacked; -+ } -+ -+ return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); -+} -+ -+#endif /* _BCACHEFS_BKEY_CMP_H */ -diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c -new file mode 100644 -index 000000000..6547142db ---- /dev/null -+++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,456 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "backpointers.h" -+#include "bkey_methods.h" -+#include "btree_types.h" -+#include "alloc_background.h" -+#include "dirent.h" -+#include "ec.h" -+#include "error.h" -+#include "extents.h" -+#include "inode.h" -+#include "lru.h" -+#include "quota.h" -+#include "reflink.h" -+#include "snapshot.h" -+#include "subvolume.h" -+#include "xattr.h" -+ -+const char * const bch2_bkey_types[] = { -+#define x(name, nr) #name, -+ BCH_BKEY_TYPES() -+#undef x -+ NULL -+}; -+ -+static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) -+{ -+ return 0; -+} -+ -+#define bch2_bkey_ops_deleted ((struct bkey_ops) 
{ \ -+ .key_invalid = deleted_key_invalid, \ -+}) -+ -+#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ -+ .key_invalid = deleted_key_invalid, \ -+}) -+ -+static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) -+{ -+ if (bkey_val_bytes(k.k)) { -+ prt_printf(err, "incorrect value size (%zu != 0)", -+ bkey_val_bytes(k.k)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+#define bch2_bkey_ops_error ((struct bkey_ops) { \ -+ .key_invalid = empty_val_key_invalid, \ -+}) -+ -+static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) -+{ -+ return 0; -+} -+ -+#define bch2_bkey_ops_cookie ((struct bkey_ops) { \ -+ .key_invalid = key_type_cookie_invalid, \ -+ .min_val_size = 8, \ -+}) -+ -+#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ -+ .key_invalid = empty_val_key_invalid, \ -+}) -+ -+static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) -+{ -+ return 0; -+} -+ -+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); -+ unsigned datalen = bkey_inline_data_bytes(k.k); -+ -+ prt_printf(out, "datalen %u: %*phN", -+ datalen, min(datalen, 32U), d.v->data); -+} -+ -+#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ -+ .key_invalid = key_type_inline_data_invalid, \ -+ .val_to_text = key_type_inline_data_to_text, \ -+}) -+ -+static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) -+{ -+ if (bkey_val_bytes(k.k)) { -+ prt_printf(err, "incorrect value size (%zu != %zu)", -+ bkey_val_bytes(k.k), sizeof(struct bch_cookie)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -+{ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ return true; -+} -+ -+#define bch2_bkey_ops_set ((struct bkey_ops) { \ -+ .key_invalid = key_type_set_invalid, \ -+ .key_merge = key_type_set_merge, \ -+}) -+ -+const struct bkey_ops bch2_bkey_ops[] = { -+#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, -+ BCH_BKEY_TYPES() -+#undef x -+}; -+ -+const struct bkey_ops bch2_bkey_null_ops = { -+ .min_val_size = U8_MAX, -+}; -+ -+int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); -+ -+ if (bkey_val_bytes(k.k) < ops->min_val_size) { -+ prt_printf(err, "bad val size (%zu < %u)", -+ bkey_val_bytes(k.k), ops->min_val_size); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (!ops->key_invalid) -+ return 0; -+ -+ return ops->key_invalid(c, k, flags, err); -+} -+ -+static u64 bch2_key_types_allowed[] = { -+#define x(name, nr, flags, keys) [BKEY_TYPE_##name] = BIT_ULL(KEY_TYPE_deleted)|keys, -+ BCH_BTREE_IDS() -+#undef x -+ [BKEY_TYPE_btree] = -+ BIT_ULL(KEY_TYPE_deleted)| -+ BIT_ULL(KEY_TYPE_btree_ptr)| -+ BIT_ULL(KEY_TYPE_btree_ptr_v2), -+}; -+ -+int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, -+ enum btree_node_type type, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ if (k.k->u64s < BKEY_U64s) { -+ prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (flags & BKEY_INVALID_COMMIT && -+ 
!(bch2_key_types_allowed[type] & BIT_ULL(k.k->type))) { -+ prt_printf(err, "invalid key type for btree %s (%s)", -+ bch2_btree_ids[type], bch2_bkey_types[k.k->type]); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { -+ if (k.k->size == 0) { -+ prt_printf(err, "size == 0"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (k.k->size > k.k->p.offset) { -+ prt_printf(err, "size greater than offset (%u > %llu)", -+ k.k->size, k.k->p.offset); -+ return -BCH_ERR_invalid_bkey; -+ } -+ } else { -+ if (k.k->size) { -+ prt_printf(err, "size != 0"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ } -+ -+ if (type != BKEY_TYPE_btree) { -+ if (!btree_type_has_snapshots((enum btree_id) type) && -+ k.k->p.snapshot) { -+ prt_printf(err, "nonzero snapshot"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (btree_type_has_snapshots((enum btree_id) type) && -+ !k.k->p.snapshot) { -+ prt_printf(err, "snapshot == 0"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (bkey_eq(k.k->p, POS_MAX)) { -+ prt_printf(err, "key at POS_MAX"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ } -+ -+ return 0; -+} -+ -+int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, -+ enum btree_node_type type, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ return __bch2_bkey_invalid(c, k, type, flags, err) ?: -+ bch2_bkey_val_invalid(c, k, flags, err); -+} -+ -+int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, -+ struct printbuf *err) -+{ -+ if (bpos_lt(k.k->p, b->data->min_key)) { -+ prt_printf(err, "key before start of btree node"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (bpos_gt(k.k->p, b->data->max_key)) { -+ prt_printf(err, "key past end of btree node"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) -+{ -+ if (bpos_eq(pos, POS_MIN)) -+ prt_printf(out, "POS_MIN"); -+ else if (bpos_eq(pos, POS_MAX)) -+ prt_printf(out, "POS_MAX"); -+ else if (bpos_eq(pos, SPOS_MAX)) -+ prt_printf(out, "SPOS_MAX"); -+ else { -+ if (pos.inode == U64_MAX) -+ prt_printf(out, "U64_MAX"); -+ else -+ prt_printf(out, "%llu", pos.inode); -+ prt_printf(out, ":"); -+ if (pos.offset == U64_MAX) -+ prt_printf(out, "U64_MAX"); -+ else -+ prt_printf(out, "%llu", pos.offset); -+ prt_printf(out, ":"); -+ if (pos.snapshot == U32_MAX) -+ prt_printf(out, "U32_MAX"); -+ else -+ prt_printf(out, "%u", pos.snapshot); -+ } -+} -+ -+void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) -+{ -+ if (k) { -+ prt_printf(out, "u64s %u type ", k->u64s); -+ -+ if (k->type < KEY_TYPE_MAX) -+ prt_printf(out, "%s ", bch2_bkey_types[k->type]); -+ else -+ prt_printf(out, "%u ", k->type); -+ -+ bch2_bpos_to_text(out, k->p); -+ -+ prt_printf(out, " len %u ver %llu", k->size, k->version.lo); -+ } else { -+ prt_printf(out, "(null)"); -+ } -+} -+ -+void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); -+ -+ if (likely(ops->val_to_text)) -+ ops->val_to_text(out, c, k); -+} -+ -+void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_to_text(out, k.k); -+ -+ if (bkey_val_bytes(k.k)) { -+ prt_printf(out, ": "); -+ bch2_val_to_text(out, c, k); -+ } -+} -+ -+void bch2_bkey_swab_val(struct bkey_s k) -+{ -+ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); -+ -+ if (ops->swab) -+ ops->swab(k); -+} -+ -+bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) -+{ -+ 
const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); -+ -+ return ops->key_normalize -+ ? ops->key_normalize(c, k) -+ : false; -+} -+ -+bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -+{ -+ const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type); -+ -+ return ops->key_merge && -+ bch2_bkey_maybe_mergable(l.k, r.k) && -+ (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && -+ !bch2_key_merging_disabled && -+ ops->key_merge(c, l, r); -+} -+ -+static const struct old_bkey_type { -+ u8 btree_node_type; -+ u8 old; -+ u8 new; -+} bkey_renumber_table[] = { -+ {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr }, -+ {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, -+ {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, -+ {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, -+ {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, -+ {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation }, -+ {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, -+ {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, -+ {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, -+ {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, -+ {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, -+ {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, -+}; -+ -+void bch2_bkey_renumber(enum btree_node_type btree_node_type, -+ struct bkey_packed *k, -+ int write) -+{ -+ const struct old_bkey_type *i; -+ -+ for (i = bkey_renumber_table; -+ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); -+ i++) -+ if (btree_node_type == i->btree_node_type && -+ k->type == (write ? i->new : i->old)) { -+ k->type = write ? i->old : i->new; -+ break; -+ } -+} -+ -+void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct bkey_format *f, -+ struct bkey_packed *k) -+{ -+ const struct bkey_ops *ops; -+ struct bkey uk; -+ struct bkey_s u; -+ unsigned nr_compat = 5; -+ int i; -+ -+ /* -+ * Do these operations in reverse order in the write path: -+ */ -+ -+ for (i = 0; i < nr_compat; i++) -+ switch (!write ? i : nr_compat - 1 - i) { -+ case 0: -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bkey_swab_key(f, k); -+ break; -+ case 1: -+ if (version < bcachefs_metadata_version_bkey_renumber) -+ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); -+ break; -+ case 2: -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_inodes) { -+ if (!bkey_packed(k)) { -+ struct bkey_i *u = packed_to_bkey(k); -+ -+ swap(u->k.p.inode, u->k.p.offset); -+ } else if (f->bits_per_field[BKEY_FIELD_INODE] && -+ f->bits_per_field[BKEY_FIELD_OFFSET]) { -+ struct bkey_format tmp = *f, *in = f, *out = &tmp; -+ -+ swap(tmp.bits_per_field[BKEY_FIELD_INODE], -+ tmp.bits_per_field[BKEY_FIELD_OFFSET]); -+ swap(tmp.field_offset[BKEY_FIELD_INODE], -+ tmp.field_offset[BKEY_FIELD_OFFSET]); -+ -+ if (!write) -+ swap(in, out); -+ -+ uk = __bch2_bkey_unpack_key(in, k); -+ swap(uk.p.inode, uk.p.offset); -+ BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); -+ } -+ } -+ break; -+ case 3: -+ if (version < bcachefs_metadata_version_snapshot && -+ (level || btree_type_has_snapshots(btree_id))) { -+ struct bkey_i *u = packed_to_bkey(k); -+ -+ if (u) { -+ u->k.p.snapshot = write -+ ? 0 : U32_MAX; -+ } else { -+ u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]); -+ u64 max_packed = min_packed + -+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); -+ -+ uk = __bch2_bkey_unpack_key(f, k); -+ uk.p.snapshot = write -+ ? 
min_packed : min_t(u64, U32_MAX, max_packed); -+ -+ BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); -+ } -+ } -+ -+ break; -+ case 4: -+ if (!bkey_packed(k)) { -+ u = bkey_i_to_s(packed_to_bkey(k)); -+ } else { -+ uk = __bch2_bkey_unpack_key(f, k); -+ u.k = &uk; -+ u.v = bkeyp_val(f, k); -+ } -+ -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bkey_swab_val(u); -+ -+ ops = bch2_bkey_type_ops(k->type); -+ -+ if (ops->compat) -+ ops->compat(btree_id, version, big_endian, write, u); -+ break; -+ default: -+ BUG(); -+ } -+} -diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h -new file mode 100644 -index 000000000..668f595e2 ---- /dev/null -+++ b/fs/bcachefs/bkey_methods.h -@@ -0,0 +1,188 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_METHODS_H -+#define _BCACHEFS_BKEY_METHODS_H -+ -+#include "bkey.h" -+ -+struct bch_fs; -+struct btree; -+struct btree_trans; -+struct bkey; -+enum btree_node_type; -+ -+extern const char * const bch2_bkey_types[]; -+extern const struct bkey_ops bch2_bkey_null_ops; -+ -+/* -+ * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If -+ * invalid, entire key will be deleted. -+ * -+ * When invalid, error string is returned via @err. @rw indicates whether key is -+ * being read or written; more aggressive checks can be enabled when rw == WRITE. -+ */ -+struct bkey_ops { -+ int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, struct printbuf *err); -+ void (*val_to_text)(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ void (*swab)(struct bkey_s); -+ bool (*key_normalize)(struct bch_fs *, struct bkey_s); -+ bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); -+ int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_i *, unsigned); -+ int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_s_c, unsigned); -+ void (*compat)(enum btree_id id, unsigned version, -+ unsigned big_endian, int write, -+ struct bkey_s); -+ -+ /* Size of value type when first created: */ -+ unsigned min_val_size; -+}; -+ -+extern const struct bkey_ops bch2_bkey_ops[]; -+ -+static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) -+{ -+ return likely(type < KEY_TYPE_MAX) -+ ? 
&bch2_bkey_ops[type] -+ : &bch2_bkey_null_ops; -+} -+ -+int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, -+ enum bkey_invalid_flags, struct printbuf *); -+int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, enum btree_node_type, -+ enum bkey_invalid_flags, struct printbuf *); -+int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); -+ -+void bch2_bpos_to_text(struct printbuf *, struct bpos); -+void bch2_bkey_to_text(struct printbuf *, const struct bkey *); -+void bch2_val_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+void bch2_bkey_swab_val(struct bkey_s); -+ -+bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); -+ -+static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) -+{ -+ return l->type == r->type && -+ !bversion_cmp(l->version, r->version) && -+ bpos_eq(l->p, bkey_start_pos(r)); -+} -+ -+bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -+ -+static inline int bch2_mark_key(struct btree_trans *trans, -+ enum btree_id btree, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); -+ -+ return ops->atomic_trigger -+ ? ops->atomic_trigger(trans, btree, level, old, new, flags) -+ : 0; -+} -+ -+enum btree_update_flags { -+ __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, -+ __BTREE_UPDATE_NOJOURNAL, -+ __BTREE_UPDATE_PREJOURNAL, -+ __BTREE_UPDATE_KEY_CACHE_RECLAIM, -+ -+ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ -+ -+ __BTREE_TRIGGER_INSERT, -+ __BTREE_TRIGGER_OVERWRITE, -+ -+ __BTREE_TRIGGER_GC, -+ __BTREE_TRIGGER_BUCKET_INVALIDATE, -+ __BTREE_TRIGGER_NOATOMIC, -+}; -+ -+#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) -+#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) -+#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL) -+#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) -+ -+#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) -+ -+#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) -+#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -+ -+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) -+#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -+#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) -+ -+#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ -+ ((1U << KEY_TYPE_alloc)| \ -+ (1U << KEY_TYPE_alloc_v2)| \ -+ (1U << KEY_TYPE_alloc_v3)| \ -+ (1U << KEY_TYPE_alloc_v4)| \ -+ (1U << KEY_TYPE_stripe)| \ -+ (1U << KEY_TYPE_inode)| \ -+ (1U << KEY_TYPE_inode_v2)| \ -+ (1U << KEY_TYPE_snapshot)) -+ -+static inline int bch2_trans_mark_key(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_i *new, -+ unsigned flags) -+{ -+ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type); -+ -+ return ops->trans_trigger -+ ? 
ops->trans_trigger(trans, btree_id, level, old, new, flags) -+ : 0; -+} -+ -+static inline int bch2_trans_mark_old(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, unsigned flags) -+{ -+ struct bkey_i deleted; -+ -+ bkey_init(&deleted.k); -+ deleted.k.p = old.k->p; -+ -+ return bch2_trans_mark_key(trans, btree_id, level, old, &deleted, -+ BTREE_TRIGGER_OVERWRITE|flags); -+} -+ -+static inline int bch2_trans_mark_new(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_i *new, unsigned flags) -+{ -+ struct bkey_i deleted; -+ -+ bkey_init(&deleted.k); -+ deleted.k.p = new->k.p; -+ -+ return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, -+ BTREE_TRIGGER_INSERT|flags); -+} -+ -+void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); -+ -+void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, -+ int, struct bkey_format *, struct bkey_packed *); -+ -+static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct bkey_format *f, -+ struct bkey_packed *k) -+{ -+ if (version < bcachefs_metadata_version_current || -+ big_endian != CPU_BIG_ENDIAN) -+ __bch2_bkey_compat(level, btree_id, version, -+ big_endian, write, f, k); -+ -+} -+ -+#endif /* _BCACHEFS_BKEY_METHODS_H */ -diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c -new file mode 100644 -index 000000000..b9aa027c8 ---- /dev/null -+++ b/fs/bcachefs/bkey_sort.c -@@ -0,0 +1,201 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bkey_buf.h" -+#include "bkey_cmp.h" -+#include "bkey_sort.h" -+#include "bset.h" -+#include "extents.h" -+ -+typedef int (*sort_cmp_fn)(struct btree *, -+ struct bkey_packed *, -+ struct bkey_packed *); -+ -+static inline bool sort_iter_end(struct sort_iter *iter) -+{ -+ return !iter->used; -+} -+ -+static inline void sort_iter_sift(struct sort_iter *iter, unsigned from, -+ sort_cmp_fn cmp) -+{ -+ unsigned i; -+ -+ for (i = from; -+ i + 1 < iter->used && -+ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; -+ i++) -+ swap(iter->data[i], iter->data[i + 1]); -+} -+ -+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) -+{ -+ unsigned i = iter->used; -+ -+ while (i--) -+ sort_iter_sift(iter, i, cmp); -+} -+ -+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) -+{ -+ return !sort_iter_end(iter) ? 
iter->data->k : NULL; -+} -+ -+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -+{ -+ struct sort_iter_set *i = iter->data; -+ -+ BUG_ON(!iter->used); -+ -+ i->k = bkey_p_next(i->k); -+ -+ BUG_ON(i->k > i->end); -+ -+ if (i->k == i->end) -+ array_remove_item(iter->data, iter->used, 0); -+ else -+ sort_iter_sift(iter, 0, cmp); -+} -+ -+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, -+ sort_cmp_fn cmp) -+{ -+ struct bkey_packed *ret = sort_iter_peek(iter); -+ -+ if (ret) -+ sort_iter_advance(iter, cmp); -+ -+ return ret; -+} -+ -+/* -+ * If keys compare equal, compare by pointer order: -+ */ -+static inline int key_sort_fix_overlapping_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ return bch2_bkey_cmp_packed(b, l, r) ?: -+ cmp_int((unsigned long) l, (unsigned long) r); -+} -+ -+static inline bool should_drop_next_key(struct sort_iter *iter) -+{ -+ /* -+ * key_sort_cmp() ensures that when keys compare equal the older key -+ * comes first; so if l->k compares equal to r->k then l->k is older -+ * and should be dropped. -+ */ -+ return iter->used >= 2 && -+ !bch2_bkey_cmp_packed(iter->b, -+ iter->data[0].k, -+ iter->data[1].k); -+} -+ -+struct btree_nr_keys -+bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, -+ struct sort_iter *iter) -+{ -+ struct bkey_packed *out = dst->start; -+ struct bkey_packed *k; -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ -+ sort_iter_sort(iter, key_sort_fix_overlapping_cmp); -+ -+ while ((k = sort_iter_peek(iter))) { -+ if (!bkey_deleted(k) && -+ !should_drop_next_key(iter)) { -+ bkey_copy(out, k); -+ btree_keys_account_key_add(&nr, 0, out); -+ out = bkey_p_next(out); -+ } -+ -+ sort_iter_advance(iter, key_sort_fix_overlapping_cmp); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ return nr; -+} -+ -+/* Sort + repack in a new format: */ -+struct btree_nr_keys -+bch2_sort_repack(struct bset *dst, struct btree *src, -+ struct btree_node_iter *src_iter, -+ struct bkey_format *out_f, -+ bool filter_whiteouts) -+{ -+ struct bkey_format *in_f = &src->format; -+ struct bkey_packed *in, *out = vstruct_last(dst); -+ struct btree_nr_keys nr; -+ bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); -+ -+ memset(&nr, 0, sizeof(nr)); -+ -+ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { -+ if (filter_whiteouts && bkey_deleted(in)) -+ continue; -+ -+ if (!transform) -+ bkey_copy(out, in); -+ else if (bch2_bkey_transform(out_f, out, bkey_packed(in) -+ ? 
in_f : &bch2_bkey_format_current, in)) -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ else -+ bch2_bkey_unpack(src, (void *) out, in); -+ -+ out->needs_whiteout = false; -+ -+ btree_keys_account_key_add(&nr, 0, out); -+ out = bkey_p_next(out); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ return nr; -+} -+ -+static inline int sort_keys_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ return bch2_bkey_cmp_packed_inlined(b, l, r) ?: -+ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: -+ (int) l->needs_whiteout - (int) r->needs_whiteout; -+} -+ -+unsigned bch2_sort_keys(struct bkey_packed *dst, -+ struct sort_iter *iter, -+ bool filter_whiteouts) -+{ -+ const struct bkey_format *f = &iter->b->format; -+ struct bkey_packed *in, *next, *out = dst; -+ -+ sort_iter_sort(iter, sort_keys_cmp); -+ -+ while ((in = sort_iter_next(iter, sort_keys_cmp))) { -+ bool needs_whiteout = false; -+ -+ if (bkey_deleted(in) && -+ (filter_whiteouts || !in->needs_whiteout)) -+ continue; -+ -+ while ((next = sort_iter_peek(iter)) && -+ !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) { -+ BUG_ON(in->needs_whiteout && -+ next->needs_whiteout); -+ needs_whiteout |= in->needs_whiteout; -+ in = sort_iter_next(iter, sort_keys_cmp); -+ } -+ -+ if (bkey_deleted(in)) { -+ memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); -+ set_bkeyp_val_u64s(f, out, 0); -+ } else { -+ bkey_copy(out, in); -+ } -+ out->needs_whiteout |= needs_whiteout; -+ out = bkey_p_next(out); -+ } -+ -+ return (u64 *) out - (u64 *) dst; -+} -diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h -new file mode 100644 -index 000000000..79cf11d1b ---- /dev/null -+++ b/fs/bcachefs/bkey_sort.h -@@ -0,0 +1,44 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_SORT_H -+#define _BCACHEFS_BKEY_SORT_H -+ -+struct sort_iter { -+ struct btree *b; -+ unsigned used; -+ unsigned size; -+ -+ struct sort_iter_set { -+ struct bkey_packed *k, *end; -+ } data[MAX_BSETS + 1]; -+}; -+ -+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) -+{ -+ iter->b = b; -+ iter->used = 0; -+ iter->size = ARRAY_SIZE(iter->data); -+} -+ -+static inline void sort_iter_add(struct sort_iter *iter, -+ struct bkey_packed *k, -+ struct bkey_packed *end) -+{ -+ BUG_ON(iter->used >= iter->size); -+ -+ if (k != end) -+ iter->data[iter->used++] = (struct sort_iter_set) { k, end }; -+} -+ -+struct btree_nr_keys -+bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, -+ struct sort_iter *); -+ -+struct btree_nr_keys -+bch2_sort_repack(struct bset *, struct btree *, -+ struct btree_node_iter *, -+ struct bkey_format *, bool); -+ -+unsigned bch2_sort_keys(struct bkey_packed *, -+ struct sort_iter *, bool); -+ -+#endif /* _BCACHEFS_BKEY_SORT_H */ -diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c -new file mode 100644 -index 000000000..bcdf28f39 ---- /dev/null -+++ b/fs/bcachefs/bset.c -@@ -0,0 +1,1587 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for working with individual keys, and sorted sets of keys with in a -+ * btree node -+ * -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include "bset.h" -+#include "eytzinger.h" -+#include "trace.h" -+#include "util.h" -+ -+#include -+#include -+#include -+#include -+ -+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, -+ struct btree *); -+ -+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) -+{ -+ unsigned n = ARRAY_SIZE(iter->data); -+ -+ while (n && __btree_node_iter_set_end(iter, n - 1)) -+ --n; -+ -+ return n; -+} -+ -+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) -+{ -+ return bch2_bkey_to_bset_inlined(b, k); -+} -+ -+/* -+ * There are never duplicate live keys in the btree - but including keys that -+ * have been flagged as deleted (and will be cleaned up later) we _will_ see -+ * duplicates. -+ * -+ * Thus the sort order is: usual key comparison first, but for keys that compare -+ * equal the deleted key(s) come first, and the (at most one) live version comes -+ * last. -+ * -+ * The main reason for this is insertion: to handle overwrites, we first iterate -+ * over keys that compare equal to our insert key, and then insert immediately -+ * prior to the first key greater than the key we're inserting - our insert -+ * position will be after all keys that compare equal to our insert key, which -+ * by the time we actually do the insert will all be deleted. -+ */ -+ -+void bch2_dump_bset(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned set) -+{ -+ struct bkey_packed *_k, *_n; -+ struct bkey uk, n; -+ struct bkey_s_c k; -+ struct printbuf buf = PRINTBUF; -+ -+ if (!i->u64s) -+ return; -+ -+ for (_k = i->start; -+ _k < vstruct_last(i); -+ _k = _n) { -+ _n = bkey_p_next(_k); -+ -+ k = bkey_disassemble(b, _k, &uk); -+ -+ printbuf_reset(&buf); -+ if (c) -+ bch2_bkey_val_to_text(&buf, c, k); -+ else -+ bch2_bkey_to_text(&buf, k.k); -+ printk(KERN_ERR "block %u key %5zu: %s\n", set, -+ _k->_data - i->_data, buf.buf); -+ -+ if (_n == vstruct_last(i)) -+ continue; -+ -+ n = bkey_unpack_key(b, _n); -+ -+ if (bpos_lt(n.p, k.k->p)) { -+ printk(KERN_ERR "Key skipped backwards\n"); -+ continue; -+ } -+ -+ if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) -+ printk(KERN_ERR "Duplicate keys\n"); -+ } -+ -+ printbuf_exit(&buf); -+} -+ -+void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ console_lock(); -+ for_each_bset(b, t) -+ bch2_dump_bset(c, b, bset(b, t), t - b->set); -+ console_unlock(); -+} -+ -+void bch2_dump_btree_node_iter(struct btree *b, -+ struct btree_node_iter *iter) -+{ -+ struct btree_node_iter_set *set; -+ struct printbuf buf = PRINTBUF; -+ -+ printk(KERN_ERR "btree node iter with %u/%u sets:\n", -+ __btree_node_iter_used(iter), b->nsets); -+ -+ btree_node_iter_for_each(iter, set) { -+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ struct bkey uk = bkey_unpack_key(b, k); -+ -+ printbuf_reset(&buf); -+ bch2_bkey_to_text(&buf, &uk); -+ printk(KERN_ERR "set %zu key %u: %s\n", -+ t - b->set, set->k, buf.buf); -+ } -+ -+ printbuf_exit(&buf); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void __bch2_verify_btree_nr_keys(struct btree *b) -+{ -+ struct bset_tree *t; -+ struct bkey_packed *k; -+ struct btree_nr_keys nr = { 0 }; -+ -+ for_each_bset(b, t) -+ bset_tree_for_each_key(b, t, k) -+ if (!bkey_deleted(k)) -+ btree_keys_account_key_add(&nr, t - b->set, k); -+ -+ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); -+} -+ -+static void 
bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, -+ struct btree *b) -+{ -+ struct btree_node_iter iter = *_iter; -+ const struct bkey_packed *k, *n; -+ -+ k = bch2_btree_node_iter_peek_all(&iter, b); -+ __bch2_btree_node_iter_advance(&iter, b); -+ n = bch2_btree_node_iter_peek_all(&iter, b); -+ -+ bkey_unpack_key(b, k); -+ -+ if (n && -+ bkey_iter_cmp(b, k, n) > 0) { -+ struct btree_node_iter_set *set; -+ struct bkey ku = bkey_unpack_key(b, k); -+ struct bkey nu = bkey_unpack_key(b, n); -+ struct printbuf buf1 = PRINTBUF; -+ struct printbuf buf2 = PRINTBUF; -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&buf1, &ku); -+ bch2_bkey_to_text(&buf2, &nu); -+ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", -+ buf1.buf, buf2.buf); -+ printk(KERN_ERR "iter was:"); -+ -+ btree_node_iter_for_each(_iter, set) { -+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ printk(" [%zi %zi]", t - b->set, -+ k->_data - bset(b, t)->_data); -+ } -+ panic("\n"); -+ } -+} -+ -+void bch2_btree_node_iter_verify(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct btree_node_iter_set *set, *s2; -+ struct bkey_packed *k, *p; -+ struct bset_tree *t; -+ -+ if (bch2_btree_node_iter_end(iter)) -+ return; -+ -+ /* Verify no duplicates: */ -+ btree_node_iter_for_each(iter, set) { -+ BUG_ON(set->k > set->end); -+ btree_node_iter_for_each(iter, s2) -+ BUG_ON(set != s2 && set->end == s2->end); -+ } -+ -+ /* Verify that set->end is correct: */ -+ btree_node_iter_for_each(iter, set) { -+ for_each_bset(b, t) -+ if (set->end == t->end_offset) -+ goto found; -+ BUG(); -+found: -+ BUG_ON(set->k < btree_bkey_first_offset(t) || -+ set->k >= t->end_offset); -+ } -+ -+ /* Verify iterator is sorted: */ -+ btree_node_iter_for_each(iter, set) -+ BUG_ON(set != iter->data && -+ btree_node_iter_cmp(b, set[-1], set[0]) > 0); -+ -+ k = bch2_btree_node_iter_peek_all(iter, b); -+ -+ for_each_bset(b, t) { -+ if (iter->data[0].end == t->end_offset) -+ continue; -+ -+ p = bch2_bkey_prev_all(b, t, -+ bch2_btree_node_iter_bset_pos(iter, b, t)); -+ -+ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); -+ } -+} -+ -+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, -+ struct bkey_packed *insert, unsigned clobber_u64s) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, where); -+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); -+ struct bkey_packed *next = (void *) (where->_data + clobber_u64s); -+ struct printbuf buf1 = PRINTBUF; -+ struct printbuf buf2 = PRINTBUF; -+#if 0 -+ BUG_ON(prev && -+ bkey_iter_cmp(b, prev, insert) > 0); -+#else -+ if (prev && -+ bkey_iter_cmp(b, prev, insert) > 0) { -+ struct bkey k1 = bkey_unpack_key(b, prev); -+ struct bkey k2 = bkey_unpack_key(b, insert); -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&buf1, &k1); -+ bch2_bkey_to_text(&buf2, &k2); -+ -+ panic("prev > insert:\n" -+ "prev key %s\n" -+ "insert key %s\n", -+ buf1.buf, buf2.buf); -+ } -+#endif -+#if 0 -+ BUG_ON(next != btree_bkey_last(b, t) && -+ bkey_iter_cmp(b, insert, next) > 0); -+#else -+ if (next != btree_bkey_last(b, t) && -+ bkey_iter_cmp(b, insert, next) > 0) { -+ struct bkey k1 = bkey_unpack_key(b, insert); -+ struct bkey k2 = bkey_unpack_key(b, next); -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&buf1, &k1); -+ bch2_bkey_to_text(&buf2, &k2); -+ -+ panic("insert > next:\n" -+ "insert key %s\n" -+ "next key %s\n", -+ buf1.buf, buf2.buf); -+ } -+#endif -+} -+ -+#else -+ -+static inline void 
bch2_btree_node_iter_next_check(struct btree_node_iter *iter, -+ struct btree *b) {} -+ -+#endif -+ -+/* Auxiliary search trees */ -+ -+#define BFLOAT_FAILED_UNPACKED U8_MAX -+#define BFLOAT_FAILED U8_MAX -+ -+struct bkey_float { -+ u8 exponent; -+ u8 key_offset; -+ u16 mantissa; -+}; -+#define BKEY_MANTISSA_BITS 16 -+ -+static unsigned bkey_float_byte_offset(unsigned idx) -+{ -+ return idx * sizeof(struct bkey_float); -+} -+ -+struct ro_aux_tree { -+ struct bkey_float f[0]; -+}; -+ -+struct rw_aux_tree { -+ u16 offset; -+ struct bpos k; -+}; -+ -+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) -+{ -+ BUG_ON(t->aux_data_offset == U16_MAX); -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ return t->aux_data_offset; -+ case BSET_RO_AUX_TREE: -+ return t->aux_data_offset + -+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + -+ t->size * sizeof(u8), 8); -+ case BSET_RW_AUX_TREE: -+ return t->aux_data_offset + -+ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); -+ default: -+ BUG(); -+ } -+} -+ -+static unsigned bset_aux_tree_buf_start(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return t == b->set -+ ? DIV_ROUND_UP(b->unpack_fn_len, 8) -+ : bset_aux_tree_buf_end(t - 1); -+} -+ -+static void *__aux_tree_base(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return b->aux_data + t->aux_data_offset * 8; -+} -+ -+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ return __aux_tree_base(b, t); -+} -+ -+static u8 *ro_aux_tree_prev(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); -+} -+ -+static struct bkey_float *bkey_float(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned idx) -+{ -+ return ro_aux_tree_base(b, t)->f + idx; -+} -+ -+static void bset_aux_tree_verify(const struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ const struct bset_tree *t; -+ -+ for_each_bset(b, t) { -+ if (t->aux_data_offset == U16_MAX) -+ continue; -+ -+ BUG_ON(t != b->set && -+ t[-1].aux_data_offset == U16_MAX); -+ -+ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); -+ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); -+ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); -+ } -+#endif -+} -+ -+void bch2_btree_keys_init(struct btree *b) -+{ -+ unsigned i; -+ -+ b->nsets = 0; -+ memset(&b->nr, 0, sizeof(b->nr)); -+ -+ for (i = 0; i < MAX_BSETS; i++) -+ b->set[i].data_offset = U16_MAX; -+ -+ bch2_bset_set_no_aux_tree(b, b->set); -+} -+ -+/* Binary tree stuff for auxiliary search trees */ -+ -+/* -+ * Cacheline/offset <-> bkey pointer arithmetic: -+ * -+ * t->tree is a binary search tree in an array; each node corresponds to a key -+ * in one cacheline in t->set (BSET_CACHELINE bytes). -+ * -+ * This means we don't have to store the full index of the key that a node in -+ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and -+ * then bkey_float->m gives us the offset within that cacheline, in units of 8 -+ * bytes. -+ * -+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to -+ * make this work. -+ * -+ * To construct the bfloat for an arbitrary key we need to know what the key -+ * immediately preceding it is: we have to check if the two keys differ in the -+ * bits we're going to store in bkey_float->mantissa. 
t->prev[j] stores the size -+ * of the previous key so we can walk backwards to it from t->tree[j]'s key. -+ */ -+ -+static inline void *bset_cacheline(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline) -+{ -+ return (void *) round_down((unsigned long) btree_bkey_first(b, t), -+ L1_CACHE_BYTES) + -+ cacheline * BSET_CACHELINE; -+} -+ -+static struct bkey_packed *cacheline_to_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ unsigned offset) -+{ -+ return bset_cacheline(b, t, cacheline) + offset * 8; -+} -+ -+static unsigned bkey_to_cacheline(const struct btree *b, -+ const struct bset_tree *t, -+ const struct bkey_packed *k) -+{ -+ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; -+} -+ -+static ssize_t __bkey_to_cacheline_offset(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ const struct bkey_packed *k) -+{ -+ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); -+} -+ -+static unsigned bkey_to_cacheline_offset(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ const struct bkey_packed *k) -+{ -+ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); -+ -+ EBUG_ON(m > U8_MAX); -+ return m; -+} -+ -+static inline struct bkey_packed *tree_to_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned j) -+{ -+ return cacheline_to_bkey(b, t, -+ __eytzinger1_to_inorder(j, t->size - 1, t->extra), -+ bkey_float(b, t, j)->key_offset); -+} -+ -+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned j) -+{ -+ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; -+ -+ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); -+} -+ -+static struct rw_aux_tree *rw_aux_tree(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); -+ -+ return __aux_tree_base(b, t); -+} -+ -+/* -+ * For the write set - the one we're currently inserting keys into - we don't -+ * maintain a full search tree, we just keep a simple lookup table in t->prev. 
-+ */ -+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, -+ struct bset_tree *t, -+ unsigned j) -+{ -+ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); -+} -+ -+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, -+ unsigned j, struct bkey_packed *k) -+{ -+ EBUG_ON(k >= btree_bkey_last(b, t)); -+ -+ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { -+ .offset = __btree_node_key_to_offset(b, k), -+ .k = bkey_unpack_pos(b, k), -+ }; -+} -+ -+static void bch2_bset_verify_rw_aux_tree(struct btree *b, -+ struct bset_tree *t) -+{ -+ struct bkey_packed *k = btree_bkey_first(b, t); -+ unsigned j = 0; -+ -+ if (!bch2_expensive_debug_checks) -+ return; -+ -+ BUG_ON(bset_has_ro_aux_tree(t)); -+ -+ if (!bset_has_rw_aux_tree(t)) -+ return; -+ -+ BUG_ON(t->size < 1); -+ BUG_ON(rw_aux_to_bkey(b, t, j) != k); -+ -+ goto start; -+ while (1) { -+ if (rw_aux_to_bkey(b, t, j) == k) { -+ BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k, -+ bkey_unpack_pos(b, k))); -+start: -+ if (++j == t->size) -+ break; -+ -+ BUG_ON(rw_aux_tree(b, t)[j].offset <= -+ rw_aux_tree(b, t)[j - 1].offset); -+ } -+ -+ k = bkey_p_next(k); -+ BUG_ON(k >= btree_bkey_last(b, t)); -+ } -+} -+ -+/* returns idx of first entry >= offset: */ -+static unsigned rw_aux_tree_bsearch(struct btree *b, -+ struct bset_tree *t, -+ unsigned offset) -+{ -+ unsigned bset_offs = offset - btree_bkey_first_offset(t); -+ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); -+ unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0; -+ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); -+ EBUG_ON(!t->size); -+ EBUG_ON(idx > t->size); -+ -+ while (idx < t->size && -+ rw_aux_tree(b, t)[idx].offset < offset) -+ idx++; -+ -+ while (idx && -+ rw_aux_tree(b, t)[idx - 1].offset >= offset) -+ idx--; -+ -+ EBUG_ON(idx < t->size && -+ rw_aux_tree(b, t)[idx].offset < offset); -+ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); -+ EBUG_ON(idx + 1 < t->size && -+ rw_aux_tree(b, t)[idx].offset == -+ rw_aux_tree(b, t)[idx + 1].offset); -+ -+ return idx; -+} -+ -+static inline unsigned bkey_mantissa(const struct bkey_packed *k, -+ const struct bkey_float *f, -+ unsigned idx) -+{ -+ u64 v; -+ -+ EBUG_ON(!bkey_packed(k)); -+ -+ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); -+ -+ /* -+ * In little endian, we're shifting off low bits (and then the bits we -+ * want are at the low end), in big endian we're shifting off high bits -+ * (and then the bits we want are at the high end, so we shift them -+ * back down): -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ v >>= f->exponent & 7; -+#else -+ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; -+#endif -+ return (u16) v; -+} -+ -+static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, -+ unsigned j, -+ struct bkey_packed *min_key, -+ struct bkey_packed *max_key) -+{ -+ struct bkey_float *f = bkey_float(b, t, j); -+ struct bkey_packed *m = tree_to_bkey(b, t, j); -+ struct bkey_packed *l = is_power_of_2(j) -+ ? min_key -+ : tree_to_prev_bkey(b, t, j >> ffs(j)); -+ struct bkey_packed *r = is_power_of_2(j + 1) -+ ? max_key -+ : tree_to_bkey(b, t, j >> (ffz(j) + 1)); -+ unsigned mantissa; -+ int shift, exponent, high_bit; -+ -+ /* -+ * for failed bfloats, the lookup code falls back to comparing against -+ * the original key. 
-+ */ -+ -+ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || -+ !b->nr_key_bits) { -+ f->exponent = BFLOAT_FAILED_UNPACKED; -+ return; -+ } -+ -+ /* -+ * The greatest differing bit of l and r is the first bit we must -+ * include in the bfloat mantissa we're creating in order to do -+ * comparisons - that bit always becomes the high bit of -+ * bfloat->mantissa, and thus the exponent we're calculating here is -+ * the position of what will become the low bit in bfloat->mantissa: -+ * -+ * Note that this may be negative - we may be running off the low end -+ * of the key: we handle this later: -+ */ -+ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), -+ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); -+ exponent = high_bit - (BKEY_MANTISSA_BITS - 1); -+ -+ /* -+ * Then we calculate the actual shift value, from the start of the key -+ * (k->_data), to get the key bits starting at exponent: -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; -+ -+ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); -+#else -+ shift = high_bit_offset + -+ b->nr_key_bits - -+ exponent - -+ BKEY_MANTISSA_BITS; -+ -+ EBUG_ON(shift < KEY_PACKED_BITS_START); -+#endif -+ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); -+ -+ f->exponent = shift; -+ mantissa = bkey_mantissa(m, f, j); -+ -+ /* -+ * If we've got garbage bits, set them to all 1s - it's legal for the -+ * bfloat to compare larger than the original key, but not smaller: -+ */ -+ if (exponent < 0) -+ mantissa |= ~(~0U << -exponent); -+ -+ f->mantissa = mantissa; -+} -+ -+/* bytes remaining - only valid for last bset: */ -+static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) -+{ -+ bset_aux_tree_verify(b); -+ -+ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); -+} -+ -+static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) -+{ -+ return __bset_tree_capacity(b, t) / -+ (sizeof(struct bkey_float) + sizeof(u8)); -+} -+ -+static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) -+{ -+ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); -+} -+ -+static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bkey_packed *k; -+ -+ t->size = 1; -+ t->extra = BSET_RW_AUX_TREE_VAL; -+ rw_aux_tree(b, t)[0].offset = -+ __btree_node_key_to_offset(b, btree_bkey_first(b, t)); -+ -+ bset_tree_for_each_key(b, t, k) { -+ if (t->size == bset_rw_tree_capacity(b, t)) -+ break; -+ -+ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > -+ L1_CACHE_BYTES) -+ rw_aux_tree_set(b, t, t->size++, k); -+ } -+} -+ -+static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); -+ struct bkey_i min_key, max_key; -+ unsigned j, cacheline = 1; -+ -+ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), -+ bset_ro_tree_capacity(b, t)); -+retry: -+ if (t->size < 2) { -+ t->size = 0; -+ t->extra = BSET_NO_AUX_TREE_VAL; -+ return; -+ } -+ -+ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; -+ -+ /* First we figure out where the first key in each cacheline is */ -+ eytzinger1_for_each(j, t->size - 1) { -+ while (bkey_to_cacheline(b, t, k) < cacheline) -+ prev = k, k = bkey_p_next(k); -+ -+ if (k >= btree_bkey_last(b, t)) { -+ /* XXX: this path sucks */ -+ t->size--; -+ goto retry; -+ } -+ -+ ro_aux_tree_prev(b, 
t)[j] = prev->u64s; -+ bkey_float(b, t, j)->key_offset = -+ bkey_to_cacheline_offset(b, t, cacheline++, k); -+ -+ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); -+ EBUG_ON(tree_to_bkey(b, t, j) != k); -+ } -+ -+ while (k != btree_bkey_last(b, t)) -+ prev = k, k = bkey_p_next(k); -+ -+ if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { -+ bkey_init(&min_key.k); -+ min_key.k.p = b->data->min_key; -+ } -+ -+ if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { -+ bkey_init(&max_key.k); -+ max_key.k.p = b->data->max_key; -+ } -+ -+ /* Then we build the tree */ -+ eytzinger1_for_each(j, t->size - 1) -+ make_bfloat(b, t, j, -+ bkey_to_packed(&min_key), -+ bkey_to_packed(&max_key)); -+} -+ -+static void bset_alloc_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bset_tree *i; -+ -+ for (i = b->set; i != t; i++) -+ BUG_ON(bset_has_rw_aux_tree(i)); -+ -+ bch2_bset_set_no_aux_tree(b, t); -+ -+ /* round up to next cacheline: */ -+ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), -+ SMP_CACHE_BYTES / sizeof(u64)); -+ -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, -+ bool writeable) -+{ -+ if (writeable -+ ? bset_has_rw_aux_tree(t) -+ : bset_has_ro_aux_tree(t)) -+ return; -+ -+ bset_alloc_tree(b, t); -+ -+ if (!__bset_tree_capacity(b, t)) -+ return; -+ -+ if (writeable) -+ __build_rw_aux_tree(b, t); -+ else -+ __build_ro_aux_tree(b, t); -+ -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_init_first(struct btree *b, struct bset *i) -+{ -+ struct bset_tree *t; -+ -+ BUG_ON(b->nsets); -+ -+ memset(i, 0, sizeof(*i)); -+ get_random_bytes(&i->seq, sizeof(i->seq)); -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ t = &b->set[b->nsets++]; -+ set_btree_bset(b, t, i); -+} -+ -+void bch2_bset_init_next(struct bch_fs *c, struct btree *b, -+ struct btree_node_entry *bne) -+{ -+ struct bset *i = &bne->keys; -+ struct bset_tree *t; -+ -+ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); -+ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); -+ BUG_ON(b->nsets >= MAX_BSETS); -+ -+ memset(i, 0, sizeof(*i)); -+ i->seq = btree_bset_first(b)->seq; -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ t = &b->set[b->nsets++]; -+ set_btree_bset(b, t, i); -+} -+ -+/* -+ * find _some_ key in the same bset as @k that precedes @k - not necessarily the -+ * immediate predecessor: -+ */ -+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct bkey_packed *p; -+ unsigned offset; -+ int j; -+ -+ EBUG_ON(k < btree_bkey_first(b, t) || -+ k > btree_bkey_last(b, t)); -+ -+ if (k == btree_bkey_first(b, t)) -+ return NULL; -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ p = btree_bkey_first(b, t); -+ break; -+ case BSET_RO_AUX_TREE: -+ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); -+ -+ do { -+ p = j ? tree_to_bkey(b, t, -+ __inorder_to_eytzinger1(j--, -+ t->size - 1, t->extra)) -+ : btree_bkey_first(b, t); -+ } while (p >= k); -+ break; -+ case BSET_RW_AUX_TREE: -+ offset = __btree_node_key_to_offset(b, k); -+ j = rw_aux_tree_bsearch(b, t, offset); -+ p = j ? 
rw_aux_to_bkey(b, t, j - 1) -+ : btree_bkey_first(b, t); -+ break; -+ } -+ -+ return p; -+} -+ -+struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k, -+ unsigned min_key_type) -+{ -+ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; -+ -+ while ((p = __bkey_prev(b, t, k)) && !ret) { -+ for (i = p; i != k; i = bkey_p_next(i)) -+ if (i->type >= min_key_type) -+ ret = i; -+ -+ k = p; -+ } -+ -+ if (bch2_expensive_debug_checks) { -+ BUG_ON(ret >= orig_k); -+ -+ for (i = ret -+ ? bkey_p_next(ret) -+ : btree_bkey_first(b, t); -+ i != orig_k; -+ i = bkey_p_next(i)) -+ BUG_ON(i->type >= min_key_type); -+ } -+ -+ return ret; -+} -+ -+/* Insert */ -+ -+static void bch2_bset_fix_lookup_table(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *_where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ int shift = new_u64s - clobber_u64s; -+ unsigned l, j, where = __btree_node_key_to_offset(b, _where); -+ -+ EBUG_ON(bset_has_ro_aux_tree(t)); -+ -+ if (!bset_has_rw_aux_tree(t)) -+ return; -+ -+ /* returns first entry >= where */ -+ l = rw_aux_tree_bsearch(b, t, where); -+ -+ if (!l) /* never delete first entry */ -+ l++; -+ else if (l < t->size && -+ where < t->end_offset && -+ rw_aux_tree(b, t)[l].offset == where) -+ rw_aux_tree_set(b, t, l++, _where); -+ -+ /* l now > where */ -+ -+ for (j = l; -+ j < t->size && -+ rw_aux_tree(b, t)[j].offset < where + clobber_u64s; -+ j++) -+ ; -+ -+ if (j < t->size && -+ rw_aux_tree(b, t)[j].offset + shift == -+ rw_aux_tree(b, t)[l - 1].offset) -+ j++; -+ -+ memmove(&rw_aux_tree(b, t)[l], -+ &rw_aux_tree(b, t)[j], -+ (void *) &rw_aux_tree(b, t)[t->size] - -+ (void *) &rw_aux_tree(b, t)[j]); -+ t->size -= j - l; -+ -+ for (j = l; j < t->size; j++) -+ rw_aux_tree(b, t)[j].offset += shift; -+ -+ EBUG_ON(l < t->size && -+ rw_aux_tree(b, t)[l].offset == -+ rw_aux_tree(b, t)[l - 1].offset); -+ -+ if (t->size < bset_rw_tree_capacity(b, t) && -+ (l < t->size -+ ? rw_aux_tree(b, t)[l].offset -+ : t->end_offset) - -+ rw_aux_tree(b, t)[l - 1].offset > -+ L1_CACHE_BYTES / sizeof(u64)) { -+ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); -+ struct bkey_packed *end = l < t->size -+ ? 
rw_aux_to_bkey(b, t, l) -+ : btree_bkey_last(b, t); -+ struct bkey_packed *k = start; -+ -+ while (1) { -+ k = bkey_p_next(k); -+ if (k == end) -+ break; -+ -+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) { -+ memmove(&rw_aux_tree(b, t)[l + 1], -+ &rw_aux_tree(b, t)[l], -+ (void *) &rw_aux_tree(b, t)[t->size] - -+ (void *) &rw_aux_tree(b, t)[l]); -+ t->size++; -+ rw_aux_tree_set(b, t, l, k); -+ break; -+ } -+ } -+ } -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_insert(struct btree *b, -+ struct btree_node_iter *iter, -+ struct bkey_packed *where, -+ struct bkey_i *insert, -+ unsigned clobber_u64s) -+{ -+ struct bkey_format *f = &b->format; -+ struct bset_tree *t = bset_tree_last(b); -+ struct bkey_packed packed, *src = bkey_to_packed(insert); -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); -+ -+ if (bch2_bkey_pack_key(&packed, &insert->k, f)) -+ src = &packed; -+ -+ if (!bkey_deleted(&insert->k)) -+ btree_keys_account_key_add(&b->nr, t - b->set, src); -+ -+ if (src->u64s != clobber_u64s) { -+ u64 *src_p = where->_data + clobber_u64s; -+ u64 *dst_p = where->_data + src->u64s; -+ -+ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < -+ (int) clobber_u64s - src->u64s); -+ -+ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); -+ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); -+ set_btree_bset_end(b, t); -+ } -+ -+ memcpy_u64s_small(where, src, -+ bkeyp_key_u64s(f, src)); -+ memcpy_u64s(bkeyp_val(f, where), &insert->v, -+ bkeyp_val_u64s(f, src)); -+ -+ if (src->u64s != clobber_u64s) -+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); -+ -+ bch2_verify_btree_nr_keys(b); -+} -+ -+void bch2_bset_delete(struct btree *b, -+ struct bkey_packed *where, -+ unsigned clobber_u64s) -+{ -+ struct bset_tree *t = bset_tree_last(b); -+ u64 *src_p = where->_data + clobber_u64s; -+ u64 *dst_p = where->_data; -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ -+ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); -+ -+ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); -+ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); -+ set_btree_bset_end(b, t); -+ -+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); -+} -+ -+/* Lookup */ -+ -+__flatten -+static struct bkey_packed *bset_search_write_set(const struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search) -+{ -+ unsigned l = 0, r = t->size; -+ -+ while (l + 1 != r) { -+ unsigned m = (l + r) >> 1; -+ -+ if (bpos_lt(rw_aux_tree(b, t)[m].k, *search)) -+ l = m; -+ else -+ r = m; -+ } -+ -+ return rw_aux_to_bkey(b, t, l); -+} -+ -+static inline void prefetch_four_cachelines(void *p) -+{ -+#ifdef CONFIG_X86_64 -+ asm("prefetcht0 (-127 + 64 * 0)(%0);" -+ "prefetcht0 (-127 + 64 * 1)(%0);" -+ "prefetcht0 (-127 + 64 * 2)(%0);" -+ "prefetcht0 (-127 + 64 * 3)(%0);" -+ : -+ : "r" (p + 127)); -+#else -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ prefetch(p + L1_CACHE_BYTES * 3); -+#endif -+} -+ -+static inline bool bkey_mantissa_bits_dropped(const struct btree *b, -+ const struct bkey_float *f, -+ unsigned idx) -+{ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; -+ -+ return f->exponent > key_bits_start; -+#else -+ unsigned key_bits_end = high_bit_offset + b->nr_key_bits; -+ -+ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; -+#endif -+} -+ 
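The make_bfloat()/bkey_mantissa() comments above describe how each read-only auxiliary-tree node keeps only a 16-bit mantissa cut out of its key, with a full key comparison as the fallback whenever the mantissa cannot decide. As a rough standalone illustration of that idea, and not the kernel code itself, the sketch below does an Eytzinger-layout lower-bound search over plain 64-bit keys with a fixed mantissa window; the window choice, the node layout and every name in it are invented for the example.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MANTISSA_SHIFT	16	/* assumption: keys are told apart entirely by bits 16..31 */

struct node {
	uint16_t mantissa;	/* truncated copy of the key, playing the role of bkey_float->mantissa */
	uint32_t idx;		/* which sorted key this node describes */
};

/* Fill the implicit tree (1-based, children of j at 2j and 2j+1) by an in-order walk. */
static void build(struct node *tree, const uint64_t *keys,
		  unsigned j, unsigned *inorder, unsigned size)
{
	if (j >= size)
		return;
	build(tree, keys, 2 * j, inorder, size);
	tree[j].idx	 = (*inorder)++;
	tree[j].mantissa = (uint16_t)(keys[tree[j].idx] >> MANTISSA_SHIFT);
	build(tree, keys, 2 * j + 1, inorder, size);
}

/* Lower bound: index of the first key >= search, or nkeys if there is none. */
static unsigned lookup(const struct node *tree, unsigned size,
		       const uint64_t *keys, unsigned nkeys, uint64_t search)
{
	uint16_t s_mant = (uint16_t)(search >> MANTISSA_SHIFT);
	unsigned j = 1, result = nkeys;

	while (j < size) {
		const struct node *n = &tree[j];
		int ge;

		if (n->mantissa != s_mant)
			ge = n->mantissa > s_mant;	/* cheap path: mantissas decide */
		else
			ge = keys[n->idx] >= search;	/* slow path: full comparison */

		if (ge)
			result = n->idx;		/* candidate answer, keep going left */
		j = 2 * j + !ge;
	}
	return result;
}

int main(void)
{
	uint64_t keys[] = { 0x10000, 0x30000, 0x50000, 0x70000,
			    0x90000, 0xb0000, 0xd0000 };
	unsigned nkeys = sizeof(keys) / sizeof(keys[0]);
	unsigned size = nkeys + 1, inorder = 0;
	struct node *tree = calloc(size, sizeof(*tree));

	build(tree, keys, 1, &inorder, size);
	printf("first key >= 0x60000 is at index %u\n",
	       lookup(tree, size, keys, nkeys, 0x60000));	/* prints 3 */
	free(tree);
	return 0;
}

The point of the layout is the same one the comments above make for the real tree: node j's children live at 2j and 2j+1, so the hot path walks a small, contiguous, prefetchable array of fixed-size nodes instead of chasing variable-length keys, and only ties force a look at the full key.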
-+__flatten -+static struct bkey_packed *bset_search_tree(const struct btree *b, -+ const struct bset_tree *t, -+ const struct bpos *search, -+ const struct bkey_packed *packed_search) -+{ -+ struct ro_aux_tree *base = ro_aux_tree_base(b, t); -+ struct bkey_float *f; -+ struct bkey_packed *k; -+ unsigned inorder, n = 1, l, r; -+ int cmp; -+ -+ do { -+ if (likely(n << 4 < t->size)) -+ prefetch(&base->f[n << 4]); -+ -+ f = &base->f[n]; -+ if (unlikely(f->exponent >= BFLOAT_FAILED)) -+ goto slowpath; -+ -+ l = f->mantissa; -+ r = bkey_mantissa(packed_search, f, n); -+ -+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) -+ goto slowpath; -+ -+ n = n * 2 + (l < r); -+ continue; -+slowpath: -+ k = tree_to_bkey(b, t, n); -+ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); -+ if (!cmp) -+ return k; -+ -+ n = n * 2 + (cmp < 0); -+ } while (n < t->size); -+ -+ inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); -+ -+ /* -+ * n would have been the node we recursed to - the low bit tells us if -+ * we recursed left or recursed right. -+ */ -+ if (likely(!(n & 1))) { -+ --inorder; -+ if (unlikely(!inorder)) -+ return btree_bkey_first(b, t); -+ -+ f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; -+ } -+ -+ return cacheline_to_bkey(b, t, inorder, f->key_offset); -+} -+ -+static __always_inline __flatten -+struct bkey_packed *__bch2_bset_search(struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ const struct bkey_packed *lossy_packed_search) -+{ -+ -+ /* -+ * First, we search for a cacheline, then lastly we do a linear search -+ * within that cacheline. -+ * -+ * To search for the cacheline, there's three different possibilities: -+ * * The set is too small to have a search tree, so we just do a linear -+ * search over the whole set. -+ * * The set is the one we're currently inserting into; keeping a full -+ * auxiliary search tree up to date would be too expensive, so we -+ * use a much simpler lookup table to do a binary search - -+ * bset_search_write_set(). 
-+ * * Or we use the auxiliary search tree we constructed earlier - -+ * bset_search_tree() -+ */ -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ return btree_bkey_first(b, t); -+ case BSET_RW_AUX_TREE: -+ return bset_search_write_set(b, t, search); -+ case BSET_RO_AUX_TREE: -+ return bset_search_tree(b, t, search, lossy_packed_search); -+ default: -+ unreachable(); -+ } -+} -+ -+static __always_inline __flatten -+struct bkey_packed *bch2_bset_search_linear(struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ struct bkey_packed *packed_search, -+ const struct bkey_packed *lossy_packed_search, -+ struct bkey_packed *m) -+{ -+ if (lossy_packed_search) -+ while (m != btree_bkey_last(b, t) && -+ bkey_iter_cmp_p_or_unp(b, m, -+ lossy_packed_search, search) < 0) -+ m = bkey_p_next(m); -+ -+ if (!packed_search) -+ while (m != btree_bkey_last(b, t) && -+ bkey_iter_pos_cmp(b, m, search) < 0) -+ m = bkey_p_next(m); -+ -+ if (bch2_expensive_debug_checks) { -+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); -+ -+ BUG_ON(prev && -+ bkey_iter_cmp_p_or_unp(b, prev, -+ packed_search, search) >= 0); -+ } -+ -+ return m; -+} -+ -+/* Btree node iterator */ -+ -+static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, -+ struct btree *b, -+ const struct bkey_packed *k, -+ const struct bkey_packed *end) -+{ -+ if (k != end) { -+ struct btree_node_iter_set *pos; -+ -+ btree_node_iter_for_each(iter, pos) -+ ; -+ -+ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); -+ *pos = (struct btree_node_iter_set) { -+ __btree_node_key_to_offset(b, k), -+ __btree_node_key_to_offset(b, end) -+ }; -+ } -+} -+ -+void bch2_btree_node_iter_push(struct btree_node_iter *iter, -+ struct btree *b, -+ const struct bkey_packed *k, -+ const struct bkey_packed *end) -+{ -+ __bch2_btree_node_iter_push(iter, b, k, end); -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+noinline __flatten __cold -+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, -+ struct btree *b, struct bpos *search) -+{ -+ struct bkey_packed *k; -+ -+ trace_bkey_pack_pos_fail(search); -+ -+ bch2_btree_node_iter_init_from_start(iter, b); -+ -+ while ((k = bch2_btree_node_iter_peek(iter, b)) && -+ bkey_iter_pos_cmp(b, k, search) < 0) -+ bch2_btree_node_iter_advance(iter, b); -+} -+ -+/** -+ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a -+ * given position -+ * -+ * Main entry point to the lookup code for individual btree nodes: -+ * -+ * NOTE: -+ * -+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate -+ * keys. This doesn't matter for most code, but it does matter for lookups. -+ * -+ * Some adjacent keys with a string of equal keys: -+ * i j k k k k l m -+ * -+ * If you search for k, the lookup code isn't guaranteed to return you any -+ * specific k. The lookup code is conceptually doing a binary search and -+ * iterating backwards is very expensive so if the pivot happens to land at the -+ * last k that's what you'll get. -+ * -+ * This works out ok, but it's something to be aware of: -+ * -+ * - For non extents, we guarantee that the live key comes last - see -+ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't -+ * see will only be deleted keys you don't care about. -+ * -+ * - For extents, deleted keys sort last (see the comment at the top of this -+ * file). 
But when you're searching for extents, you actually want the first -+ * key strictly greater than your search key - an extent that compares equal -+ * to the search key is going to have 0 sectors after the search key. -+ * -+ * But this does mean that we can't just search for -+ * bpos_successor(start_of_range) to get the first extent that overlaps with -+ * the range we want - if we're unlucky and there's an extent that ends -+ * exactly where we searched, then there could be a deleted key at the same -+ * position and we'd get that when we search instead of the preceding extent -+ * we needed. -+ * -+ * So we've got to search for start_of_range, then after the lookup iterate -+ * past any extents that compare equal to the position we searched for. -+ */ -+__flatten -+void bch2_btree_node_iter_init(struct btree_node_iter *iter, -+ struct btree *b, struct bpos *search) -+{ -+ struct bkey_packed p, *packed_search = NULL; -+ struct btree_node_iter_set *pos = iter->data; -+ struct bkey_packed *k[MAX_BSETS]; -+ unsigned i; -+ -+ EBUG_ON(bpos_lt(*search, b->data->min_key)); -+ EBUG_ON(bpos_gt(*search, b->data->max_key)); -+ bset_aux_tree_verify(b); -+ -+ memset(iter, 0, sizeof(*iter)); -+ -+ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { -+ case BKEY_PACK_POS_EXACT: -+ packed_search = &p; -+ break; -+ case BKEY_PACK_POS_SMALLER: -+ packed_search = NULL; -+ break; -+ case BKEY_PACK_POS_FAIL: -+ btree_node_iter_init_pack_failed(iter, b, search); -+ return; -+ } -+ -+ for (i = 0; i < b->nsets; i++) { -+ k[i] = __bch2_bset_search(b, b->set + i, search, &p); -+ prefetch_four_cachelines(k[i]); -+ } -+ -+ for (i = 0; i < b->nsets; i++) { -+ struct bset_tree *t = b->set + i; -+ struct bkey_packed *end = btree_bkey_last(b, t); -+ -+ k[i] = bch2_bset_search_linear(b, t, search, -+ packed_search, &p, k[i]); -+ if (k[i] != end) -+ *pos++ = (struct btree_node_iter_set) { -+ __btree_node_key_to_offset(b, k[i]), -+ __btree_node_key_to_offset(b, end) -+ }; -+ } -+ -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ memset(iter, 0, sizeof(*iter)); -+ -+ for_each_bset(b, t) -+ __bch2_btree_node_iter_push(iter, b, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bset_tree *t) -+{ -+ struct btree_node_iter_set *set; -+ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == t->end_offset) -+ return __btree_node_offset_to_key(b, set->k); -+ -+ return btree_bkey_last(b, t); -+} -+ -+static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, -+ struct btree *b, -+ unsigned first) -+{ -+ bool ret; -+ -+ if ((ret = (btree_node_iter_cmp(b, -+ iter->data[first], -+ iter->data[first + 1]) > 0))) -+ swap(iter->data[first], iter->data[first + 1]); -+ return ret; -+} -+ -+void bch2_btree_node_iter_sort(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ /* unrolled bubble sort: */ -+ -+ if (!__btree_node_iter_set_end(iter, 2)) { -+ btree_node_iter_sort_two(iter, b, 0); -+ btree_node_iter_sort_two(iter, b, 1); -+ } -+ -+ if (!__btree_node_iter_set_end(iter, 1)) -+ btree_node_iter_sort_two(iter, b, 0); -+} -+ -+void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, -+ struct btree_node_iter_set *set) -+{ -+ struct btree_node_iter_set *last = -+ iter->data + ARRAY_SIZE(iter->data) - 1; -+ -+ memmove(&set[0], 
&set[1], (void *) last - (void *) set); -+ *last = (struct btree_node_iter_set) { 0, 0 }; -+} -+ -+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; -+ -+ EBUG_ON(iter->data->k > iter->data->end); -+ -+ if (unlikely(__btree_node_iter_set_end(iter, 0))) { -+ /* avoid an expensive memmove call: */ -+ iter->data[0] = iter->data[1]; -+ iter->data[1] = iter->data[2]; -+ iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; -+ return; -+ } -+ -+ if (__btree_node_iter_set_end(iter, 1)) -+ return; -+ -+ if (!btree_node_iter_sort_two(iter, b, 0)) -+ return; -+ -+ if (__btree_node_iter_set_end(iter, 2)) -+ return; -+ -+ btree_node_iter_sort_two(iter, b, 1); -+} -+ -+void bch2_btree_node_iter_advance(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ if (bch2_expensive_debug_checks) { -+ bch2_btree_node_iter_verify(iter, b); -+ bch2_btree_node_iter_next_check(iter, b); -+ } -+ -+ __bch2_btree_node_iter_advance(iter, b); -+} -+ -+/* -+ * Expensive: -+ */ -+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct bkey_packed *k, *prev = NULL; -+ struct btree_node_iter_set *set; -+ struct bset_tree *t; -+ unsigned end = 0; -+ -+ if (bch2_expensive_debug_checks) -+ bch2_btree_node_iter_verify(iter, b); -+ -+ for_each_bset(b, t) { -+ k = bch2_bkey_prev_all(b, t, -+ bch2_btree_node_iter_bset_pos(iter, b, t)); -+ if (k && -+ (!prev || bkey_iter_cmp(b, k, prev) > 0)) { -+ prev = k; -+ end = t->end_offset; -+ } -+ } -+ -+ if (!prev) -+ return NULL; -+ -+ /* -+ * We're manually memmoving instead of just calling sort() to ensure the -+ * prev we picked ends up in slot 0 - sort won't necessarily put it -+ * there because of duplicate deleted keys: -+ */ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == end) -+ goto found; -+ -+ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); -+found: -+ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); -+ -+ memmove(&iter->data[1], -+ &iter->data[0], -+ (void *) set - (void *) &iter->data[0]); -+ -+ iter->data[0].k = __btree_node_key_to_offset(b, prev); -+ iter->data[0].end = end; -+ -+ if (bch2_expensive_debug_checks) -+ bch2_btree_node_iter_verify(iter, b); -+ return prev; -+} -+ -+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct bkey_packed *prev; -+ -+ do { -+ prev = bch2_btree_node_iter_prev_all(iter, b); -+ } while (prev && bkey_deleted(prev)); -+ -+ return prev; -+} -+ -+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bkey *u) -+{ -+ struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); -+ -+ return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; -+} -+ -+/* Mergesort */ -+ -+void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) -+{ -+ const struct bset_tree *t; -+ -+ for_each_bset(b, t) { -+ enum bset_aux_tree_type type = bset_aux_tree_type(t); -+ size_t j; -+ -+ stats->sets[type].nr++; -+ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * -+ sizeof(u64); -+ -+ if (bset_has_ro_aux_tree(t)) { -+ stats->floats += t->size - 1; -+ -+ for (j = 1; j < t->size; j++) -+ stats->failed += -+ bkey_float(b, t, j)->exponent == -+ BFLOAT_FAILED; -+ } -+ } -+} -+ -+void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, -+ struct bkey_packed *k) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ struct bkey uk; -+ unsigned j, inorder; -+ -+ if (!bset_has_ro_aux_tree(t)) -+ return; -+ -+ inorder = bkey_to_cacheline(b, t, k); -+ if (!inorder || inorder >= t->size) -+ return; -+ -+ j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); -+ if (k != tree_to_bkey(b, t, j)) -+ return; -+ -+ switch (bkey_float(b, t, j)->exponent) { -+ case BFLOAT_FAILED: -+ uk = bkey_unpack_key(b, k); -+ prt_printf(out, -+ " failed unpacked at depth %u\n" -+ "\t", -+ ilog2(j)); -+ bch2_bpos_to_text(out, uk.p); -+ prt_printf(out, "\n"); -+ break; -+ } -+} -diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h -new file mode 100644 -index 000000000..632c2b8c5 ---- /dev/null -+++ b/fs/bcachefs/bset.h -@@ -0,0 +1,541 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BSET_H -+#define _BCACHEFS_BSET_H -+ -+#include -+#include -+ -+#include "bcachefs.h" -+#include "bkey.h" -+#include "bkey_methods.h" -+#include "btree_types.h" -+#include "util.h" /* for time_stats */ -+#include "vstructs.h" -+ -+/* -+ * BKEYS: -+ * -+ * A bkey contains a key, a size field, a variable number of pointers, and some -+ * ancillary flag bits. -+ * -+ * We use two different functions for validating bkeys, bkey_invalid and -+ * bkey_deleted(). -+ * -+ * The one exception to the rule that ptr_invalid() filters out invalid keys is -+ * that it also filters out keys of size 0 - these are keys that have been -+ * completely overwritten. It'd be safe to delete these in memory while leaving -+ * them on disk, just unnecessary work - so we filter them out when resorting -+ * instead. -+ * -+ * We can't filter out stale keys when we're resorting, because garbage -+ * collection needs to find them to ensure bucket gens don't wrap around - -+ * unless we're rewriting the btree node those stale keys still exist on disk. -+ * -+ * We also implement functions here for removing some number of sectors from the -+ * front or the back of a bkey - this is mainly used for fixing overlapping -+ * extents, by removing the overlapping sectors from the older key. -+ * -+ * BSETS: -+ * -+ * A bset is an array of bkeys laid out contiguously in memory in sorted order, -+ * along with a header. A btree node is made up of a number of these, written at -+ * different times. -+ * -+ * There could be many of them on disk, but we never allow there to be more than -+ * 4 in memory - we lazily resort as needed. -+ * -+ * We implement code here for creating and maintaining auxiliary search trees -+ * (described below) for searching an individial bset, and on top of that we -+ * implement a btree iterator. -+ * -+ * BTREE ITERATOR: -+ * -+ * Most of the code in bcache doesn't care about an individual bset - it needs -+ * to search entire btree nodes and iterate over them in sorted order. 
-+ * -+ * The btree iterator code serves both functions; it iterates through the keys -+ * in a btree node in sorted order, starting from either keys after a specific -+ * point (if you pass it a search key) or the start of the btree node. -+ * -+ * AUXILIARY SEARCH TREES: -+ * -+ * Since keys are variable length, we can't use a binary search on a bset - we -+ * wouldn't be able to find the start of the next key. But binary searches are -+ * slow anyways, due to terrible cache behaviour; bcache originally used binary -+ * searches and that code topped out at under 50k lookups/second. -+ * -+ * So we need to construct some sort of lookup table. Since we only insert keys -+ * into the last (unwritten) set, most of the keys within a given btree node are -+ * usually in sets that are mostly constant. We use two different types of -+ * lookup tables to take advantage of this. -+ * -+ * Both lookup tables share in common that they don't index every key in the -+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search -+ * is used for the rest. -+ * -+ * For sets that have been written to disk and are no longer being inserted -+ * into, we construct a binary search tree in an array - traversing a binary -+ * search tree in an array gives excellent locality of reference and is very -+ * fast, since both children of any node are adjacent to each other in memory -+ * (and their grandchildren, and great grandchildren...) - this means -+ * prefetching can be used to great effect. -+ * -+ * It's quite useful performance wise to keep these nodes small - not just -+ * because they're more likely to be in L2, but also because we can prefetch -+ * more nodes on a single cacheline and thus prefetch more iterations in advance -+ * when traversing this tree. -+ * -+ * Nodes in the auxiliary search tree must contain both a key to compare against -+ * (we don't want to fetch the key from the set, that would defeat the purpose), -+ * and a pointer to the key. We use a few tricks to compress both of these. -+ * -+ * To compress the pointer, we take advantage of the fact that one node in the -+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have -+ * a function (to_inorder()) that takes the index of a node in a binary tree and -+ * returns what its index would be in an inorder traversal, so we only have to -+ * store the low bits of the offset. -+ * -+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To -+ * compress that, we take advantage of the fact that when we're traversing the -+ * search tree at every iteration we know that both our search key and the key -+ * we're looking for lie within some range - bounded by our previous -+ * comparisons. (We special case the start of a search so that this is true even -+ * at the root of the tree). -+ * -+ * So we know the key we're looking for is between a and b, and a and b don't -+ * differ higher than bit 50, we don't need to check anything higher than bit -+ * 50. -+ * -+ * We don't usually need the rest of the bits, either; we only need enough bits -+ * to partition the key range we're currently checking. Consider key n - the -+ * key our auxiliary search tree node corresponds to, and key p, the key -+ * immediately preceding n. The lowest bit we need to store in the auxiliary -+ * search tree is the highest bit that differs between n and p. -+ * -+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the -+ * comparison. 
But we'd really like our nodes in the auxiliary search tree to be -+ * of fixed size. -+ * -+ * The solution is to make them fixed size, and when we're constructing a node -+ * check if p and n differed in the bits we needed them to. If they don't we -+ * flag that node, and when doing lookups we fallback to comparing against the -+ * real key. As long as this doesn't happen to often (and it seems to reliably -+ * happen a bit less than 1% of the time), we win - even on failures, that key -+ * is then more likely to be in cache than if we were doing binary searches all -+ * the way, since we're touching so much less memory. -+ * -+ * The keys in the auxiliary search tree are stored in (software) floating -+ * point, with an exponent and a mantissa. The exponent needs to be big enough -+ * to address all the bits in the original key, but the number of bits in the -+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures. -+ * -+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys -+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. -+ * We need one node per 128 bytes in the btree node, which means the auxiliary -+ * search trees take up 3% as much memory as the btree itself. -+ * -+ * Constructing these auxiliary search trees is moderately expensive, and we -+ * don't want to be constantly rebuilding the search tree for the last set -+ * whenever we insert another key into it. For the unwritten set, we use a much -+ * simpler lookup table - it's just a flat array, so index i in the lookup table -+ * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing -+ * within each byte range works the same as with the auxiliary search trees. -+ * -+ * These are much easier to keep up to date when we insert a key - we do it -+ * somewhat lazily; when we shift a key up we usually just increment the pointer -+ * to it, only when it would overflow do we go to the trouble of finding the -+ * first key in that range of bytes again. -+ */ -+ -+enum bset_aux_tree_type { -+ BSET_NO_AUX_TREE, -+ BSET_RO_AUX_TREE, -+ BSET_RW_AUX_TREE, -+}; -+ -+#define BSET_TREE_NR_TYPES 3 -+ -+#define BSET_NO_AUX_TREE_VAL (U16_MAX) -+#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) -+ -+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) -+{ -+ switch (t->extra) { -+ case BSET_NO_AUX_TREE_VAL: -+ EBUG_ON(t->size); -+ return BSET_NO_AUX_TREE; -+ case BSET_RW_AUX_TREE_VAL: -+ EBUG_ON(!t->size); -+ return BSET_RW_AUX_TREE; -+ default: -+ EBUG_ON(!t->size); -+ return BSET_RO_AUX_TREE; -+ } -+} -+ -+/* -+ * BSET_CACHELINE was originally intended to match the hardware cacheline size - -+ * it used to be 64, but I realized the lookup code would touch slightly less -+ * memory if it was 128. -+ * -+ * It definites the number of bytes (in struct bset) per struct bkey_float in -+ * the auxiliar search tree - when we're done searching the bset_float tree we -+ * have this many bytes left that we do a linear search over. -+ * -+ * Since (after level 5) every level of the bset_tree is on a new cacheline, -+ * we're touching one fewer cacheline in the bset tree in exchange for one more -+ * cacheline in the linear search - but the linear search might stop before it -+ * gets to the second cacheline. 
-+ */ -+ -+#define BSET_CACHELINE 256 -+ -+static inline size_t btree_keys_cachelines(const struct btree *b) -+{ -+ return (1U << b->byte_order) / BSET_CACHELINE; -+} -+ -+static inline size_t btree_aux_data_bytes(const struct btree *b) -+{ -+ return btree_keys_cachelines(b) * 8; -+} -+ -+static inline size_t btree_aux_data_u64s(const struct btree *b) -+{ -+ return btree_aux_data_bytes(b) / sizeof(u64); -+} -+ -+#define for_each_bset(_b, _t) \ -+ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) -+ -+#define bset_tree_for_each_key(_b, _t, _k) \ -+ for (_k = btree_bkey_first(_b, _t); \ -+ _k != btree_bkey_last(_b, _t); \ -+ _k = bkey_p_next(_k)) -+ -+static inline bool bset_has_ro_aux_tree(const struct bset_tree *t) -+{ -+ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; -+} -+ -+static inline bool bset_has_rw_aux_tree(struct bset_tree *t) -+{ -+ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; -+} -+ -+static inline void bch2_bset_set_no_aux_tree(struct btree *b, -+ struct bset_tree *t) -+{ -+ BUG_ON(t < b->set); -+ -+ for (; t < b->set + ARRAY_SIZE(b->set); t++) { -+ t->size = 0; -+ t->extra = BSET_NO_AUX_TREE_VAL; -+ t->aux_data_offset = U16_MAX; -+ } -+} -+ -+static inline void btree_node_set_format(struct btree *b, -+ struct bkey_format f) -+{ -+ int len; -+ -+ b->format = f; -+ b->nr_key_bits = bkey_format_key_bits(&f); -+ -+ len = bch2_compile_bkey_format(&b->format, b->aux_data); -+ BUG_ON(len < 0 || len > U8_MAX); -+ -+ b->unpack_fn_len = len; -+ -+ bch2_bset_set_no_aux_tree(b, b->set); -+} -+ -+static inline struct bset *bset_next_set(struct btree *b, -+ unsigned block_bytes) -+{ -+ struct bset *i = btree_bset_last(b); -+ -+ EBUG_ON(!is_power_of_2(block_bytes)); -+ -+ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); -+} -+ -+void bch2_btree_keys_init(struct btree *); -+ -+void bch2_bset_init_first(struct btree *, struct bset *); -+void bch2_bset_init_next(struct bch_fs *, struct btree *, -+ struct btree_node_entry *); -+void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -+ -+void bch2_bset_insert(struct btree *, struct btree_node_iter *, -+ struct bkey_packed *, struct bkey_i *, unsigned); -+void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); -+ -+/* Bkey utility code */ -+ -+/* packed or unpacked */ -+static inline int bkey_cmp_p_or_unp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r_packed, -+ const struct bpos *r) -+{ -+ EBUG_ON(r_packed && !bkey_packed(r_packed)); -+ -+ if (unlikely(!bkey_packed(l))) -+ return bpos_cmp(packed_to_bkey_c(l)->p, *r); -+ -+ if (likely(r_packed)) -+ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); -+ -+ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -+} -+ -+static inline struct bset_tree * -+bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) -+{ -+ unsigned offset = __btree_node_key_to_offset(b, k); -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) -+ if (offset <= t->end_offset) { -+ EBUG_ON(offset < btree_bkey_first_offset(t)); -+ return t; -+ } -+ -+ BUG(); -+} -+ -+struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); -+ -+struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, -+ struct bkey_packed *, unsigned); -+ -+static inline struct bkey_packed * -+bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) -+{ -+ return bch2_bkey_prev_filter(b, t, k, 0); -+} -+ -+static inline struct bkey_packed * -+bch2_bkey_prev(struct btree *b, 
struct bset_tree *t, struct bkey_packed *k) -+{ -+ return bch2_bkey_prev_filter(b, t, k, 1); -+} -+ -+/* Btree key iteration */ -+ -+void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, -+ const struct bkey_packed *, -+ const struct bkey_packed *); -+void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, -+ struct bpos *); -+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, -+ struct btree *); -+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, -+ struct btree *, -+ struct bset_tree *); -+ -+void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); -+void bch2_btree_node_iter_set_drop(struct btree_node_iter *, -+ struct btree_node_iter_set *); -+void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); -+ -+#define btree_node_iter_for_each(_iter, _set) \ -+ for (_set = (_iter)->data; \ -+ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ -+ (_set)->k != (_set)->end; \ -+ _set++) -+ -+static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, -+ unsigned i) -+{ -+ return iter->data[i].k == iter->data[i].end; -+} -+ -+static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) -+{ -+ return __btree_node_iter_set_end(iter, 0); -+} -+ -+/* -+ * When keys compare equal, deleted keys compare first: -+ * -+ * XXX: only need to compare pointers for keys that are both within a -+ * btree_node_iterator - we need to break ties for prev() to work correctly -+ */ -+static inline int bkey_iter_cmp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r) -+{ -+ return bch2_bkey_cmp_packed(b, l, r) -+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) -+ ?: cmp_int(l, r); -+} -+ -+static inline int btree_node_iter_cmp(const struct btree *b, -+ struct btree_node_iter_set l, -+ struct btree_node_iter_set r) -+{ -+ return bkey_iter_cmp(b, -+ __btree_node_offset_to_key(b, l.k), -+ __btree_node_offset_to_key(b, r.k)); -+} -+ -+/* These assume r (the search key) is not a deleted key: */ -+static inline int bkey_iter_pos_cmp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ return bkey_cmp_left_packed(b, l, r) -+ ?: -((int) bkey_deleted(l)); -+} -+ -+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r_packed, -+ const struct bpos *r) -+{ -+ return bkey_cmp_p_or_unp(b, l, r_packed, r) -+ ?: -((int) bkey_deleted(l)); -+} -+ -+static inline struct bkey_packed * -+__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ return __btree_node_offset_to_key(b, iter->data->k); -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) -+{ -+ return !bch2_btree_node_iter_end(iter) -+ ? 
__btree_node_offset_to_key(b, iter->data->k) -+ : NULL; -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) -+{ -+ struct bkey_packed *k; -+ -+ while ((k = bch2_btree_node_iter_peek_all(iter, b)) && -+ bkey_deleted(k)) -+ bch2_btree_node_iter_advance(iter, b); -+ -+ return k; -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) -+{ -+ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); -+ -+ if (ret) -+ bch2_btree_node_iter_advance(iter, b); -+ -+ return ret; -+} -+ -+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, -+ struct btree *); -+struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, -+ struct btree *); -+ -+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, -+ struct btree *, -+ struct bkey *); -+ -+#define for_each_btree_node_key(b, k, iter) \ -+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ -+ (k = bch2_btree_node_iter_peek((iter), (b))); \ -+ bch2_btree_node_iter_advance(iter, b)) -+ -+#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ -+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ -+ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ -+ bch2_btree_node_iter_advance(iter, b)) -+ -+/* Accounting: */ -+ -+static inline void btree_keys_account_key(struct btree_nr_keys *n, -+ unsigned bset, -+ struct bkey_packed *k, -+ int sign) -+{ -+ n->live_u64s += k->u64s * sign; -+ n->bset_u64s[bset] += k->u64s * sign; -+ -+ if (bkey_packed(k)) -+ n->packed_keys += sign; -+ else -+ n->unpacked_keys += sign; -+} -+ -+static inline void btree_keys_account_val_delta(struct btree *b, -+ struct bkey_packed *k, -+ int delta) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ -+ b->nr.live_u64s += delta; -+ b->nr.bset_u64s[t - b->set] += delta; -+} -+ -+#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ -+ btree_keys_account_key(_nr, _bset_idx, _k, 1) -+#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ -+ btree_keys_account_key(_nr, _bset_idx, _k, -1) -+ -+#define btree_account_key_add(_b, _k) \ -+ btree_keys_account_key(&(_b)->nr, \ -+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) -+#define btree_account_key_drop(_b, _k) \ -+ btree_keys_account_key(&(_b)->nr, \ -+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) -+ -+struct bset_stats { -+ struct { -+ size_t nr, bytes; -+ } sets[BSET_TREE_NR_TYPES]; -+ -+ size_t floats; -+ size_t failed; -+}; -+ -+void bch2_btree_keys_stats(const struct btree *, struct bset_stats *); -+void bch2_bfloat_to_text(struct printbuf *, struct btree *, -+ struct bkey_packed *); -+ -+/* Debug stuff */ -+ -+void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); -+void bch2_dump_btree_node(struct bch_fs *, struct btree *); -+void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void __bch2_verify_btree_nr_keys(struct btree *); -+void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, -+ struct bkey_packed *, unsigned); -+ -+#else -+ -+static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} -+static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, -+ struct btree *b) {} -+static inline void bch2_verify_insert_pos(struct btree *b, -+ struct bkey_packed *where, -+ struct bkey_packed *insert, -+ 
unsigned clobber_u64s) {} -+#endif -+ -+static inline void bch2_verify_btree_nr_keys(struct btree *b) -+{ -+ if (bch2_debug_check_btree_accounting) -+ __bch2_verify_btree_nr_keys(b); -+} -+ -+#endif /* _BCACHEFS_BSET_H */ -diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c -new file mode 100644 -index 000000000..a8283fdc7 ---- /dev/null -+++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1274 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_buf.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "debug.h" -+#include "errcode.h" -+#include "error.h" -+#include "trace.h" -+ -+#include -+#include -+#include -+ -+#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ -+do { \ -+ if (shrinker_counter) \ -+ bc->not_freed_##counter++; \ -+} while (0) -+ -+const char * const bch2_btree_node_flags[] = { -+#define x(f) #f, -+ BTREE_FLAGS() -+#undef x -+ NULL -+}; -+ -+void bch2_recalc_btree_reserve(struct bch_fs *c) -+{ -+ unsigned i, reserve = 16; -+ -+ if (!c->btree_roots_known[0].b) -+ reserve += 8; -+ -+ for (i = 0; i < btree_id_nr_alive(c); i++) { -+ struct btree_root *r = bch2_btree_id_root(c, i); -+ -+ if (r->b) -+ reserve += min_t(unsigned, 1, r->b->c.level) * 8; -+ } -+ -+ c->btree_cache.reserve = reserve; -+} -+ -+static inline unsigned btree_cache_can_free(struct btree_cache *bc) -+{ -+ return max_t(int, 0, bc->used - bc->reserve); -+} -+ -+static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) -+{ -+ if (b->c.lock.readers) -+ list_move(&b->list, &bc->freed_pcpu); -+ else -+ list_move(&b->list, &bc->freed_nonpcpu); -+} -+ -+static void btree_node_data_free(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ -+ EBUG_ON(btree_node_write_in_flight(b)); -+ -+ clear_btree_node_just_written(b); -+ -+ kvpfree(b->data, btree_bytes(c)); -+ b->data = NULL; -+#ifdef __KERNEL__ -+ kvfree(b->aux_data); -+#else -+ munmap(b->aux_data, btree_aux_data_bytes(b)); -+#endif -+ b->aux_data = NULL; -+ -+ bc->used--; -+ -+ btree_node_to_freedlist(bc, b); -+} -+ -+static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, -+ const void *obj) -+{ -+ const struct btree *b = obj; -+ const u64 *v = arg->key; -+ -+ return b->hash_val == *v ? 
0 : 1; -+} -+ -+static const struct rhashtable_params bch_btree_cache_params = { -+ .head_offset = offsetof(struct btree, hash), -+ .key_offset = offsetof(struct btree, hash_val), -+ .key_len = sizeof(u64), -+ .obj_cmpfn = bch2_btree_cache_cmp_fn, -+}; -+ -+static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) -+{ -+ BUG_ON(b->data || b->aux_data); -+ -+ b->data = kvpmalloc(btree_bytes(c), gfp); -+ if (!b->data) -+ return -BCH_ERR_ENOMEM_btree_node_mem_alloc; -+#ifdef __KERNEL__ -+ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); -+#else -+ b->aux_data = mmap(NULL, btree_aux_data_bytes(b), -+ PROT_READ|PROT_WRITE|PROT_EXEC, -+ MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); -+ if (b->aux_data == MAP_FAILED) -+ b->aux_data = NULL; -+#endif -+ if (!b->aux_data) { -+ kvpfree(b->data, btree_bytes(c)); -+ b->data = NULL; -+ return -BCH_ERR_ENOMEM_btree_node_mem_alloc; -+ } -+ -+ return 0; -+} -+ -+static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) -+{ -+ struct btree *b; -+ -+ b = kzalloc(sizeof(struct btree), gfp); -+ if (!b) -+ return NULL; -+ -+ bkey_btree_ptr_init(&b->key); -+ INIT_LIST_HEAD(&b->list); -+ INIT_LIST_HEAD(&b->write_blocked); -+ b->byte_order = ilog2(btree_bytes(c)); -+ return b; -+} -+ -+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ b = __btree_node_mem_alloc(c, GFP_KERNEL); -+ if (!b) -+ return NULL; -+ -+ if (btree_node_data_alloc(c, b, GFP_KERNEL)) { -+ kfree(b); -+ return NULL; -+ } -+ -+ bch2_btree_lock_init(&b->c, 0); -+ -+ bc->used++; -+ list_add(&b->list, &bc->freeable); -+ return b; -+} -+ -+/* Btree in memory cache - hash table */ -+ -+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) -+{ -+ int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); -+ -+ BUG_ON(ret); -+ -+ /* Cause future lookups for this node to fail: */ -+ b->hash_val = 0; -+} -+ -+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) -+{ -+ BUG_ON(b->hash_val); -+ b->hash_val = btree_ptr_hash_val(&b->key); -+ -+ return rhashtable_lookup_insert_fast(&bc->table, &b->hash, -+ bch_btree_cache_params); -+} -+ -+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, -+ unsigned level, enum btree_id id) -+{ -+ int ret; -+ -+ b->c.level = level; -+ b->c.btree_id = id; -+ -+ mutex_lock(&bc->lock); -+ ret = __bch2_btree_node_hash_insert(bc, b); -+ if (!ret) -+ list_add_tail(&b->list, &bc->live); -+ mutex_unlock(&bc->lock); -+ -+ return ret; -+} -+ -+__flatten -+static inline struct btree *btree_cache_find(struct btree_cache *bc, -+ const struct bkey_i *k) -+{ -+ u64 v = btree_ptr_hash_val(k); -+ -+ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); -+} -+ -+/* -+ * this version is for btree nodes that have already been freed (we're not -+ * reaping a real btree node) -+ */ -+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ int ret = 0; -+ -+ lockdep_assert_held(&bc->lock); -+wait_on_io: -+ if (b->flags & ((1U << BTREE_NODE_dirty)| -+ (1U << BTREE_NODE_read_in_flight)| -+ (1U << BTREE_NODE_write_in_flight))) { -+ if (!flush) { -+ if (btree_node_dirty(b)) -+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty); -+ else if (btree_node_read_in_flight(b)) -+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); -+ else if (btree_node_write_in_flight(b)) -+ 
BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); -+ return -BCH_ERR_ENOMEM_btree_node_reclaim; -+ } -+ -+ /* XXX: waiting on IO with btree cache lock held */ -+ bch2_btree_node_wait_on_read(b); -+ bch2_btree_node_wait_on_write(b); -+ } -+ -+ if (!six_trylock_intent(&b->c.lock)) { -+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); -+ return -BCH_ERR_ENOMEM_btree_node_reclaim; -+ } -+ -+ if (!six_trylock_write(&b->c.lock)) { -+ BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); -+ goto out_unlock_intent; -+ } -+ -+ /* recheck under lock */ -+ if (b->flags & ((1U << BTREE_NODE_read_in_flight)| -+ (1U << BTREE_NODE_write_in_flight))) { -+ if (!flush) { -+ if (btree_node_read_in_flight(b)) -+ BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); -+ else if (btree_node_write_in_flight(b)) -+ BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); -+ goto out_unlock; -+ } -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ goto wait_on_io; -+ } -+ -+ if (btree_node_noevict(b)) { -+ BTREE_CACHE_NOT_FREED_INCREMENT(noevict); -+ goto out_unlock; -+ } -+ if (btree_node_write_blocked(b)) { -+ BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); -+ goto out_unlock; -+ } -+ if (btree_node_will_make_reachable(b)) { -+ BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); -+ goto out_unlock; -+ } -+ -+ if (btree_node_dirty(b)) { -+ if (!flush) { -+ BTREE_CACHE_NOT_FREED_INCREMENT(dirty); -+ goto out_unlock; -+ } -+ /* -+ * Using the underscore version because we don't want to compact -+ * bsets after the write, since this node is about to be evicted -+ * - unless btree verify mode is enabled, since it runs out of -+ * the post write cleanup: -+ */ -+ if (bch2_verify_btree_ondisk) -+ bch2_btree_node_write(c, b, SIX_LOCK_intent, -+ BTREE_WRITE_cache_reclaim); -+ else -+ __bch2_btree_node_write(c, b, -+ BTREE_WRITE_cache_reclaim); -+ -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ goto wait_on_io; -+ } -+out: -+ if (b->hash_val && !ret) -+ trace_and_count(c, btree_cache_reap, c, b); -+ return ret; -+out_unlock: -+ six_unlock_write(&b->c.lock); -+out_unlock_intent: -+ six_unlock_intent(&b->c.lock); -+ ret = -BCH_ERR_ENOMEM_btree_node_reclaim; -+ goto out; -+} -+ -+static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) -+{ -+ return __btree_node_reclaim(c, b, false, shrinker_counter); -+} -+ -+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) -+{ -+ return __btree_node_reclaim(c, b, true, false); -+} -+ -+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, -+ struct shrink_control *sc) -+{ -+ struct bch_fs *c = container_of(shrink, struct bch_fs, -+ btree_cache.shrink); -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b, *t; -+ unsigned long nr = sc->nr_to_scan; -+ unsigned long can_free = 0; -+ unsigned long freed = 0; -+ unsigned long touched = 0; -+ unsigned i, flags; -+ unsigned long ret = SHRINK_STOP; -+ bool trigger_writes = atomic_read(&bc->dirty) + nr >= -+ bc->used * 3 / 4; -+ -+ if (bch2_btree_shrinker_disabled) -+ return SHRINK_STOP; -+ -+ mutex_lock(&bc->lock); -+ flags = memalloc_nofs_save(); -+ -+ /* -+ * It's _really_ critical that we don't free too many btree nodes - we -+ * have to always leave ourselves a reserve. 
The reserve is how we -+ * guarantee that allocating memory for a new btree node can always -+ * succeed, so that inserting keys into the btree can always succeed and -+ * IO can always make forward progress: -+ */ -+ can_free = btree_cache_can_free(bc); -+ nr = min_t(unsigned long, nr, can_free); -+ -+ i = 0; -+ list_for_each_entry_safe(b, t, &bc->freeable, list) { -+ /* -+ * Leave a few nodes on the freeable list, so that a btree split -+ * won't have to hit the system allocator: -+ */ -+ if (++i <= 3) -+ continue; -+ -+ touched++; -+ -+ if (touched >= nr) -+ goto out; -+ -+ if (!btree_node_reclaim(c, b, true)) { -+ btree_node_data_free(c, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ freed++; -+ bc->freed++; -+ } -+ } -+restart: -+ list_for_each_entry_safe(b, t, &bc->live, list) { -+ touched++; -+ -+ if (btree_node_accessed(b)) { -+ clear_btree_node_accessed(b); -+ bc->not_freed_access_bit++; -+ } else if (!btree_node_reclaim(c, b, true)) { -+ freed++; -+ btree_node_data_free(c, b); -+ bc->freed++; -+ -+ bch2_btree_node_hash_remove(bc, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ -+ if (freed == nr) -+ goto out_rotate; -+ } else if (trigger_writes && -+ btree_node_dirty(b) && -+ !btree_node_will_make_reachable(b) && -+ !btree_node_write_blocked(b) && -+ six_trylock_read(&b->c.lock)) { -+ list_move(&bc->live, &b->list); -+ mutex_unlock(&bc->lock); -+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); -+ six_unlock_read(&b->c.lock); -+ if (touched >= nr) -+ goto out_nounlock; -+ mutex_lock(&bc->lock); -+ goto restart; -+ } -+ -+ if (touched >= nr) -+ break; -+ } -+out_rotate: -+ if (&t->list != &bc->live) -+ list_move_tail(&bc->live, &t->list); -+out: -+ mutex_unlock(&bc->lock); -+out_nounlock: -+ ret = freed; -+ memalloc_nofs_restore(flags); -+ trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret); -+ return ret; -+} -+ -+static unsigned long bch2_btree_cache_count(struct shrinker *shrink, -+ struct shrink_control *sc) -+{ -+ struct bch_fs *c = container_of(shrink, struct bch_fs, -+ btree_cache.shrink); -+ struct btree_cache *bc = &c->btree_cache; -+ -+ if (bch2_btree_shrinker_disabled) -+ return 0; -+ -+ return btree_cache_can_free(bc); -+} -+ -+static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) -+{ -+ struct bch_fs *c = container_of(shrink, struct bch_fs, -+ btree_cache.shrink); -+ char *cbuf; -+ size_t buflen = seq_buf_get_buf(s, &cbuf); -+ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); -+ -+ bch2_btree_cache_to_text(&out, &c->btree_cache); -+ seq_buf_commit(s, out.pos); -+} -+ -+void bch2_fs_btree_cache_exit(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ unsigned i, flags; -+ -+ unregister_shrinker(&bc->shrink); -+ -+ /* vfree() can allocate memory: */ -+ flags = memalloc_nofs_save(); -+ mutex_lock(&bc->lock); -+ -+ if (c->verify_data) -+ list_move(&c->verify_data->list, &bc->live); -+ -+ kvpfree(c->verify_ondisk, btree_bytes(c)); -+ -+ for (i = 0; i < btree_id_nr_alive(c); i++) { -+ struct btree_root *r = bch2_btree_id_root(c, i); -+ -+ if (r->b) -+ list_add(&r->b->list, &bc->live); -+ } -+ -+ list_splice(&bc->freeable, &bc->live); -+ -+ while (!list_empty(&bc->live)) { -+ b = list_first_entry(&bc->live, struct btree, list); -+ -+ BUG_ON(btree_node_read_in_flight(b) || -+ btree_node_write_in_flight(b)); -+ -+ if (btree_node_dirty(b)) -+ bch2_btree_complete_write(c, b, btree_current_write(b)); -+ 
clear_btree_node_dirty_acct(c, b); -+ -+ btree_node_data_free(c, b); -+ } -+ -+ BUG_ON(atomic_read(&c->btree_cache.dirty)); -+ -+ list_splice(&bc->freed_pcpu, &bc->freed_nonpcpu); -+ -+ while (!list_empty(&bc->freed_nonpcpu)) { -+ b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); -+ list_del(&b->list); -+ six_lock_exit(&b->c.lock); -+ kfree(b); -+ } -+ -+ mutex_unlock(&bc->lock); -+ memalloc_nofs_restore(flags); -+ -+ if (bc->table_init_done) -+ rhashtable_destroy(&bc->table); -+} -+ -+int bch2_fs_btree_cache_init(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ unsigned i; -+ int ret = 0; -+ -+ ret = rhashtable_init(&bc->table, &bch_btree_cache_params); -+ if (ret) -+ goto err; -+ -+ bc->table_init_done = true; -+ -+ bch2_recalc_btree_reserve(c); -+ -+ for (i = 0; i < bc->reserve; i++) -+ if (!__bch2_btree_node_mem_alloc(c)) -+ goto err; -+ -+ list_splice_init(&bc->live, &bc->freeable); -+ -+ mutex_init(&c->verify_lock); -+ -+ bc->shrink.count_objects = bch2_btree_cache_count; -+ bc->shrink.scan_objects = bch2_btree_cache_scan; -+ bc->shrink.to_text = bch2_btree_cache_shrinker_to_text; -+ bc->shrink.seeks = 4; -+ ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name); -+ if (ret) -+ goto err; -+ -+ return 0; -+err: -+ return -BCH_ERR_ENOMEM_fs_btree_cache_init; -+} -+ -+void bch2_fs_btree_cache_init_early(struct btree_cache *bc) -+{ -+ mutex_init(&bc->lock); -+ INIT_LIST_HEAD(&bc->live); -+ INIT_LIST_HEAD(&bc->freeable); -+ INIT_LIST_HEAD(&bc->freed_pcpu); -+ INIT_LIST_HEAD(&bc->freed_nonpcpu); -+} -+ -+/* -+ * We can only have one thread cannibalizing other cached btree nodes at a time, -+ * or we'll deadlock. We use an open coded mutex to ensure that, which a -+ * cannibalize_bucket() will take. This means every time we unlock the root of -+ * the btree, we need to release this lock if we have it held. -+ */ -+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ -+ if (bc->alloc_lock == current) { -+ trace_and_count(c, btree_cache_cannibalize_unlock, c); -+ bc->alloc_lock = NULL; -+ closure_wake_up(&bc->alloc_wait); -+ } -+} -+ -+int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct task_struct *old; -+ -+ old = cmpxchg(&bc->alloc_lock, NULL, current); -+ if (old == NULL || old == current) -+ goto success; -+ -+ if (!cl) { -+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c); -+ return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; -+ } -+ -+ closure_wait(&bc->alloc_wait, cl); -+ -+ /* Try again, after adding ourselves to waitlist */ -+ old = cmpxchg(&bc->alloc_lock, NULL, current); -+ if (old == NULL || old == current) { -+ /* We raced */ -+ closure_wake_up(&bc->alloc_wait); -+ goto success; -+ } -+ -+ trace_and_count(c, btree_cache_cannibalize_lock_fail, c); -+ return -BCH_ERR_btree_cache_cannibalize_lock_blocked; -+ -+success: -+ trace_and_count(c, btree_cache_cannibalize_lock, c); -+ return 0; -+} -+ -+static struct btree *btree_node_cannibalize(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ list_for_each_entry_reverse(b, &bc->live, list) -+ if (!btree_node_reclaim(c, b, false)) -+ return b; -+ -+ while (1) { -+ list_for_each_entry_reverse(b, &bc->live, list) -+ if (!btree_node_write_and_reclaim(c, b)) -+ return b; -+ -+ /* -+ * Rare case: all nodes were intent-locked. -+ * Just busy-wait. 
-+ */ -+ WARN_ONCE(1, "btree cache cannibalize failed\n"); -+ cond_resched(); -+ } -+} -+ -+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_cache *bc = &c->btree_cache; -+ struct list_head *freed = pcpu_read_locks -+ ? &bc->freed_pcpu -+ : &bc->freed_nonpcpu; -+ struct btree *b, *b2; -+ u64 start_time = local_clock(); -+ unsigned flags; -+ -+ flags = memalloc_nofs_save(); -+ mutex_lock(&bc->lock); -+ -+ /* -+ * We never free struct btree itself, just the memory that holds the on -+ * disk node. Check the freed list before allocating a new one: -+ */ -+ list_for_each_entry(b, freed, list) -+ if (!btree_node_reclaim(c, b, false)) { -+ list_del_init(&b->list); -+ goto got_node; -+ } -+ -+ b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); -+ if (!b) { -+ mutex_unlock(&bc->lock); -+ bch2_trans_unlock(trans); -+ b = __btree_node_mem_alloc(c, GFP_KERNEL); -+ if (!b) -+ goto err; -+ mutex_lock(&bc->lock); -+ } -+ -+ bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); -+ -+ BUG_ON(!six_trylock_intent(&b->c.lock)); -+ BUG_ON(!six_trylock_write(&b->c.lock)); -+got_node: -+ -+ /* -+ * btree_free() doesn't free memory; it sticks the node on the end of -+ * the list. Check if there's any freed nodes there: -+ */ -+ list_for_each_entry(b2, &bc->freeable, list) -+ if (!btree_node_reclaim(c, b2, false)) { -+ swap(b->data, b2->data); -+ swap(b->aux_data, b2->aux_data); -+ btree_node_to_freedlist(bc, b2); -+ six_unlock_write(&b2->c.lock); -+ six_unlock_intent(&b2->c.lock); -+ goto got_mem; -+ } -+ -+ mutex_unlock(&bc->lock); -+ -+ if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { -+ bch2_trans_unlock(trans); -+ if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) -+ goto err; -+ } -+ -+ mutex_lock(&bc->lock); -+ bc->used++; -+got_mem: -+ mutex_unlock(&bc->lock); -+ -+ BUG_ON(btree_node_hashed(b)); -+ BUG_ON(btree_node_dirty(b)); -+ BUG_ON(btree_node_write_in_flight(b)); -+out: -+ b->flags = 0; -+ b->written = 0; -+ b->nsets = 0; -+ b->sib_u64s[0] = 0; -+ b->sib_u64s[1] = 0; -+ b->whiteout_u64s = 0; -+ bch2_btree_keys_init(b); -+ set_btree_node_accessed(b); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], -+ start_time); -+ -+ memalloc_nofs_restore(flags); -+ return b; -+err: -+ mutex_lock(&bc->lock); -+ -+ /* Try to cannibalize another cached btree node: */ -+ if (bc->alloc_lock == current) { -+ b2 = btree_node_cannibalize(c); -+ clear_btree_node_just_written(b2); -+ bch2_btree_node_hash_remove(bc, b2); -+ -+ if (b) { -+ swap(b->data, b2->data); -+ swap(b->aux_data, b2->aux_data); -+ btree_node_to_freedlist(bc, b2); -+ six_unlock_write(&b2->c.lock); -+ six_unlock_intent(&b2->c.lock); -+ } else { -+ b = b2; -+ list_del_init(&b->list); -+ } -+ -+ mutex_unlock(&bc->lock); -+ -+ trace_and_count(c, btree_cache_cannibalize, c); -+ goto out; -+ } -+ -+ mutex_unlock(&bc->lock); -+ memalloc_nofs_restore(flags); -+ return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); -+} -+ -+/* Slowpath, don't want it inlined into btree_iter_traverse() */ -+static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, -+ struct btree_path *path, -+ const struct bkey_i *k, -+ enum btree_id btree_id, -+ unsigned level, -+ enum six_lock_type lock_type, -+ bool sync) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ u32 seq; -+ -+ BUG_ON(level + 1 >= BTREE_MAX_DEPTH); -+ /* -+ * Parent node must be locked, else we 
could read in a btree node that's -+ * been freed: -+ */ -+ if (path && !bch2_btree_node_relock(trans, path, level + 1)) { -+ trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); -+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); -+ } -+ -+ b = bch2_btree_node_mem_alloc(trans, level != 0); -+ -+ if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { -+ trans->memory_allocation_failure = true; -+ trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); -+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); -+ } -+ -+ if (IS_ERR(b)) -+ return b; -+ -+ /* -+ * Btree nodes read in from disk should not have the accessed bit set -+ * initially, so that linear scans don't thrash the cache: -+ */ -+ clear_btree_node_accessed(b); -+ -+ bkey_copy(&b->key, k); -+ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { -+ /* raced with another fill: */ -+ -+ /* mark as unhashed... */ -+ b->hash_val = 0; -+ -+ mutex_lock(&bc->lock); -+ list_add(&b->list, &bc->freeable); -+ mutex_unlock(&bc->lock); -+ -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ return NULL; -+ } -+ -+ set_btree_node_read_in_flight(b); -+ -+ six_unlock_write(&b->c.lock); -+ seq = six_lock_seq(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ -+ /* Unlock before doing IO: */ -+ if (trans && sync) -+ bch2_trans_unlock_noassert(trans); -+ -+ bch2_btree_node_read(c, b, sync); -+ -+ if (!sync) -+ return NULL; -+ -+ if (path) { -+ int ret = bch2_trans_relock(trans) ?: -+ bch2_btree_path_relock_intent(trans, path); -+ if (ret) { -+ BUG_ON(!trans->restarted); -+ return ERR_PTR(ret); -+ } -+ } -+ -+ if (!six_relock_type(&b->c.lock, lock_type, seq)) { -+ if (path) -+ trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); -+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); -+ } -+ -+ return b; -+} -+ -+static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) -+{ -+ struct printbuf buf = PRINTBUF; -+ -+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) -+ return; -+ -+ prt_printf(&buf, -+ "btree node header doesn't match ptr\n" -+ "btree %s level %u\n" -+ "ptr: ", -+ bch2_btree_ids[b->c.btree_id], b->c.level); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); -+ -+ prt_printf(&buf, "\nheader: btree %s level %llu\n" -+ "min ", -+ bch2_btree_ids[BTREE_NODE_ID(b->data)], -+ BTREE_NODE_LEVEL(b->data)); -+ bch2_bpos_to_text(&buf, b->data->min_key); -+ -+ prt_printf(&buf, "\nmax "); -+ bch2_bpos_to_text(&buf, b->data->max_key); -+ -+ bch2_fs_inconsistent(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+} -+ -+static inline void btree_check_header(struct bch_fs *c, struct btree *b) -+{ -+ if (b->c.btree_id != BTREE_NODE_ID(b->data) || -+ b->c.level != BTREE_NODE_LEVEL(b->data) || -+ !bpos_eq(b->data->max_key, b->key.k.p) || -+ (b->key.k.type == KEY_TYPE_btree_ptr_v2 && -+ !bpos_eq(b->data->min_key, -+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) -+ btree_bad_header(c, b); -+} -+ -+static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, -+ const struct bkey_i *k, unsigned level, -+ enum six_lock_type lock_type, -+ unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ struct bset_tree *t; -+ bool need_relock = false; -+ int ret; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+retry: -+ b = 
btree_cache_find(bc, k); -+ if (unlikely(!b)) { -+ /* -+ * We must have the parent locked to call bch2_btree_node_fill(), -+ * else we could read in a btree node from disk that's been -+ * freed: -+ */ -+ b = bch2_btree_node_fill(trans, path, k, path->btree_id, -+ level, lock_type, true); -+ need_relock = true; -+ -+ /* We raced and found the btree node in the cache */ -+ if (!b) -+ goto retry; -+ -+ if (IS_ERR(b)) -+ return b; -+ } else { -+ if (btree_node_read_locked(path, level + 1)) -+ btree_node_unlock(trans, path, level + 1); -+ -+ ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ return ERR_PTR(ret); -+ -+ BUG_ON(ret); -+ -+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || -+ b->c.level != level || -+ race_fault())) { -+ six_unlock_type(&b->c.lock, lock_type); -+ if (bch2_btree_node_relock(trans, path, level + 1)) -+ goto retry; -+ -+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); -+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); -+ } -+ -+ /* avoid atomic set bit if it's not needed: */ -+ if (!btree_node_accessed(b)) -+ set_btree_node_accessed(b); -+ } -+ -+ if (unlikely(btree_node_read_in_flight(b))) { -+ u32 seq = six_lock_seq(&b->c.lock); -+ -+ six_unlock_type(&b->c.lock, lock_type); -+ bch2_trans_unlock(trans); -+ need_relock = true; -+ -+ bch2_btree_node_wait_on_read(b); -+ -+ /* -+ * should_be_locked is not set on this path yet, so we need to -+ * relock it specifically: -+ */ -+ if (!six_relock_type(&b->c.lock, lock_type, seq)) -+ goto retry; -+ } -+ -+ if (unlikely(need_relock)) { -+ int ret = bch2_trans_relock(trans) ?: -+ bch2_btree_path_relock_intent(trans, path); -+ if (ret) { -+ six_unlock_type(&b->c.lock, lock_type); -+ return ERR_PTR(ret); -+ } -+ } -+ -+ prefetch(b->aux_data); -+ -+ for_each_bset(b, t) { -+ void *p = (u64 *) b->aux_data + t->aux_data_offset; -+ -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ } -+ -+ if (unlikely(btree_node_read_error(b))) { -+ six_unlock_type(&b->c.lock, lock_type); -+ return ERR_PTR(-EIO); -+ } -+ -+ EBUG_ON(b->c.btree_id != path->btree_id); -+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); -+ btree_check_header(c, b); -+ -+ return b; -+} -+ -+/** -+ * bch_btree_node_get - find a btree node in the cache and lock it, reading it -+ * in from disk if necessary. -+ * -+ * The btree node will have either a read or a write lock held, depending on -+ * the @write parameter. 
-+ */ -+struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, -+ const struct bkey_i *k, unsigned level, -+ enum six_lock_type lock_type, -+ unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b; -+ struct bset_tree *t; -+ int ret; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ b = btree_node_mem_ptr(k); -+ -+ /* -+ * Check b->hash_val _before_ calling btree_node_lock() - this might not -+ * be the node we want anymore, and trying to lock the wrong node could -+ * cause an unneccessary transaction restart: -+ */ -+ if (unlikely(!c->opts.btree_node_mem_ptr_optimization || -+ !b || -+ b->hash_val != btree_ptr_hash_val(k))) -+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); -+ -+ if (btree_node_read_locked(path, level + 1)) -+ btree_node_unlock(trans, path, level + 1); -+ -+ ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ return ERR_PTR(ret); -+ -+ BUG_ON(ret); -+ -+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || -+ b->c.level != level || -+ race_fault())) { -+ six_unlock_type(&b->c.lock, lock_type); -+ if (bch2_btree_node_relock(trans, path, level + 1)) -+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); -+ -+ trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); -+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); -+ } -+ -+ if (unlikely(btree_node_read_in_flight(b))) { -+ u32 seq = six_lock_seq(&b->c.lock); -+ -+ six_unlock_type(&b->c.lock, lock_type); -+ bch2_trans_unlock(trans); -+ -+ bch2_btree_node_wait_on_read(b); -+ -+ /* -+ * should_be_locked is not set on this path yet, so we need to -+ * relock it specifically: -+ */ -+ if (trans) { -+ int ret = bch2_trans_relock(trans) ?: -+ bch2_btree_path_relock_intent(trans, path); -+ if (ret) { -+ BUG_ON(!trans->restarted); -+ return ERR_PTR(ret); -+ } -+ } -+ -+ if (!six_relock_type(&b->c.lock, lock_type, seq)) -+ return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); -+ } -+ -+ prefetch(b->aux_data); -+ -+ for_each_bset(b, t) { -+ void *p = (u64 *) b->aux_data + t->aux_data_offset; -+ -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ } -+ -+ /* avoid atomic set bit if it's not needed: */ -+ if (!btree_node_accessed(b)) -+ set_btree_node_accessed(b); -+ -+ if (unlikely(btree_node_read_error(b))) { -+ six_unlock_type(&b->c.lock, lock_type); -+ return ERR_PTR(-EIO); -+ } -+ -+ EBUG_ON(b->c.btree_id != path->btree_id); -+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); -+ btree_check_header(c, b); -+ -+ return b; -+} -+ -+struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, -+ const struct bkey_i *k, -+ enum btree_id btree_id, -+ unsigned level, -+ bool nofill) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ struct bset_tree *t; -+ int ret; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ if (c->opts.btree_node_mem_ptr_optimization) { -+ b = btree_node_mem_ptr(k); -+ if (b) -+ goto lock_node; -+ } -+retry: -+ b = btree_cache_find(bc, k); -+ if (unlikely(!b)) { -+ if (nofill) -+ goto out; -+ -+ b = bch2_btree_node_fill(trans, NULL, k, btree_id, -+ level, SIX_LOCK_read, true); -+ -+ /* We raced and found the btree node in the cache */ -+ if (!b) -+ goto retry; -+ -+ if (IS_ERR(b) && -+ !bch2_btree_cache_cannibalize_lock(c, NULL)) -+ goto retry; 
-+ -+ if (IS_ERR(b)) -+ goto out; -+ } else { -+lock_node: -+ ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ return ERR_PTR(ret); -+ -+ BUG_ON(ret); -+ -+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || -+ b->c.btree_id != btree_id || -+ b->c.level != level)) { -+ six_unlock_read(&b->c.lock); -+ goto retry; -+ } -+ } -+ -+ /* XXX: waiting on IO with btree locks held: */ -+ __bch2_btree_node_wait_on_read(b); -+ -+ prefetch(b->aux_data); -+ -+ for_each_bset(b, t) { -+ void *p = (u64 *) b->aux_data + t->aux_data_offset; -+ -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ } -+ -+ /* avoid atomic set bit if it's not needed: */ -+ if (!btree_node_accessed(b)) -+ set_btree_node_accessed(b); -+ -+ if (unlikely(btree_node_read_error(b))) { -+ six_unlock_read(&b->c.lock); -+ b = ERR_PTR(-EIO); -+ goto out; -+ } -+ -+ EBUG_ON(b->c.btree_id != btree_id); -+ EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); -+ btree_check_header(c, b); -+out: -+ bch2_btree_cache_cannibalize_unlock(c); -+ return b; -+} -+ -+int bch2_btree_node_prefetch(struct btree_trans *trans, -+ struct btree_path *path, -+ const struct bkey_i *k, -+ enum btree_id btree_id, unsigned level) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ BUG_ON(trans && !btree_node_locked(path, level + 1)); -+ BUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ b = btree_cache_find(bc, k); -+ if (b) -+ return 0; -+ -+ b = bch2_btree_node_fill(trans, path, k, btree_id, -+ level, SIX_LOCK_read, false); -+ return PTR_ERR_OR_ZERO(b); -+} -+ -+void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ b = btree_cache_find(bc, k); -+ if (!b) -+ return; -+wait_on_io: -+ /* not allowed to wait on io with btree locks held: */ -+ -+ /* XXX we're called from btree_gc which will be holding other btree -+ * nodes locked -+ */ -+ __bch2_btree_node_wait_on_read(b); -+ __bch2_btree_node_wait_on_write(b); -+ -+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); -+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); -+ -+ if (btree_node_dirty(b)) { -+ __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ goto wait_on_io; -+ } -+ -+ BUG_ON(btree_node_dirty(b)); -+ -+ mutex_lock(&bc->lock); -+ btree_node_data_free(c, b); -+ bch2_btree_node_hash_remove(bc, b); -+ mutex_unlock(&bc->lock); -+ -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+} -+ -+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, -+ const struct btree *b) -+{ -+ struct bset_stats stats; -+ -+ memset(&stats, 0, sizeof(stats)); -+ -+ bch2_btree_keys_stats(b, &stats); -+ -+ prt_printf(out, "l %u ", b->c.level); -+ bch2_bpos_to_text(out, b->data->min_key); -+ prt_printf(out, " - "); -+ bch2_bpos_to_text(out, b->data->max_key); -+ prt_printf(out, ":\n" -+ " ptrs: "); -+ bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -+ prt_newline(out); -+ -+ prt_printf(out, -+ " format: "); -+ bch2_bkey_format_to_text(out, &b->format); -+ -+ prt_printf(out, -+ " unpack fn len: %u\n" -+ " bytes used %zu/%zu (%zu%% full)\n" -+ " sib u64s: %u, %u (merge threshold %u)\n" -+ " nr packed keys %u\n" -+ " nr unpacked keys %u\n" -+ " floats %zu\n" -+ " failed unpacked %zu\n", -+ 
b->unpack_fn_len, -+ b->nr.live_u64s * sizeof(u64), -+ btree_bytes(c) - sizeof(struct btree_node), -+ b->nr.live_u64s * 100 / btree_max_u64s(c), -+ b->sib_u64s[0], -+ b->sib_u64s[1], -+ c->btree_foreground_merge_threshold, -+ b->nr.packed_keys, -+ b->nr.unpacked_keys, -+ stats.floats, -+ stats.failed); -+} -+ -+void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) -+{ -+ prt_printf(out, "nr nodes:\t\t%u\n", bc->used); -+ prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty)); -+ prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); -+ -+ prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed); -+ prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty); -+ prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight); -+ prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight); -+ prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent); -+ prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write); -+ prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit); -+ prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict); -+ prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked); -+ prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable); -+ -+} -diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h -new file mode 100644 -index 000000000..00c9b9218 ---- /dev/null -+++ b/fs/bcachefs/btree_cache.h -@@ -0,0 +1,130 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_CACHE_H -+#define _BCACHEFS_BTREE_CACHE_H -+ -+#include "bcachefs.h" -+#include "btree_types.h" -+#include "bkey_methods.h" -+ -+extern const char * const bch2_btree_node_flags[]; -+ -+struct btree_iter; -+ -+void bch2_recalc_btree_reserve(struct bch_fs *); -+ -+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); -+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); -+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, -+ unsigned, enum btree_id); -+ -+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); -+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); -+ -+struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); -+struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); -+ -+struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, -+ const struct bkey_i *, unsigned, -+ enum six_lock_type, unsigned long); -+ -+struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *, -+ enum btree_id, unsigned, bool); -+ -+int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *, -+ const struct bkey_i *, enum btree_id, unsigned); -+ -+void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *); -+ -+void bch2_fs_btree_cache_exit(struct bch_fs *); -+int bch2_fs_btree_cache_init(struct bch_fs *); -+void bch2_fs_btree_cache_init_early(struct btree_cache *); -+ -+static inline u64 btree_ptr_hash_val(const struct bkey_i *k) -+{ -+ switch (k->k.type) { -+ case KEY_TYPE_btree_ptr: -+ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); -+ case KEY_TYPE_btree_ptr_v2: -+ /* -+ * The cast/deref is only necessary to avoid sparse endianness -+ * warnings: -+ */ -+ return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq); -+ default: -+ return 0; -+ } -+} -+ -+static inline struct btree *btree_node_mem_ptr(const struct bkey_i 
*k) -+{ -+ return k->k.type == KEY_TYPE_btree_ptr_v2 -+ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr -+ : NULL; -+} -+ -+/* is btree node in hash table? */ -+static inline bool btree_node_hashed(struct btree *b) -+{ -+ return b->hash_val != 0; -+} -+ -+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ -+ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ -+ &(_c)->btree_cache.table), \ -+ _iter = 0; _iter < (_tbl)->size; _iter++) \ -+ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) -+ -+static inline size_t btree_bytes(struct bch_fs *c) -+{ -+ return c->opts.btree_node_size; -+} -+ -+static inline size_t btree_max_u64s(struct bch_fs *c) -+{ -+ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); -+} -+ -+static inline size_t btree_pages(struct bch_fs *c) -+{ -+ return btree_bytes(c) / PAGE_SIZE; -+} -+ -+static inline unsigned btree_blocks(struct bch_fs *c) -+{ -+ return btree_sectors(c) >> c->block_bits; -+} -+ -+#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) -+ -+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) -+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ -+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ -+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) -+ -+static inline unsigned btree_id_nr_alive(struct bch_fs *c) -+{ -+ return BTREE_ID_NR + c->btree_roots_extra.nr; -+} -+ -+static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id) -+{ -+ if (likely(id < BTREE_ID_NR)) { -+ return &c->btree_roots_known[id]; -+ } else { -+ unsigned idx = id - BTREE_ID_NR; -+ -+ EBUG_ON(idx >= c->btree_roots_extra.nr); -+ return &c->btree_roots_extra.data[idx]; -+ } -+} -+ -+static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) -+{ -+ return bch2_btree_id_root(c, b->c.btree_id)->b; -+} -+ -+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, -+ const struct btree *); -+void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); -+ -+#endif /* _BCACHEFS_BTREE_CACHE_H */ -diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c -new file mode 100644 -index 000000000..83dcd9eb2 ---- /dev/null -+++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,2127 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Copyright (C) 2010 Kent Overstreet -+ * Copyright (C) 2014 Datera Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "bkey_buf.h" -+#include "btree_journal_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "debug.h" -+#include "ec.h" -+#include "error.h" -+#include "extents.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "recovery.h" -+#include "reflink.h" -+#include "replicas.h" -+#include "super-io.h" -+#include "trace.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define DROP_THIS_NODE 10 -+#define DROP_PREV_NODE 11 -+ -+static bool should_restart_for_topology_repair(struct bch_fs *c) -+{ -+ return c->opts.fix_errors != FSCK_FIX_no && -+ !(c->recovery_passes_complete & BIT_ULL(BCH_RECOVERY_PASS_check_topology)); -+} -+ -+static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -+{ -+ preempt_disable(); -+ write_seqcount_begin(&c->gc_pos_lock); -+ c->gc_pos = new_pos; -+ write_seqcount_end(&c->gc_pos_lock); -+ preempt_enable(); -+} -+ -+static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -+{ -+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); -+ __gc_pos_set(c, new_pos); -+} -+ -+/* -+ * Missing: if an interior btree node is empty, we need to do something - -+ * perhaps just kill it -+ */ -+static int bch2_gc_check_topology(struct bch_fs *c, -+ struct btree *b, -+ struct bkey_buf *prev, -+ struct bkey_buf cur, -+ bool is_last) -+{ -+ struct bpos node_start = b->data->min_key; -+ struct bpos node_end = b->data->max_key; -+ struct bpos expected_start = bkey_deleted(&prev->k->k) -+ ? 
node_start -+ : bpos_successor(prev->k->k.p); -+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; -+ int ret = 0; -+ -+ if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); -+ -+ if (!bpos_eq(expected_start, bp->v.min_key)) { -+ bch2_topology_error(c); -+ -+ if (bkey_deleted(&prev->k->k)) { -+ prt_printf(&buf1, "start of node: "); -+ bch2_bpos_to_text(&buf1, node_start); -+ } else { -+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); -+ } -+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); -+ -+ if (__fsck_err(c, -+ FSCK_CAN_FIX| -+ FSCK_CAN_IGNORE| -+ FSCK_NO_RATELIMIT, -+ "btree node with incorrect min_key at btree %s level %u:\n" -+ " prev %s\n" -+ " cur %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1.buf, buf2.buf) && -+ should_restart_for_topology_repair(c)) { -+ bch_info(c, "Halting mark and sweep to start topology repair pass"); -+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); -+ goto err; -+ } else { -+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); -+ } -+ } -+ } -+ -+ if (is_last && !bpos_eq(cur.k->k.p, node_end)) { -+ bch2_topology_error(c); -+ -+ printbuf_reset(&buf1); -+ printbuf_reset(&buf2); -+ -+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); -+ bch2_bpos_to_text(&buf2, node_end); -+ -+ if (__fsck_err(c, -+ FSCK_CAN_FIX| -+ FSCK_CAN_IGNORE| -+ FSCK_NO_RATELIMIT, -+ "btree node with incorrect max_key at btree %s level %u:\n" -+ " %s\n" -+ " expected %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1.buf, buf2.buf) && -+ should_restart_for_topology_repair(c)) { -+ bch_info(c, "Halting mark and sweep to start topology repair pass"); -+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); -+ goto err; -+ } else { -+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); -+ } -+ } -+ -+ bch2_bkey_buf_copy(prev, c, cur.k); -+err: -+fsck_err: -+ printbuf_exit(&buf2); -+ printbuf_exit(&buf1); -+ return ret; -+} -+ -+static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) -+{ -+ switch (b->key.k.type) { -+ case KEY_TYPE_btree_ptr: { -+ struct bkey_i_btree_ptr *src = bkey_i_to_btree_ptr(&b->key); -+ -+ dst->k.p = src->k.p; -+ dst->v.mem_ptr = 0; -+ dst->v.seq = b->data->keys.seq; -+ dst->v.sectors_written = 0; -+ dst->v.flags = 0; -+ dst->v.min_key = b->data->min_key; -+ set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k)); -+ memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k)); -+ break; -+ } -+ case KEY_TYPE_btree_ptr_v2: -+ bkey_copy(&dst->k_i, &b->key); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static void bch2_btree_node_update_key_early(struct btree_trans *trans, -+ enum btree_id btree, unsigned level, -+ struct bkey_s_c old, struct bkey_i *new) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b; -+ struct bkey_buf tmp; -+ int ret; -+ -+ bch2_bkey_buf_init(&tmp); -+ bch2_bkey_buf_reassemble(&tmp, c, old); -+ -+ b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); -+ if (!IS_ERR_OR_NULL(b)) { -+ mutex_lock(&c->btree_cache.lock); -+ -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ bkey_copy(&b->key, new); -+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -+ BUG_ON(ret); -+ -+ mutex_unlock(&c->btree_cache.lock); -+ six_unlock_read(&b->c.lock); -+ } -+ -+ bch2_bkey_buf_exit(&tmp, c); -+} -+ -+static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) -+{ -+ struct bkey_i_btree_ptr_v2 *new; -+ int ret; -+ -+ new = 
kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); -+ if (!new) -+ return -BCH_ERR_ENOMEM_gc_repair_key; -+ -+ btree_ptr_to_v2(b, new); -+ b->data->min_key = new_min; -+ new->v.min_key = new_min; -+ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); -+ -+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); -+ if (ret) { -+ kfree(new); -+ return ret; -+ } -+ -+ bch2_btree_node_drop_keys_outside_node(b); -+ bkey_copy(&b->key, &new->k_i); -+ return 0; -+} -+ -+static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) -+{ -+ struct bkey_i_btree_ptr_v2 *new; -+ int ret; -+ -+ ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); -+ if (ret) -+ return ret; -+ -+ new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); -+ if (!new) -+ return -BCH_ERR_ENOMEM_gc_repair_key; -+ -+ btree_ptr_to_v2(b, new); -+ b->data->max_key = new_max; -+ new->k.p = new_max; -+ SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); -+ -+ ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); -+ if (ret) { -+ kfree(new); -+ return ret; -+ } -+ -+ bch2_btree_node_drop_keys_outside_node(b); -+ -+ mutex_lock(&c->btree_cache.lock); -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ bkey_copy(&b->key, &new->k_i); -+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -+ BUG_ON(ret); -+ mutex_unlock(&c->btree_cache.lock); -+ return 0; -+} -+ -+static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, -+ struct btree *prev, struct btree *cur) -+{ -+ struct bpos expected_start = !prev -+ ? b->data->min_key -+ : bpos_successor(prev->key.k.p); -+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; -+ int ret = 0; -+ -+ if (!prev) { -+ prt_printf(&buf1, "start of node: "); -+ bch2_bpos_to_text(&buf1, b->data->min_key); -+ } else { -+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); -+ } -+ -+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); -+ -+ if (prev && -+ bpos_gt(expected_start, cur->data->min_key) && -+ BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { -+ /* cur overwrites prev: */ -+ -+ if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, -+ cur->data->min_key), c, -+ "btree node overwritten by next node at btree %s level %u:\n" -+ " node %s\n" -+ " next %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1.buf, buf2.buf)) { -+ ret = DROP_PREV_NODE; -+ goto out; -+ } -+ -+ if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, -+ bpos_predecessor(cur->data->min_key)), c, -+ "btree node with incorrect max_key at btree %s level %u:\n" -+ " node %s\n" -+ " next %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1.buf, buf2.buf)) -+ ret = set_node_max(c, prev, -+ bpos_predecessor(cur->data->min_key)); -+ } else { -+ /* prev overwrites cur: */ -+ -+ if (mustfix_fsck_err_on(bpos_ge(expected_start, -+ cur->data->max_key), c, -+ "btree node overwritten by prev node at btree %s level %u:\n" -+ " prev %s\n" -+ " node %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1.buf, buf2.buf)) { -+ ret = DROP_THIS_NODE; -+ goto out; -+ } -+ -+ if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, -+ "btree node with incorrect min_key at btree %s level %u:\n" -+ " prev %s\n" -+ " node %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1.buf, buf2.buf)) -+ ret = set_node_min(c, cur, expected_start); -+ } -+out: -+fsck_err: -+ printbuf_exit(&buf2); -+ printbuf_exit(&buf1); -+ return ret; -+} -+ -+static int btree_repair_node_end(struct bch_fs 
*c, struct btree *b, -+ struct btree *child) -+{ -+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; -+ int ret = 0; -+ -+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); -+ bch2_bpos_to_text(&buf2, b->key.k.p); -+ -+ if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, -+ "btree node with incorrect max_key at btree %s level %u:\n" -+ " %s\n" -+ " expected %s", -+ bch2_btree_ids[b->c.btree_id], b->c.level, -+ buf1.buf, buf2.buf)) { -+ ret = set_node_max(c, child, b->key.k.p); -+ if (ret) -+ goto err; -+ } -+err: -+fsck_err: -+ printbuf_exit(&buf2); -+ printbuf_exit(&buf1); -+ return ret; -+} -+ -+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_and_journal_iter iter; -+ struct bkey_s_c k; -+ struct bkey_buf prev_k, cur_k; -+ struct btree *prev = NULL, *cur = NULL; -+ bool have_child, dropped_children = false; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ if (!b->c.level) -+ return 0; -+again: -+ prev = NULL; -+ have_child = dropped_children = false; -+ bch2_bkey_buf_init(&prev_k); -+ bch2_bkey_buf_init(&cur_k); -+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ BUG_ON(bpos_lt(k.k->p, b->data->min_key)); -+ BUG_ON(bpos_gt(k.k->p, b->data->max_key)); -+ -+ bch2_btree_and_journal_iter_advance(&iter); -+ bch2_bkey_buf_reassemble(&cur_k, c, k); -+ -+ cur = bch2_btree_node_get_noiter(trans, cur_k.k, -+ b->c.btree_id, b->c.level - 1, -+ false); -+ ret = PTR_ERR_OR_ZERO(cur); -+ -+ printbuf_reset(&buf); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); -+ -+ if (mustfix_fsck_err_on(ret == -EIO, c, -+ "Topology repair: unreadable btree node at btree %s level %u:\n" -+ " %s", -+ bch2_btree_ids[b->c.btree_id], -+ b->c.level - 1, -+ buf.buf)) { -+ bch2_btree_node_evict(trans, cur_k.k); -+ ret = bch2_journal_key_delete(c, b->c.btree_id, -+ b->c.level, cur_k.k->k.p); -+ cur = NULL; -+ if (ret) -+ break; -+ continue; -+ } -+ -+ if (ret) { -+ bch_err_msg(c, ret, "getting btree node"); -+ break; -+ } -+ -+ ret = btree_repair_node_boundaries(c, b, prev, cur); -+ -+ if (ret == DROP_THIS_NODE) { -+ six_unlock_read(&cur->c.lock); -+ bch2_btree_node_evict(trans, cur_k.k); -+ ret = bch2_journal_key_delete(c, b->c.btree_id, -+ b->c.level, cur_k.k->k.p); -+ cur = NULL; -+ if (ret) -+ break; -+ continue; -+ } -+ -+ if (prev) -+ six_unlock_read(&prev->c.lock); -+ prev = NULL; -+ -+ if (ret == DROP_PREV_NODE) { -+ bch2_btree_node_evict(trans, prev_k.k); -+ ret = bch2_journal_key_delete(c, b->c.btree_id, -+ b->c.level, prev_k.k->k.p); -+ if (ret) -+ break; -+ -+ bch2_btree_and_journal_iter_exit(&iter); -+ bch2_bkey_buf_exit(&prev_k, c); -+ bch2_bkey_buf_exit(&cur_k, c); -+ goto again; -+ } else if (ret) -+ break; -+ -+ prev = cur; -+ cur = NULL; -+ bch2_bkey_buf_copy(&prev_k, c, cur_k.k); -+ } -+ -+ if (!ret && !IS_ERR_OR_NULL(prev)) { -+ BUG_ON(cur); -+ ret = btree_repair_node_end(c, b, prev); -+ } -+ -+ if (!IS_ERR_OR_NULL(prev)) -+ six_unlock_read(&prev->c.lock); -+ prev = NULL; -+ if (!IS_ERR_OR_NULL(cur)) -+ six_unlock_read(&cur->c.lock); -+ cur = NULL; -+ -+ if (ret) -+ goto err; -+ -+ bch2_btree_and_journal_iter_exit(&iter); -+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ bch2_bkey_buf_reassemble(&cur_k, c, k); -+ bch2_btree_and_journal_iter_advance(&iter); -+ -+ cur = bch2_btree_node_get_noiter(trans, cur_k.k, -+ 
b->c.btree_id, b->c.level - 1, -+ false); -+ ret = PTR_ERR_OR_ZERO(cur); -+ -+ if (ret) { -+ bch_err_msg(c, ret, "getting btree node"); -+ goto err; -+ } -+ -+ ret = bch2_btree_repair_topology_recurse(trans, cur); -+ six_unlock_read(&cur->c.lock); -+ cur = NULL; -+ -+ if (ret == DROP_THIS_NODE) { -+ bch2_btree_node_evict(trans, cur_k.k); -+ ret = bch2_journal_key_delete(c, b->c.btree_id, -+ b->c.level, cur_k.k->k.p); -+ dropped_children = true; -+ } -+ -+ if (ret) -+ goto err; -+ -+ have_child = true; -+ } -+ -+ printbuf_reset(&buf); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); -+ -+ if (mustfix_fsck_err_on(!have_child, c, -+ "empty interior btree node at btree %s level %u\n" -+ " %s", -+ bch2_btree_ids[b->c.btree_id], -+ b->c.level, buf.buf)) -+ ret = DROP_THIS_NODE; -+err: -+fsck_err: -+ if (!IS_ERR_OR_NULL(prev)) -+ six_unlock_read(&prev->c.lock); -+ if (!IS_ERR_OR_NULL(cur)) -+ six_unlock_read(&cur->c.lock); -+ -+ bch2_btree_and_journal_iter_exit(&iter); -+ bch2_bkey_buf_exit(&prev_k, c); -+ bch2_bkey_buf_exit(&cur_k, c); -+ -+ if (!ret && dropped_children) -+ goto again; -+ -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+int bch2_check_topology(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree *b; -+ unsigned i; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { -+ struct btree_root *r = bch2_btree_id_root(c, i); -+ -+ if (!r->alive) -+ continue; -+ -+ b = r->b; -+ if (btree_node_fake(b)) -+ continue; -+ -+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); -+ ret = bch2_btree_repair_topology_recurse(&trans, b); -+ six_unlock_read(&b->c.lock); -+ -+ if (ret == DROP_THIS_NODE) { -+ bch_err(c, "empty btree root - repair unimplemented"); -+ ret = -BCH_ERR_fsck_repair_unimplemented; -+ } -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id, -+ unsigned level, bool is_root, -+ struct bkey_s_c *k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p = { 0 }; -+ bool do_update = false; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ /* -+ * XXX -+ * use check_bucket_ref here -+ */ -+ bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); -+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); -+ -+ if (!g->gen_valid && -+ (c->opts.reconstruct_alloc || -+ fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), -+ bch2_data_types[ptr_data_type(k->k, &p.ptr)], -+ p.ptr.gen, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { -+ if (!p.ptr.cached) { -+ g->gen_valid = true; -+ g->gen = p.ptr.gen; -+ } else { -+ do_update = true; -+ } -+ } -+ -+ if (gen_cmp(p.ptr.gen, g->gen) > 0 && -+ (c->opts.reconstruct_alloc || -+ fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), -+ bch2_data_types[ptr_data_type(k->k, &p.ptr)], -+ p.ptr.gen, g->gen, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { -+ if (!p.ptr.cached) { -+ g->gen_valid = true; -+ g->gen = p.ptr.gen; -+ g->data_type = 0; -+ g->dirty_sectors = 0; -+ g->cached_sectors = 0; -+ 
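/*
 * [Editorial aside -- illustration only, not part of the patch content.]
 * The gen_cmp()/gen_after() checks in bch2_check_fix_ptrs() here compare
 * 8-bit bucket generation numbers that are allowed to wrap around, so
 * "newer" cannot be tested with a plain '>'.  A minimal standalone sketch of
 * the wraparound-safe idiom, assuming gen_cmp() is the usual
 * signed-difference helper defined elsewhere in this series (the sketch's
 * names and main() are mine; build with any C compiler):
 */
#include <stdint.h>
#include <stdio.h>

static int gen_cmp(uint8_t a, uint8_t b)
{
	/* positive iff a is "after" b, interpreting the difference mod 256 */
	return (int8_t) (a - b);
}

int main(void)
{
	printf("%d\n", gen_cmp(1, 255) > 0);	/* 1: gen 1 is newer once the 8-bit counter wrapped */
	printf("%d\n", gen_cmp(250, 3) > 0);	/* 0: gen 250 is older than gen 3 here */
	return 0;
}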
set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); -+ } else { -+ do_update = true; -+ } -+ } -+ -+ if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && -+ (c->opts.reconstruct_alloc || -+ fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, -+ bch2_data_types[ptr_data_type(k->k, &p.ptr)], -+ p.ptr.gen, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) -+ do_update = true; -+ -+ if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && -+ (c->opts.reconstruct_alloc || -+ fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), -+ bch2_data_types[ptr_data_type(k->k, &p.ptr)], -+ p.ptr.gen, g->gen, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) -+ do_update = true; -+ -+ if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) -+ continue; -+ -+ if (fsck_err_on(bucket_data_type(g->data_type) && -+ bucket_data_type(g->data_type) != data_type, c, -+ "bucket %u:%zu different types of data in same bucket: %s, %s\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), -+ bch2_data_types[g->data_type], -+ bch2_data_types[data_type], -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { -+ if (data_type == BCH_DATA_btree) { -+ g->data_type = data_type; -+ set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); -+ } else { -+ do_update = true; -+ } -+ } -+ -+ if (p.has_ec) { -+ struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); -+ -+ if (fsck_err_on(!m || !m->alive, c, -+ "pointer to nonexistent stripe %llu\n" -+ "while marking %s", -+ (u64) p.ec.idx, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) -+ do_update = true; -+ -+ if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, -+ "pointer does not match stripe %llu\n" -+ "while marking %s", -+ (u64) p.ec.idx, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) -+ do_update = true; -+ } -+ } -+ -+ if (do_update) { -+ struct bkey_ptrs ptrs; -+ union bch_extent_entry *entry; -+ struct bch_extent_ptr *ptr; -+ struct bkey_i *new; -+ -+ if (is_root) { -+ bch_err(c, "cannot update btree roots yet"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); -+ if (!new) { -+ bch_err_msg(c, ret, "allocating new key"); -+ ret = -BCH_ERR_ENOMEM_gc_repair_key; -+ goto err; -+ } -+ -+ bkey_reassemble(new, *k); -+ -+ if (level) { -+ /* -+ * We don't want to drop btree node pointers - if the -+ * btree node isn't there anymore, the read path will -+ * sort it out: -+ */ -+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_GC_BUCKET(ca, ptr); -+ -+ ptr->gen = g->gen; -+ } -+ } else { -+ bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_GC_BUCKET(ca, ptr); -+ enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); -+ -+ (ptr->cached && -+ (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || -+ (!ptr->cached && -+ gen_cmp(ptr->gen, g->gen) < 0) || -+ gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || -+ (g->data_type && -+ g->data_type != data_type); -+ })); -+again: -+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); -+ bkey_extent_entry_for_each(ptrs, entry) { -+ if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { -+ struct 
gc_stripe *m = genradix_ptr(&c->gc_stripes, -+ entry->stripe_ptr.idx); -+ union bch_extent_entry *next_ptr; -+ -+ bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) -+ if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) -+ goto found; -+ next_ptr = NULL; -+found: -+ if (!next_ptr) { -+ bch_err(c, "aieee, found stripe ptr with no data ptr"); -+ continue; -+ } -+ -+ if (!m || !m->alive || -+ !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], -+ &next_ptr->ptr, -+ m->sectors)) { -+ bch2_bkey_extent_entry_drop(new, entry); -+ goto again; -+ } -+ } -+ } -+ } -+ -+ ret = bch2_journal_key_insert_take(c, btree_id, level, new); -+ if (ret) { -+ kfree(new); -+ goto err; -+ } -+ -+ if (level) -+ bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); -+ -+ if (0) { -+ printbuf_reset(&buf); -+ bch2_bkey_val_to_text(&buf, c, *k); -+ bch_info(c, "updated %s", buf.buf); -+ -+ printbuf_reset(&buf); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); -+ bch_info(c, "new key %s", buf.buf); -+ } -+ -+ *k = bkey_i_to_s_c(new); -+ } -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+/* marking of btree keys/nodes: */ -+ -+static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, -+ unsigned level, bool is_root, -+ struct bkey_s_c *k, -+ bool initial) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey deleted = KEY(0, 0, 0); -+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; -+ unsigned flags = -+ BTREE_TRIGGER_GC| -+ (initial ? BTREE_TRIGGER_NOATOMIC : 0); -+ int ret = 0; -+ -+ deleted.p = k->k->p; -+ -+ if (initial) { -+ BUG_ON(bch2_journal_seq_verify && -+ k->k->version.lo > atomic64_read(&c->journal.seq)); -+ -+ ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); -+ if (ret) -+ goto err; -+ -+ if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, -+ "key version number higher than recorded: %llu > %llu", -+ k->k->version.lo, -+ atomic64_read(&c->key_version))) -+ atomic64_set(&c->key_version, k->k->version.lo); -+ } -+ -+ ret = commit_do(trans, NULL, NULL, 0, -+ bch2_mark_key(trans, btree_id, level, old, *k, flags)); -+fsck_err: -+err: -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_node_iter iter; -+ struct bkey unpacked; -+ struct bkey_s_c k; -+ struct bkey_buf prev, cur; -+ int ret = 0; -+ -+ if (!btree_node_type_needs_gc(btree_node_type(b))) -+ return 0; -+ -+ bch2_btree_node_iter_init_from_start(&iter, b); -+ bch2_bkey_buf_init(&prev); -+ bch2_bkey_buf_init(&cur); -+ bkey_init(&prev.k->k); -+ -+ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { -+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, -+ &k, initial); -+ if (ret) -+ break; -+ -+ bch2_btree_node_iter_advance(&iter, b); -+ -+ if (b->c.level) { -+ bch2_bkey_buf_reassemble(&cur, c, k); -+ -+ ret = bch2_gc_check_topology(c, b, &prev, cur, -+ bch2_btree_node_iter_end(&iter)); -+ if (ret) -+ break; -+ } -+ } -+ -+ bch2_bkey_buf_exit(&cur, c); -+ bch2_bkey_buf_exit(&prev, c); -+ return ret; -+} -+ -+static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, -+ bool initial, bool metadata_only) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct btree *b; -+ unsigned depth = metadata_only ? 
1 : 0; -+ int ret = 0; -+ -+ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); -+ -+ __for_each_btree_node(trans, iter, btree_id, POS_MIN, -+ 0, depth, BTREE_ITER_PREFETCH, b, ret) { -+ bch2_verify_btree_nr_keys(b); -+ -+ gc_pos_set(c, gc_pos_btree_node(b)); -+ -+ ret = btree_gc_mark_node(trans, b, initial); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (ret) -+ return ret; -+ -+ mutex_lock(&c->btree_root_lock); -+ b = bch2_btree_id_root(c, btree_id)->b; -+ if (!btree_node_fake(b)) { -+ struct bkey_s_c k = bkey_i_to_s_c(&b->key); -+ -+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, -+ true, &k, initial); -+ } -+ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); -+ mutex_unlock(&c->btree_root_lock); -+ -+ return ret; -+} -+ -+static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, -+ unsigned target_depth) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_and_journal_iter iter; -+ struct bkey_s_c k; -+ struct bkey_buf cur, prev; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); -+ bch2_bkey_buf_init(&prev); -+ bch2_bkey_buf_init(&cur); -+ bkey_init(&prev.k->k); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ BUG_ON(bpos_lt(k.k->p, b->data->min_key)); -+ BUG_ON(bpos_gt(k.k->p, b->data->max_key)); -+ -+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, -+ false, &k, true); -+ if (ret) -+ goto fsck_err; -+ -+ if (b->c.level) { -+ bch2_bkey_buf_reassemble(&cur, c, k); -+ k = bkey_i_to_s_c(cur.k); -+ -+ bch2_btree_and_journal_iter_advance(&iter); -+ -+ ret = bch2_gc_check_topology(c, b, -+ &prev, cur, -+ !bch2_btree_and_journal_iter_peek(&iter).k); -+ if (ret) -+ goto fsck_err; -+ } else { -+ bch2_btree_and_journal_iter_advance(&iter); -+ } -+ } -+ -+ if (b->c.level > target_depth) { -+ bch2_btree_and_journal_iter_exit(&iter); -+ bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ struct btree *child; -+ -+ bch2_bkey_buf_reassemble(&cur, c, k); -+ bch2_btree_and_journal_iter_advance(&iter); -+ -+ child = bch2_btree_node_get_noiter(trans, cur.k, -+ b->c.btree_id, b->c.level - 1, -+ false); -+ ret = PTR_ERR_OR_ZERO(child); -+ -+ if (ret == -EIO) { -+ bch2_topology_error(c); -+ -+ if (__fsck_err(c, -+ FSCK_CAN_FIX| -+ FSCK_CAN_IGNORE| -+ FSCK_NO_RATELIMIT, -+ "Unreadable btree node at btree %s level %u:\n" -+ " %s", -+ bch2_btree_ids[b->c.btree_id], -+ b->c.level - 1, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && -+ should_restart_for_topology_repair(c)) { -+ bch_info(c, "Halting mark and sweep to start topology repair pass"); -+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology); -+ goto fsck_err; -+ } else { -+ /* Continue marking when opted to not -+ * fix the error: */ -+ ret = 0; -+ set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); -+ continue; -+ } -+ } else if (ret) { -+ bch_err_msg(c, ret, "getting btree node"); -+ break; -+ } -+ -+ ret = bch2_gc_btree_init_recurse(trans, child, -+ target_depth); -+ six_unlock_read(&child->c.lock); -+ -+ if (ret) -+ break; -+ } -+ } -+fsck_err: -+ bch2_bkey_buf_exit(&cur, c); -+ bch2_bkey_buf_exit(&prev, c); -+ bch2_btree_and_journal_iter_exit(&iter); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int bch2_gc_btree_init(struct btree_trans *trans, -+ enum btree_id btree_id, -+ bool metadata_only) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b; 
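/*
 * [Editorial aside -- illustration only, not part of the patch content.]
 * The gc_pos_set(c, gc_pos_btree(...)) / gc_pos_btree_node(b) calls in
 * bch2_gc_btree() just above rely on a total ordering of GC positions; the
 * actual comparator is gc_pos_cmp() in btree_gc.h later in this patch, a
 * lexicographic compare of (phase, pos, level).  A simplified standalone
 * sketch, with struct bpos reduced to a single u64 and helper names of my
 * own choosing:
 */
#include <stdint.h>
#include <stdio.h>

struct gc_pos_sketch {
	unsigned phase;		/* GC_PHASE_* ordinal          */
	uint64_t pos;		/* stand-in for struct bpos    */
	unsigned level;
};

/* <0 / 0 / >0, in the spirit of the kernel's cmp_int()/bpos_cmp() helpers */
static int cmp_u64(uint64_t l, uint64_t r)
{
	return (l > r) - (l < r);
}

static int gc_pos_cmp_sketch(struct gc_pos_sketch l, struct gc_pos_sketch r)
{
	int ret = cmp_u64(l.phase, r.phase);	/* earlier phase sorts first */

	if (!ret)
		ret = cmp_u64(l.pos, r.pos);	/* then by key position      */
	if (!ret)
		ret = cmp_u64(l.level, r.level);/* then by btree level       */
	return ret;
}

int main(void)
{
	struct gc_pos_sketch sb   = { .phase = 1, .pos = 0,  .level = 0 };
	struct gc_pos_sketch node = { .phase = 3, .pos = 42, .level = 1 };

	/* anything in an earlier phase compares before any later-phase position */
	printf("%d\n", gc_pos_cmp_sketch(sb, node) < 0);	/* prints 1 */
	return 0;
}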
-+ unsigned target_depth = metadata_only ? 1 : 0; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ b = bch2_btree_id_root(c, btree_id)->b; -+ -+ if (btree_node_fake(b)) -+ return 0; -+ -+ six_lock_read(&b->c.lock, NULL, NULL); -+ printbuf_reset(&buf); -+ bch2_bpos_to_text(&buf, b->data->min_key); -+ if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, -+ "btree root with incorrect min_key: %s", buf.buf)) { -+ bch_err(c, "repair unimplemented"); -+ ret = -BCH_ERR_fsck_repair_unimplemented; -+ goto fsck_err; -+ } -+ -+ printbuf_reset(&buf); -+ bch2_bpos_to_text(&buf, b->data->max_key); -+ if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, -+ "btree root with incorrect max_key: %s", buf.buf)) { -+ bch_err(c, "repair unimplemented"); -+ ret = -BCH_ERR_fsck_repair_unimplemented; -+ goto fsck_err; -+ } -+ -+ if (b->c.level >= target_depth) -+ ret = bch2_gc_btree_init_recurse(trans, b, target_depth); -+ -+ if (!ret) { -+ struct bkey_s_c k = bkey_i_to_s_c(&b->key); -+ -+ ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true, -+ &k, true); -+ } -+fsck_err: -+ six_unlock_read(&b->c.lock); -+ -+ if (ret < 0) -+ bch_err_fn(c, ret); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) -+{ -+ return (int) btree_id_to_gc_phase(l) - -+ (int) btree_id_to_gc_phase(r); -+} -+ -+static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) -+{ -+ struct btree_trans trans; -+ enum btree_id ids[BTREE_ID_NR]; -+ unsigned i; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ if (initial) -+ trans.is_initial_gc = true; -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ ids[i] = i; -+ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); -+ -+ for (i = 0; i < BTREE_ID_NR && !ret; i++) -+ ret = initial -+ ? bch2_gc_btree_init(&trans, ids[i], metadata_only) -+ : bch2_gc_btree(&trans, ids[i], initial, metadata_only); -+ -+ for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { -+ if (!bch2_btree_id_root(c, i)->alive) -+ continue; -+ -+ ret = initial -+ ? 
bch2_gc_btree_init(&trans, i, metadata_only) -+ : bch2_gc_btree(&trans, i, initial, metadata_only); -+ } -+ -+ if (ret < 0) -+ bch_err_fn(c, ret); -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, -+ u64 start, u64 end, -+ enum bch_data_type type, -+ unsigned flags) -+{ -+ u64 b = sector_to_bucket(ca, start); -+ -+ do { -+ unsigned sectors = -+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; -+ -+ bch2_mark_metadata_bucket(c, ca, b, type, sectors, -+ gc_phase(GC_PHASE_SB), flags); -+ b++; -+ start += sectors; -+ } while (start < end); -+} -+ -+static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, -+ unsigned flags) -+{ -+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; -+ unsigned i; -+ u64 b; -+ -+ for (i = 0; i < layout->nr_superblocks; i++) { -+ u64 offset = le64_to_cpu(layout->sb_offset[i]); -+ -+ if (offset == BCH_SB_SECTOR) -+ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, -+ BCH_DATA_sb, flags); -+ -+ mark_metadata_sectors(c, ca, offset, -+ offset + (1 << layout->sb_max_size_bits), -+ BCH_DATA_sb, flags); -+ } -+ -+ for (i = 0; i < ca->journal.nr; i++) { -+ b = ca->journal.buckets[i]; -+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, -+ ca->mi.bucket_size, -+ gc_phase(GC_PHASE_SB), flags); -+ } -+} -+ -+static void bch2_mark_superblocks(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ mutex_lock(&c->sb_lock); -+ gc_pos_set(c, gc_phase(GC_PHASE_SB)); -+ -+ for_each_online_member(ca, c, i) -+ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); -+ mutex_unlock(&c->sb_lock); -+} -+ -+#if 0 -+/* Also see bch2_pending_btree_node_free_insert_done() */ -+static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) -+{ -+ struct btree_update *as; -+ struct pending_btree_node_free *d; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); -+ -+ for_each_pending_btree_node_free(c, as, d) -+ if (d->index_update_done) -+ bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+#endif -+ -+static void bch2_gc_free(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ genradix_free(&c->reflink_gc_table); -+ genradix_free(&c->gc_stripes); -+ -+ for_each_member_device(ca, c, i) { -+ kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), -+ sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket)); -+ ca->buckets_gc = NULL; -+ -+ free_percpu(ca->usage_gc); -+ ca->usage_gc = NULL; -+ } -+ -+ free_percpu(c->usage_gc); -+ c->usage_gc = NULL; -+} -+ -+static int bch2_gc_done(struct bch_fs *c, -+ bool initial, bool metadata_only) -+{ -+ struct bch_dev *ca = NULL; -+ struct printbuf buf = PRINTBUF; -+ bool verify = !metadata_only && -+ !c->opts.reconstruct_alloc && -+ (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); -+ unsigned i, dev; -+ int ret = 0; -+ -+ percpu_down_write(&c->mark_lock); -+ -+#define copy_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f && \ -+ (!verify || \ -+ fsck_err(c, _msg ": got %llu, should be %llu" \ -+ , ##__VA_ARGS__, dst->_f, src->_f))) \ -+ dst->_f = src->_f -+#define copy_stripe_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f && \ -+ (!verify || \ -+ fsck_err(c, "stripe %zu has wrong "_msg \ -+ ": got %u, should be %u", \ -+ iter.pos, ##__VA_ARGS__, \ -+ dst->_f, src->_f))) \ -+ dst->_f = src->_f -+#define copy_dev_field(_f, _msg, ...) 
\ -+ copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) -+#define copy_fs_field(_f, _msg, ...) \ -+ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) -+ -+ for (i = 0; i < ARRAY_SIZE(c->usage); i++) -+ bch2_fs_usage_acc_to_base(c, i); -+ -+ for_each_member_device(ca, c, dev) { -+ struct bch_dev_usage *dst = ca->usage_base; -+ struct bch_dev_usage *src = (void *) -+ bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, -+ dev_usage_u64s()); -+ -+ copy_dev_field(buckets_ec, "buckets_ec"); -+ -+ for (i = 0; i < BCH_DATA_NR; i++) { -+ copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); -+ copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); -+ copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); -+ } -+ }; -+ -+ { -+ unsigned nr = fs_usage_u64s(c); -+ struct bch_fs_usage *dst = c->usage_base; -+ struct bch_fs_usage *src = (void *) -+ bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); -+ -+ copy_fs_field(hidden, "hidden"); -+ copy_fs_field(btree, "btree"); -+ -+ if (!metadata_only) { -+ copy_fs_field(data, "data"); -+ copy_fs_field(cached, "cached"); -+ copy_fs_field(reserved, "reserved"); -+ copy_fs_field(nr_inodes,"nr_inodes"); -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ copy_fs_field(persistent_reserved[i], -+ "persistent_reserved[%i]", i); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ if (metadata_only && -+ (e->data_type == BCH_DATA_user || -+ e->data_type == BCH_DATA_cached)) -+ continue; -+ -+ printbuf_reset(&buf); -+ bch2_replicas_entry_to_text(&buf, e); -+ -+ copy_fs_field(replicas[i], "%s", buf.buf); -+ } -+ } -+ -+#undef copy_fs_field -+#undef copy_dev_field -+#undef copy_stripe_field -+#undef copy_field -+fsck_err: -+ if (ca) -+ percpu_ref_put(&ca->ref); -+ if (ret) -+ bch_err_fn(c, ret); -+ -+ percpu_up_write(&c->mark_lock); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int bch2_gc_start(struct bch_fs *c) -+{ -+ struct bch_dev *ca = NULL; -+ unsigned i; -+ -+ BUG_ON(c->usage_gc); -+ -+ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), -+ sizeof(u64), GFP_KERNEL); -+ if (!c->usage_gc) { -+ bch_err(c, "error allocating c->usage_gc"); -+ return -BCH_ERR_ENOMEM_gc_start; -+ } -+ -+ for_each_member_device(ca, c, i) { -+ BUG_ON(ca->usage_gc); -+ -+ ca->usage_gc = alloc_percpu(struct bch_dev_usage); -+ if (!ca->usage_gc) { -+ bch_err(c, "error allocating ca->usage_gc"); -+ percpu_ref_put(&ca->ref); -+ return -BCH_ERR_ENOMEM_gc_start; -+ } -+ -+ this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, -+ ca->mi.nbuckets - ca->mi.first_bucket); -+ } -+ -+ return 0; -+} -+ -+static int bch2_gc_reset(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ for_each_member_device(ca, c, i) { -+ free_percpu(ca->usage_gc); -+ ca->usage_gc = NULL; -+ } -+ -+ free_percpu(c->usage_gc); -+ c->usage_gc = NULL; -+ -+ return bch2_gc_start(c); -+} -+ -+/* returns true if not equal */ -+static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, -+ struct bch_alloc_v4 r) -+{ -+ return l.gen != r.gen || -+ l.oldest_gen != r.oldest_gen || -+ l.data_type != r.data_type || -+ l.dirty_sectors != r.dirty_sectors || -+ l.cached_sectors != r.cached_sectors || -+ l.stripe_redundancy != r.stripe_redundancy || -+ l.stripe != r.stripe; -+} -+ -+static int bch2_alloc_write_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ bool metadata_only) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca = 
bch_dev_bkey_exists(c, iter->pos.inode); -+ struct bucket gc, *b; -+ struct bkey_i_alloc_v4 *a; -+ struct bch_alloc_v4 old_convert, new; -+ const struct bch_alloc_v4 *old; -+ enum bch_data_type type; -+ int ret; -+ -+ if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets))) -+ return 1; -+ -+ old = bch2_alloc_to_v4(k, &old_convert); -+ new = *old; -+ -+ percpu_down_read(&c->mark_lock); -+ b = gc_bucket(ca, iter->pos.offset); -+ -+ /* -+ * b->data_type doesn't yet include need_discard & need_gc_gen states - -+ * fix that here: -+ */ -+ type = __alloc_data_type(b->dirty_sectors, -+ b->cached_sectors, -+ b->stripe, -+ *old, -+ b->data_type); -+ if (b->data_type != type) { -+ struct bch_dev_usage *u; -+ -+ preempt_disable(); -+ u = this_cpu_ptr(ca->usage_gc); -+ u->d[b->data_type].buckets--; -+ b->data_type = type; -+ u->d[b->data_type].buckets++; -+ preempt_enable(); -+ } -+ -+ gc = *b; -+ percpu_up_read(&c->mark_lock); -+ -+ if (metadata_only && -+ gc.data_type != BCH_DATA_sb && -+ gc.data_type != BCH_DATA_journal && -+ gc.data_type != BCH_DATA_btree) -+ return 0; -+ -+ if (gen_after(old->gen, gc.gen)) -+ return 0; -+ -+ if (c->opts.reconstruct_alloc || -+ fsck_err_on(new.data_type != gc.data_type, c, -+ "bucket %llu:%llu gen %u has wrong data_type" -+ ": got %s, should be %s", -+ iter->pos.inode, iter->pos.offset, -+ gc.gen, -+ bch2_data_types[new.data_type], -+ bch2_data_types[gc.data_type])) -+ new.data_type = gc.data_type; -+ -+#define copy_bucket_field(_f) \ -+ if (c->opts.reconstruct_alloc || \ -+ fsck_err_on(new._f != gc._f, c, \ -+ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ -+ ": got %u, should be %u", \ -+ iter->pos.inode, iter->pos.offset, \ -+ gc.gen, \ -+ bch2_data_types[gc.data_type], \ -+ new._f, gc._f)) \ -+ new._f = gc._f; \ -+ -+ copy_bucket_field(gen); -+ copy_bucket_field(dirty_sectors); -+ copy_bucket_field(cached_sectors); -+ copy_bucket_field(stripe_redundancy); -+ copy_bucket_field(stripe); -+#undef copy_bucket_field -+ -+ if (!bch2_alloc_v4_cmp(*old, new)) -+ return 0; -+ -+ a = bch2_alloc_to_v4_mut(trans, k); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ return ret; -+ -+ a->v = new; -+ -+ /* -+ * The trigger normally makes sure this is set, but we're not running -+ * triggers: -+ */ -+ if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) -+ a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); -+ -+ ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); -+fsck_err: -+ return ret; -+} -+ -+static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_member_device(ca, c, i) { -+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, -+ POS(ca->dev_idx, ca->mi.first_bucket), -+ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW, -+ bch2_alloc_write_key(&trans, &iter, k, metadata_only)); -+ -+ if (ret < 0) { -+ bch_err(c, "error writing alloc info: %s", bch2_err_str(ret)); -+ percpu_ref_put(&ca->ref); -+ break; -+ } -+ } -+ -+ bch2_trans_exit(&trans); -+ return ret < 0 ? 
ret : 0; -+} -+ -+static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) -+{ -+ struct bch_dev *ca; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bucket *g; -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a; -+ unsigned i; -+ int ret; -+ -+ for_each_member_device(ca, c, i) { -+ struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket), -+ GFP_KERNEL|__GFP_ZERO); -+ if (!buckets) { -+ percpu_ref_put(&ca->ref); -+ bch_err(c, "error allocating ca->buckets[gc]"); -+ return -BCH_ERR_ENOMEM_gc_alloc_start; -+ } -+ -+ buckets->first_bucket = ca->mi.first_bucket; -+ buckets->nbuckets = ca->mi.nbuckets; -+ rcu_assign_pointer(ca->buckets_gc, buckets); -+ }; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ g = gc_bucket(ca, k.k->p.offset); -+ -+ a = bch2_alloc_to_v4(k, &a_convert); -+ -+ g->gen_valid = 1; -+ g->gen = a->gen; -+ -+ if (metadata_only && -+ (a->data_type == BCH_DATA_user || -+ a->data_type == BCH_DATA_cached || -+ a->data_type == BCH_DATA_parity)) { -+ g->data_type = a->data_type; -+ g->dirty_sectors = a->dirty_sectors; -+ g->cached_sectors = a->cached_sectors; -+ g->stripe = a->stripe; -+ g->stripe_redundancy = a->stripe_redundancy; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret)); -+ -+ return ret; -+} -+ -+static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ for_each_member_device(ca, c, i) { -+ struct bucket_array *buckets = gc_bucket_array(ca); -+ struct bucket *g; -+ -+ for_each_bucket(g, buckets) { -+ if (metadata_only && -+ (g->data_type == BCH_DATA_user || -+ g->data_type == BCH_DATA_cached || -+ g->data_type == BCH_DATA_parity)) -+ continue; -+ g->data_type = 0; -+ g->dirty_sectors = 0; -+ g->cached_sectors = 0; -+ } -+ }; -+} -+ -+static int bch2_gc_write_reflink_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ size_t *idx) -+{ -+ struct bch_fs *c = trans->c; -+ const __le64 *refcount = bkey_refcount_c(k); -+ struct printbuf buf = PRINTBUF; -+ struct reflink_gc *r; -+ int ret = 0; -+ -+ if (!refcount) -+ return 0; -+ -+ while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && -+ r->offset < k.k->p.offset) -+ ++*idx; -+ -+ if (!r || -+ r->offset != k.k->p.offset || -+ r->size != k.k->size) { -+ bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); -+ return -EINVAL; -+ } -+ -+ if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, -+ "reflink key has wrong refcount:\n" -+ " %s\n" -+ " should be %u", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf), -+ r->refcount)) { -+ struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0); -+ -+ ret = PTR_ERR_OR_ZERO(new); -+ if (ret) -+ return ret; -+ -+ if (!r->refcount) -+ new->k.type = KEY_TYPE_deleted; -+ else -+ *bkey_refcount(new) = cpu_to_le64(r->refcount); -+ } -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ size_t idx = 0; -+ int ret = 0; -+ -+ if (metadata_only) -+ return 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ ret = for_each_btree_key_commit(&trans, 
iter, -+ BTREE_ID_reflink, POS_MIN, -+ BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL, -+ bch2_gc_write_reflink_key(&trans, &iter, k, &idx)); -+ -+ c->reflink_gc_nr = 0; -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int bch2_gc_reflink_start(struct bch_fs *c, -+ bool metadata_only) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct reflink_gc *r; -+ int ret = 0; -+ -+ if (metadata_only) -+ return 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ c->reflink_gc_nr = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ const __le64 *refcount = bkey_refcount_c(k); -+ -+ if (!refcount) -+ continue; -+ -+ r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, -+ GFP_KERNEL); -+ if (!r) { -+ ret = -BCH_ERR_ENOMEM_gc_reflink_start; -+ break; -+ } -+ -+ r->offset = k.k->p.offset; -+ r->size = k.k->size; -+ r->refcount = 0; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) -+{ -+ struct genradix_iter iter; -+ struct reflink_gc *r; -+ -+ genradix_for_each(&c->reflink_gc_table, iter, r) -+ r->refcount = 0; -+} -+ -+static int bch2_gc_write_stripes_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct printbuf buf = PRINTBUF; -+ const struct bch_stripe *s; -+ struct gc_stripe *m; -+ bool bad = false; -+ unsigned i; -+ int ret = 0; -+ -+ if (k.k->type != KEY_TYPE_stripe) -+ return 0; -+ -+ s = bkey_s_c_to_stripe(k).v; -+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset); -+ -+ for (i = 0; i < s->nr_blocks; i++) { -+ u32 old = stripe_blockcount_get(s, i); -+ u32 new = (m ? m->block_sectors[i] : 0); -+ -+ if (old != new) { -+ prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n", -+ i, old, new); -+ bad = true; -+ } -+ } -+ -+ if (bad) -+ bch2_bkey_val_to_text(&buf, c, k); -+ -+ if (fsck_err_on(bad, c, "%s", buf.buf)) { -+ struct bkey_i_stripe *new; -+ -+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ ret = PTR_ERR_OR_ZERO(new); -+ if (ret) -+ return ret; -+ -+ bkey_reassemble(&new->k_i, k); -+ -+ for (i = 0; i < new->v.nr_blocks; i++) -+ stripe_blockcount_set(&new->v, i, m ? 
m->block_sectors[i] : 0); -+ -+ ret = bch2_trans_update(trans, iter, &new->k_i, 0); -+ } -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ if (metadata_only) -+ return 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ ret = for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_stripes, POS_MIN, -+ BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL, -+ bch2_gc_write_stripes_key(&trans, &iter, k)); -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) -+{ -+ genradix_free(&c->gc_stripes); -+} -+ -+/** -+ * bch2_gc - walk _all_ references to buckets, and recompute them: -+ * -+ * Order matters here: -+ * - Concurrent GC relies on the fact that we have a total ordering for -+ * everything that GC walks - see gc_will_visit_node(), -+ * gc_will_visit_root() -+ * -+ * - also, references move around in the course of index updates and -+ * various other crap: everything needs to agree on the ordering -+ * references are allowed to move around in - e.g., we're allowed to -+ * start with a reference owned by an open_bucket (the allocator) and -+ * move it to the btree, but not the reverse. -+ * -+ * This is necessary to ensure that gc doesn't miss references that -+ * move around - if references move backwards in the ordering GC -+ * uses, GC could skip past them -+ */ -+int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) -+{ -+ unsigned iter = 0; -+ int ret; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ down_write(&c->gc_lock); -+ -+ bch2_btree_interior_updates_flush(c); -+ -+ ret = bch2_gc_start(c) ?: -+ bch2_gc_alloc_start(c, metadata_only) ?: -+ bch2_gc_reflink_start(c, metadata_only); -+ if (ret) -+ goto out; -+again: -+ gc_pos_set(c, gc_phase(GC_PHASE_START)); -+ -+ bch2_mark_superblocks(c); -+ -+ ret = bch2_gc_btrees(c, initial, metadata_only); -+ -+ if (ret) -+ goto out; -+ -+#if 0 -+ bch2_mark_pending_btree_node_frees(c); -+#endif -+ c->gc_count++; -+ -+ if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || -+ (!iter && bch2_test_restart_gc)) { -+ if (iter++ > 2) { -+ bch_info(c, "Unable to fix bucket gens, looping"); -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ /* -+ * XXX: make sure gens we fixed got saved -+ */ -+ bch_info(c, "Second GC pass needed, restarting:"); -+ clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); -+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); -+ -+ bch2_gc_stripes_reset(c, metadata_only); -+ bch2_gc_alloc_reset(c, metadata_only); -+ bch2_gc_reflink_reset(c, metadata_only); -+ ret = bch2_gc_reset(c); -+ if (ret) -+ goto out; -+ -+ /* flush fsck errors, reset counters */ -+ bch2_flush_fsck_errs(c); -+ goto again; -+ } -+out: -+ if (!ret) { -+ bch2_journal_block(&c->journal); -+ -+ ret = bch2_gc_stripes_done(c, metadata_only) ?: -+ bch2_gc_reflink_done(c, metadata_only) ?: -+ bch2_gc_alloc_done(c, metadata_only) ?: -+ bch2_gc_done(c, initial, metadata_only); -+ -+ bch2_journal_unblock(&c->journal); -+ } -+ -+ percpu_down_write(&c->mark_lock); -+ /* Indicates that gc is no longer in progress: */ -+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); -+ -+ bch2_gc_free(c); -+ percpu_up_write(&c->mark_lock); -+ -+ up_write(&c->gc_lock); -+ -+ /* -+ * At startup, allocations can happen directly instead of via the -+ * allocator thread - issue wakeup in case they blocked on gc_lock: -+ */ -+ 
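/*
 * [Editorial aside -- illustration only, not part of the patch content.]
 * The "ret = bch2_gc_start(c) ?: bch2_gc_alloc_start(...) ?: ..." chains in
 * bch2_gc() above use the GNU C conditional with an omitted middle operand:
 * "a ?: b" evaluates a once and yields it if nonzero, otherwise b.  Chained,
 * each step runs only while every previous step returned 0, so the first
 * error short-circuits the rest.  A minimal standalone sketch (gcc/clang;
 * the step names below are made up for illustration):
 */
#include <stdio.h>

static int step(const char *name, int err)
{
	printf("running %s\n", name);
	return err;
}

int main(void)
{
	int ret = step("gc_start", 0) ?:
		  step("alloc_start", -5) ?:	/* fails; the next step never runs */
		  step("reflink_start", 0);

	printf("ret = %d\n", ret);		/* prints -5 */
	return 0;
}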
closure_wake_up(&c->freelist_wait); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int gc_btree_gens_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ struct bkey_i *u; -+ int ret; -+ -+ percpu_down_read(&c->mark_lock); -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (ptr_stale(ca, ptr) > 16) { -+ percpu_up_read(&c->mark_lock); -+ goto update; -+ } -+ } -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; -+ -+ if (gen_after(*gen, ptr->gen)) -+ *gen = ptr->gen; -+ } -+ percpu_up_read(&c->mark_lock); -+ return 0; -+update: -+ u = bch2_bkey_make_mut(trans, iter, &k, 0); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ return ret; -+ -+ bch2_extent_normalize(c, bkey_i_to_s(u)); -+ return 0; -+} -+ -+static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); -+ struct bkey_i_alloc_v4 *a_mut; -+ int ret; -+ -+ if (a->oldest_gen == ca->oldest_gen[iter->pos.offset]) -+ return 0; -+ -+ a_mut = bch2_alloc_to_v4_mut(trans, k); -+ ret = PTR_ERR_OR_ZERO(a_mut); -+ if (ret) -+ return ret; -+ -+ a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; -+ a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type); -+ -+ return bch2_trans_update(trans, iter, &a_mut->k_i, 0); -+} -+ -+int bch2_gc_gens(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_dev *ca; -+ u64 b, start_time = local_clock(); -+ unsigned i; -+ int ret; -+ -+ /* -+ * Ideally we would be using state_lock and not gc_lock here, but that -+ * introduces a deadlock in the RO path - we currently take the state -+ * lock at the start of going RO, thus the gc thread may get stuck: -+ */ -+ if (!mutex_trylock(&c->gc_gens_lock)) -+ return 0; -+ -+ trace_and_count(c, gc_gens_start, c); -+ down_read(&c->gc_lock); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_member_device(ca, c, i) { -+ struct bucket_gens *gens; -+ -+ BUG_ON(ca->oldest_gen); -+ -+ ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); -+ if (!ca->oldest_gen) { -+ percpu_ref_put(&ca->ref); -+ ret = -BCH_ERR_ENOMEM_gc_gens; -+ goto err; -+ } -+ -+ gens = bucket_gens(ca); -+ -+ for (b = gens->first_bucket; -+ b < gens->nbuckets; b++) -+ ca->oldest_gen[b] = gens->b[b]; -+ } -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (btree_type_has_ptrs(i)) { -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ -+ c->gc_gens_btree = i; -+ c->gc_gens_pos = POS_MIN; -+ ret = for_each_btree_key_commit(&trans, iter, i, -+ POS_MIN, -+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, -+ k, -+ NULL, NULL, -+ BTREE_INSERT_NOFAIL, -+ gc_btree_gens_key(&trans, &iter, k)); -+ if (ret && !bch2_err_matches(ret, EROFS)) -+ bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); -+ if (ret) -+ goto err; -+ } -+ -+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, -+ POS_MIN, -+ BTREE_ITER_PREFETCH, -+ k, -+ NULL, NULL, -+ BTREE_INSERT_NOFAIL, -+ bch2_alloc_write_oldest_gen(&trans, &iter, k)); -+ if (ret && !bch2_err_matches(ret, EROFS)) -+ bch_err(c, "error writing 
oldest_gen: %s", bch2_err_str(ret)); -+ if (ret) -+ goto err; -+ -+ c->gc_gens_btree = 0; -+ c->gc_gens_pos = POS_MIN; -+ -+ c->gc_count++; -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); -+ trace_and_count(c, gc_gens_end, c); -+err: -+ for_each_member_device(ca, c, i) { -+ kvfree(ca->oldest_gen); -+ ca->oldest_gen = NULL; -+ } -+ -+ bch2_trans_exit(&trans); -+ up_read(&c->gc_lock); -+ mutex_unlock(&c->gc_gens_lock); -+ return ret; -+} -+ -+static int bch2_gc_thread(void *arg) -+{ -+ struct bch_fs *c = arg; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ unsigned long last = atomic64_read(&clock->now); -+ unsigned last_kick = atomic_read(&c->kick_gc); -+ int ret; -+ -+ set_freezable(); -+ -+ while (1) { -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ if (kthread_should_stop()) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ if (atomic_read(&c->kick_gc) != last_kick) -+ break; -+ -+ if (c->btree_gc_periodic) { -+ unsigned long next = last + c->capacity / 16; -+ -+ if (atomic64_read(&clock->now) >= next) -+ break; -+ -+ bch2_io_clock_schedule_timeout(clock, next); -+ } else { -+ schedule(); -+ } -+ -+ try_to_freeze(); -+ } -+ __set_current_state(TASK_RUNNING); -+ -+ last = atomic64_read(&clock->now); -+ last_kick = atomic_read(&c->kick_gc); -+ -+ /* -+ * Full gc is currently incompatible with btree key cache: -+ */ -+#if 0 -+ ret = bch2_gc(c, false, false); -+#else -+ ret = bch2_gc_gens(c); -+#endif -+ if (ret < 0) -+ bch_err(c, "btree gc failed: %s", bch2_err_str(ret)); -+ -+ debug_check_no_locks_held(); -+ } -+ -+ return 0; -+} -+ -+void bch2_gc_thread_stop(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ p = c->gc_thread; -+ c->gc_thread = NULL; -+ -+ if (p) { -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+int bch2_gc_thread_start(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ if (c->gc_thread) -+ return 0; -+ -+ p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); -+ if (IS_ERR(p)) { -+ bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p))); -+ return PTR_ERR(p); -+ } -+ -+ get_task_struct(p); -+ c->gc_thread = p; -+ wake_up_process(p); -+ return 0; -+} -diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h -new file mode 100644 -index 000000000..607575f83 ---- /dev/null -+++ b/fs/bcachefs/btree_gc.h -@@ -0,0 +1,114 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_GC_H -+#define _BCACHEFS_BTREE_GC_H -+ -+#include "bkey.h" -+#include "btree_types.h" -+ -+int bch2_check_topology(struct bch_fs *); -+int bch2_gc(struct bch_fs *, bool, bool); -+int bch2_gc_gens(struct bch_fs *); -+void bch2_gc_thread_stop(struct bch_fs *); -+int bch2_gc_thread_start(struct bch_fs *); -+ -+/* -+ * For concurrent mark and sweep (with other index updates), we define a total -+ * ordering of _all_ references GC walks: -+ * -+ * Note that some references will have the same GC position as others - e.g. -+ * everything within the same btree node; in those cases we're relying on -+ * whatever locking exists for where those references live, i.e. the write lock -+ * on a btree node. -+ * -+ * That locking is also required to ensure GC doesn't pass the updater in -+ * between the updater adding/removing the reference and updating the GC marks; -+ * without that, we would at best double count sometimes. -+ * -+ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ -+ * be held that prevents GC from passing the position the updater is at. 
-+ * -+ * (What about the start of gc, when we're clearing all the marks? GC clears the -+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc -+ * position inside its cmpxchg loop, so crap magically works). -+ */ -+ -+/* Position of (the start of) a gc phase: */ -+static inline struct gc_pos gc_phase(enum gc_phase phase) -+{ -+ return (struct gc_pos) { -+ .phase = phase, -+ .pos = POS_MIN, -+ .level = 0, -+ }; -+} -+ -+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -+{ -+ return cmp_int(l.phase, r.phase) ?: -+ bpos_cmp(l.pos, r.pos) ?: -+ cmp_int(l.level, r.level); -+} -+ -+static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) -+{ -+ switch (id) { -+#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; -+ BCH_BTREE_IDS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline struct gc_pos gc_pos_btree(enum btree_id id, -+ struct bpos pos, unsigned level) -+{ -+ return (struct gc_pos) { -+ .phase = btree_id_to_gc_phase(id), -+ .pos = pos, -+ .level = level, -+ }; -+} -+ -+/* -+ * GC position of the pointers within a btree node: note, _not_ for &b->key -+ * itself, that lives in the parent node: -+ */ -+static inline struct gc_pos gc_pos_btree_node(struct btree *b) -+{ -+ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); -+} -+ -+/* -+ * GC position of the pointer to a btree root: we don't use -+ * gc_pos_pointer_to_btree_node() here to avoid a potential race with -+ * btree_split() increasing the tree depth - the new root will have level > the -+ * old root and thus have a greater gc position than the old root, but that -+ * would be incorrect since once gc has marked the root it's not coming back. -+ */ -+static inline struct gc_pos gc_pos_btree_root(enum btree_id id) -+{ -+ return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH); -+} -+ -+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) -+{ -+ unsigned seq; -+ bool ret; -+ -+ do { -+ seq = read_seqcount_begin(&c->gc_pos_lock); -+ ret = gc_pos_cmp(pos, c->gc_pos) <= 0; -+ } while (read_seqcount_retry(&c->gc_pos_lock, seq)); -+ -+ return ret; -+} -+ -+static inline void bch2_do_gc_gens(struct bch_fs *c) -+{ -+ atomic_inc(&c->kick_gc); -+ if (c->gc_thread) -+ wake_up_process(c->gc_thread); -+} -+ -+#endif /* _BCACHEFS_BTREE_GC_H */ -diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c -new file mode 100644 -index 000000000..cba3c081b ---- /dev/null -+++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,2245 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "bkey_sort.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "debug.h" -+#include "error.h" -+#include "extents.h" -+#include "io.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "recovery.h" -+#include "super-io.h" -+#include "trace.h" -+ -+#include -+ -+void bch2_btree_node_io_unlock(struct btree *b) -+{ -+ EBUG_ON(!btree_node_write_in_flight(b)); -+ -+ clear_btree_node_write_in_flight_inner(b); -+ clear_btree_node_write_in_flight(b); -+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -+} -+ -+void bch2_btree_node_io_lock(struct btree *b) -+{ -+ bch2_assert_btree_nodes_not_locked(); -+ -+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, -+ TASK_UNINTERRUPTIBLE); -+} -+ -+void 
__bch2_btree_node_wait_on_read(struct btree *b) -+{ -+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, -+ TASK_UNINTERRUPTIBLE); -+} -+ -+void __bch2_btree_node_wait_on_write(struct btree *b) -+{ -+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, -+ TASK_UNINTERRUPTIBLE); -+} -+ -+void bch2_btree_node_wait_on_read(struct btree *b) -+{ -+ bch2_assert_btree_nodes_not_locked(); -+ -+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, -+ TASK_UNINTERRUPTIBLE); -+} -+ -+void bch2_btree_node_wait_on_write(struct btree *b) -+{ -+ bch2_assert_btree_nodes_not_locked(); -+ -+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, -+ TASK_UNINTERRUPTIBLE); -+} -+ -+static void verify_no_dups(struct btree *b, -+ struct bkey_packed *start, -+ struct bkey_packed *end) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bkey_packed *k, *p; -+ -+ if (start == end) -+ return; -+ -+ for (p = start, k = bkey_p_next(start); -+ k != end; -+ p = k, k = bkey_p_next(k)) { -+ struct bkey l = bkey_unpack_key(b, p); -+ struct bkey r = bkey_unpack_key(b, k); -+ -+ BUG_ON(bpos_ge(l.p, bkey_start_pos(&r))); -+ } -+#endif -+} -+ -+static void set_needs_whiteout(struct bset *i, int v) -+{ -+ struct bkey_packed *k; -+ -+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) -+ k->needs_whiteout = v; -+} -+ -+static void btree_bounce_free(struct bch_fs *c, size_t size, -+ bool used_mempool, void *p) -+{ -+ if (used_mempool) -+ mempool_free(p, &c->btree_bounce_pool); -+ else -+ vpfree(p, size); -+} -+ -+static void *btree_bounce_alloc(struct bch_fs *c, size_t size, -+ bool *used_mempool) -+{ -+ unsigned flags = memalloc_nofs_save(); -+ void *p; -+ -+ BUG_ON(size > btree_bytes(c)); -+ -+ *used_mempool = false; -+ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); -+ if (!p) { -+ *used_mempool = true; -+ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); -+ } -+ memalloc_nofs_restore(flags); -+ return p; -+} -+ -+static void sort_bkey_ptrs(const struct btree *bt, -+ struct bkey_packed **ptrs, unsigned nr) -+{ -+ unsigned n = nr, a = nr / 2, b, c, d; -+ -+ if (!a) -+ return; -+ -+ /* Heap sort: see lib/sort.c: */ -+ while (1) { -+ if (a) -+ a--; -+ else if (--n) -+ swap(ptrs[0], ptrs[n]); -+ else -+ break; -+ -+ for (b = a; c = 2 * b + 1, (d = c + 1) < n;) -+ b = bch2_bkey_cmp_packed(bt, -+ ptrs[c], -+ ptrs[d]) >= 0 ? 
c : d; -+ if (d == n) -+ b = c; -+ -+ while (b != a && -+ bch2_bkey_cmp_packed(bt, -+ ptrs[a], -+ ptrs[b]) >= 0) -+ b = (b - 1) / 2; -+ c = b; -+ while (b != a) { -+ b = (b - 1) / 2; -+ swap(ptrs[b], ptrs[c]); -+ } -+ } -+} -+ -+static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) -+{ -+ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; -+ bool used_mempool = false; -+ size_t bytes = b->whiteout_u64s * sizeof(u64); -+ -+ if (!b->whiteout_u64s) -+ return; -+ -+ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); -+ -+ ptrs = ptrs_end = ((void *) new_whiteouts + bytes); -+ -+ for (k = unwritten_whiteouts_start(c, b); -+ k != unwritten_whiteouts_end(c, b); -+ k = bkey_p_next(k)) -+ *--ptrs = k; -+ -+ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); -+ -+ k = new_whiteouts; -+ -+ while (ptrs != ptrs_end) { -+ bkey_copy(k, *ptrs); -+ k = bkey_p_next(k); -+ ptrs++; -+ } -+ -+ verify_no_dups(b, new_whiteouts, -+ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); -+ -+ memcpy_u64s(unwritten_whiteouts_start(c, b), -+ new_whiteouts, b->whiteout_u64s); -+ -+ btree_bounce_free(c, bytes, used_mempool, new_whiteouts); -+} -+ -+static bool should_compact_bset(struct btree *b, struct bset_tree *t, -+ bool compacting, enum compact_mode mode) -+{ -+ if (!bset_dead_u64s(b, t)) -+ return false; -+ -+ switch (mode) { -+ case COMPACT_LAZY: -+ return should_compact_bset_lazy(b, t) || -+ (compacting && !bset_written(b, bset(b, t))); -+ case COMPACT_ALL: -+ return true; -+ default: -+ BUG(); -+ } -+} -+ -+static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) -+{ -+ struct bset_tree *t; -+ bool ret = false; -+ -+ for_each_bset(b, t) { -+ struct bset *i = bset(b, t); -+ struct bkey_packed *k, *n, *out, *start, *end; -+ struct btree_node_entry *src = NULL, *dst = NULL; -+ -+ if (t != b->set && !bset_written(b, i)) { -+ src = container_of(i, struct btree_node_entry, keys); -+ dst = max(write_block(b), -+ (void *) btree_bkey_last(b, t - 1)); -+ } -+ -+ if (src != dst) -+ ret = true; -+ -+ if (!should_compact_bset(b, t, ret, mode)) { -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src) + -+ le16_to_cpu(src->keys.u64s) * -+ sizeof(u64)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ continue; -+ } -+ -+ start = btree_bkey_first(b, t); -+ end = btree_bkey_last(b, t); -+ -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ -+ out = i->start; -+ -+ for (k = start; k != end; k = n) { -+ n = bkey_p_next(k); -+ -+ if (!bkey_deleted(k)) { -+ bkey_copy(out, k); -+ out = bkey_p_next(out); -+ } else { -+ BUG_ON(k->needs_whiteout); -+ } -+ } -+ -+ i->u64s = cpu_to_le16((u64 *) out - i->_data); -+ set_btree_bset_end(b, t); -+ bch2_bset_set_no_aux_tree(b, t); -+ ret = true; -+ } -+ -+ bch2_verify_btree_nr_keys(b); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ return ret; -+} -+ -+bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, -+ enum compact_mode mode) -+{ -+ return bch2_drop_whiteouts(b, mode); -+} -+ -+static void btree_node_sort(struct bch_fs *c, struct btree *b, -+ unsigned start_idx, -+ unsigned end_idx, -+ bool filter_whiteouts) -+{ -+ struct btree_node *out; -+ struct sort_iter sort_iter; -+ struct bset_tree *t; -+ struct bset *start_bset = bset(b, &b->set[start_idx]); -+ bool used_mempool = false; -+ u64 start_time, seq = 0; -+ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; -+ bool sorting_entire_node = start_idx == 0 && -+ end_idx == b->nsets; -+ -+ sort_iter_init(&sort_iter, b); -+ 
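/*
 * [Editorial aside -- illustration only, not part of the patch content.]
 * sort_bkey_ptrs() above is a heap sort over an array of key pointers, using
 * the bottom-up sift variant referenced by its "see lib/sort.c" comment.
 * The same idea on plain ints, in the simpler textbook sift-down form
 * (build a max-heap, then repeatedly swap the max to the end and restore the
 * heap); function names here are mine:
 */
#include <stddef.h>
#include <stdio.h>

static void sift_down(int *v, size_t root, size_t end)
{
	while (2 * root + 1 < end) {
		size_t child = 2 * root + 1;

		if (child + 1 < end && v[child] < v[child + 1])
			child++;			/* pick the larger child */
		if (v[root] >= v[child])
			break;				/* heap property holds   */
		int tmp = v[root]; v[root] = v[child]; v[child] = tmp;
		root = child;
	}
}

static void heapsort_ints(int *v, size_t n)
{
	size_t i;

	for (i = n / 2; i-- > 0;)			/* build a max-heap      */
		sift_down(v, i, n);
	for (i = n; i-- > 1;) {				/* extract max, shrink   */
		int tmp = v[0]; v[0] = v[i]; v[i] = tmp;
		sift_down(v, 0, i);
	}
}

int main(void)
{
	int v[] = { 5, 1, 4, 2, 3 };
	size_t i;

	heapsort_ints(v, sizeof(v) / sizeof(v[0]));
	for (i = 0; i < sizeof(v) / sizeof(v[0]); i++)
		printf("%d ", v[i]);			/* 1 2 3 4 5 */
	printf("\n");
	return 0;
}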
-+ for (t = b->set + start_idx; -+ t < b->set + end_idx; -+ t++) { -+ u64s += le16_to_cpu(bset(b, t)->u64s); -+ sort_iter_add(&sort_iter, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ } -+ -+ bytes = sorting_entire_node -+ ? btree_bytes(c) -+ : __vstruct_bytes(struct btree_node, u64s); -+ -+ out = btree_bounce_alloc(c, bytes, &used_mempool); -+ -+ start_time = local_clock(); -+ -+ u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts); -+ -+ out->keys.u64s = cpu_to_le16(u64s); -+ -+ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); -+ -+ if (sorting_entire_node) -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], -+ start_time); -+ -+ /* Make sure we preserve bset journal_seq: */ -+ for (t = b->set + start_idx; t < b->set + end_idx; t++) -+ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); -+ start_bset->journal_seq = cpu_to_le64(seq); -+ -+ if (sorting_entire_node) { -+ unsigned u64s = le16_to_cpu(out->keys.u64s); -+ -+ BUG_ON(bytes != btree_bytes(c)); -+ -+ /* -+ * Our temporary buffer is the same size as the btree node's -+ * buffer, we can just swap buffers instead of doing a big -+ * memcpy() -+ */ -+ *out = *b->data; -+ out->keys.u64s = cpu_to_le16(u64s); -+ swap(out, b->data); -+ set_btree_bset(b, b->set, &b->data->keys); -+ } else { -+ start_bset->u64s = out->keys.u64s; -+ memcpy_u64s(start_bset->start, -+ out->keys.start, -+ le16_to_cpu(out->keys.u64s)); -+ } -+ -+ for (i = start_idx + 1; i < end_idx; i++) -+ b->nr.bset_u64s[start_idx] += -+ b->nr.bset_u64s[i]; -+ -+ b->nsets -= shift; -+ -+ for (i = start_idx + 1; i < b->nsets; i++) { -+ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; -+ b->set[i] = b->set[i + shift]; -+ } -+ -+ for (i = b->nsets; i < MAX_BSETS; i++) -+ b->nr.bset_u64s[i] = 0; -+ -+ set_btree_bset_end(b, &b->set[start_idx]); -+ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); -+ -+ btree_bounce_free(c, bytes, used_mempool, out); -+ -+ bch2_verify_btree_nr_keys(b); -+} -+ -+void bch2_btree_sort_into(struct bch_fs *c, -+ struct btree *dst, -+ struct btree *src) -+{ -+ struct btree_nr_keys nr; -+ struct btree_node_iter src_iter; -+ u64 start_time = local_clock(); -+ -+ BUG_ON(dst->nsets != 1); -+ -+ bch2_bset_set_no_aux_tree(dst, dst->set); -+ -+ bch2_btree_node_iter_init_from_start(&src_iter, src); -+ -+ nr = bch2_sort_repack(btree_bset_first(dst), -+ src, &src_iter, -+ &dst->format, -+ true); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], -+ start_time); -+ -+ set_btree_bset_end(dst, dst->set); -+ -+ dst->nr.live_u64s += nr.live_u64s; -+ dst->nr.bset_u64s[0] += nr.bset_u64s[0]; -+ dst->nr.packed_keys += nr.packed_keys; -+ dst->nr.unpacked_keys += nr.unpacked_keys; -+ -+ bch2_verify_btree_nr_keys(dst); -+} -+ -+#define SORT_CRIT (4096 / sizeof(u64)) -+ -+/* -+ * We're about to add another bset to the btree node, so if there's currently -+ * too many bsets - sort some of them together: -+ */ -+static bool btree_node_compact(struct bch_fs *c, struct btree *b) -+{ -+ unsigned unwritten_idx; -+ bool ret = false; -+ -+ for (unwritten_idx = 0; -+ unwritten_idx < b->nsets; -+ unwritten_idx++) -+ if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) -+ break; -+ -+ if (b->nsets - unwritten_idx > 1) { -+ btree_node_sort(c, b, unwritten_idx, -+ b->nsets, false); -+ ret = true; -+ } -+ -+ if (unwritten_idx > 1) { -+ btree_node_sort(c, b, 0, unwritten_idx, false); -+ ret = true; -+ } -+ -+ return ret; -+} -+ -+void bch2_btree_build_aux_trees(struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ for_each_bset(b, 
t) -+ bch2_bset_build_aux_tree(b, t, -+ !bset_written(b, bset(b, t)) && -+ t == bset_tree_last(b)); -+} -+ -+/* -+ * If we have MAX_BSETS (3) bsets, should we sort them all down to just one? -+ * -+ * The first bset is going to be of similar order to the size of the node, the -+ * last bset is bounded by btree_write_set_buffer(), which is set to keep the -+ * memmove on insert from being too expensive: the middle bset should, ideally, -+ * be the geometric mean of the first and the last. -+ * -+ * Returns true if the middle bset is greater than that geometric mean: -+ */ -+static inline bool should_compact_all(struct bch_fs *c, struct btree *b) -+{ -+ unsigned mid_u64s_bits = -+ (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2; -+ -+ return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits; -+} -+ -+/* -+ * @bch_btree_init_next - initialize a new (unwritten) bset that can then be -+ * inserted into -+ * -+ * Safe to call if there already is an unwritten bset - will only add a new bset -+ * if @b doesn't already have one. -+ * -+ * Returns true if we sorted (i.e. invalidated iterators -+ */ -+void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_node_entry *bne; -+ bool reinit_iter = false; -+ -+ EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]); -+ BUG_ON(bset_written(b, bset(b, &b->set[1]))); -+ BUG_ON(btree_node_just_written(b)); -+ -+ if (b->nsets == MAX_BSETS && -+ !btree_node_write_in_flight(b) && -+ should_compact_all(c, b)) { -+ bch2_btree_node_write(c, b, SIX_LOCK_write, -+ BTREE_WRITE_init_next_bset); -+ reinit_iter = true; -+ } -+ -+ if (b->nsets == MAX_BSETS && -+ btree_node_compact(c, b)) -+ reinit_iter = true; -+ -+ BUG_ON(b->nsets >= MAX_BSETS); -+ -+ bne = want_new_bset(c, b); -+ if (bne) -+ bch2_bset_init_next(c, b, bne); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ if (reinit_iter) -+ bch2_trans_node_reinit_iter(trans, b); -+} -+ -+static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, -+ struct btree *b) -+{ -+ prt_printf(out, "%s level %u/%u\n ", -+ bch2_btree_ids[b->c.btree_id], -+ b->c.level, -+ bch2_btree_id_root(c, b->c.btree_id)->level); -+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -+} -+ -+static void btree_err_msg(struct printbuf *out, struct bch_fs *c, -+ struct bch_dev *ca, -+ struct btree *b, struct bset *i, -+ unsigned offset, int write) -+{ -+ prt_printf(out, bch2_log_msg(c, "%s"), -+ write == READ -+ ? "error validating btree node " -+ : "corrupt btree node before write "); -+ if (ca) -+ prt_printf(out, "on %s ", ca->name); -+ prt_printf(out, "at btree "); -+ btree_pos_to_text(out, c, b); -+ -+ prt_printf(out, "\n node offset %u", b->written); -+ if (i) -+ prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); -+ prt_str(out, ": "); -+} -+ -+static int __btree_err(int ret, -+ struct bch_fs *c, -+ struct bch_dev *ca, -+ struct btree *b, -+ struct bset *i, -+ int write, -+ bool have_retry, -+ const char *fmt, ...) -+{ -+ struct printbuf out = PRINTBUF; -+ va_list args; -+ -+ btree_err_msg(&out, c, ca, b, i, b->written, write); -+ -+ va_start(args, fmt); -+ prt_vprintf(&out, fmt, args); -+ va_end(args); -+ -+ if (write == WRITE) { -+ bch2_print_string_as_lines(KERN_ERR, out.buf); -+ ret = c->opts.errors == BCH_ON_ERROR_continue -+ ? 
0 -+ : -BCH_ERR_fsck_errors_not_fixed; -+ goto out; -+ } -+ -+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) -+ ret = -BCH_ERR_btree_node_read_err_fixable; -+ if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) -+ ret = -BCH_ERR_btree_node_read_err_bad_node; -+ -+ switch (ret) { -+ case -BCH_ERR_btree_node_read_err_fixable: -+ mustfix_fsck_err(c, "%s", out.buf); -+ ret = -BCH_ERR_fsck_fix; -+ break; -+ case -BCH_ERR_btree_node_read_err_want_retry: -+ case -BCH_ERR_btree_node_read_err_must_retry: -+ bch2_print_string_as_lines(KERN_ERR, out.buf); -+ break; -+ case -BCH_ERR_btree_node_read_err_bad_node: -+ bch2_print_string_as_lines(KERN_ERR, out.buf); -+ bch2_topology_error(c); -+ ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; -+ break; -+ case -BCH_ERR_btree_node_read_err_incompatible: -+ bch2_print_string_as_lines(KERN_ERR, out.buf); -+ ret = -BCH_ERR_fsck_errors_not_fixed; -+ break; -+ default: -+ BUG(); -+ } -+out: -+fsck_err: -+ printbuf_exit(&out); -+ return ret; -+} -+ -+#define btree_err(type, c, ca, b, i, msg, ...) \ -+({ \ -+ int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ -+ \ -+ if (_ret != -BCH_ERR_fsck_fix) { \ -+ ret = _ret; \ -+ goto fsck_err; \ -+ } \ -+ \ -+ *saw_error = true; \ -+}) -+ -+#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) -+ -+/* -+ * When btree topology repair changes the start or end of a node, that might -+ * mean we have to drop keys that are no longer inside the node: -+ */ -+__cold -+void bch2_btree_node_drop_keys_outside_node(struct btree *b) -+{ -+ struct bset_tree *t; -+ struct bkey_s_c k; -+ struct bkey unpacked; -+ struct btree_node_iter iter; -+ -+ for_each_bset(b, t) { -+ struct bset *i = bset(b, t); -+ struct bkey_packed *k; -+ -+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) -+ if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) -+ break; -+ -+ if (k != i->start) { -+ unsigned shift = (u64 *) k - (u64 *) i->start; -+ -+ memmove_u64s_down(i->start, k, -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); -+ set_btree_bset_end(b, t); -+ } -+ -+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) -+ if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) -+ break; -+ -+ if (k != vstruct_last(i)) { -+ i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); -+ set_btree_bset_end(b, t); -+ } -+ } -+ -+ /* -+ * Always rebuild search trees: eytzinger search tree nodes directly -+ * depend on the values of min/max key: -+ */ -+ bch2_bset_set_no_aux_tree(b, b->set); -+ bch2_btree_build_aux_trees(b); -+ -+ for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { -+ BUG_ON(bpos_lt(k.k->p, b->data->min_key)); -+ BUG_ON(bpos_gt(k.k->p, b->data->max_key)); -+ } -+} -+ -+static int validate_bset(struct bch_fs *c, struct bch_dev *ca, -+ struct btree *b, struct bset *i, -+ unsigned offset, unsigned sectors, -+ int write, bool have_retry, bool *saw_error) -+{ -+ unsigned version = le16_to_cpu(i->version); -+ struct printbuf buf1 = PRINTBUF; -+ struct printbuf buf2 = PRINTBUF; -+ int ret = 0; -+ -+ btree_err_on(!bch2_version_compatible(version), -+ -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, -+ "unsupported bset version %u.%u", -+ BCH_VERSION_MAJOR(version), -+ BCH_VERSION_MINOR(version)); -+ -+ if (btree_err_on(version < c->sb.version_min, -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, -+ "bset version %u older than superblock version_min %u", -+ 
version, c->sb.version_min)) { -+ mutex_lock(&c->sb_lock); -+ c->disk_sb.sb->version_min = cpu_to_le16(version); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ if (btree_err_on(BCH_VERSION_MAJOR(version) > -+ BCH_VERSION_MAJOR(c->sb.version), -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, -+ "bset version %u newer than superblock version %u", -+ version, c->sb.version)) { -+ mutex_lock(&c->sb_lock); -+ c->disk_sb.sb->version = cpu_to_le16(version); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ btree_err_on(BSET_SEPARATE_WHITEOUTS(i), -+ -BCH_ERR_btree_node_read_err_incompatible, c, ca, b, i, -+ "BSET_SEPARATE_WHITEOUTS no longer supported"); -+ -+ if (btree_err_on(offset + sectors > btree_sectors(c), -+ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, -+ "bset past end of btree node")) { -+ i->u64s = 0; -+ ret = 0; -+ goto out; -+ } -+ -+ btree_err_on(offset && !i->u64s, -+ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, -+ "empty bset"); -+ -+ btree_err_on(BSET_OFFSET(i) && -+ BSET_OFFSET(i) != offset, -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, -+ "bset at wrong sector offset"); -+ -+ if (!offset) { -+ struct btree_node *bn = -+ container_of(i, struct btree_node, keys); -+ /* These indicate that we read the wrong btree node: */ -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ /* XXX endianness */ -+ btree_err_on(bp->seq != bn->keys.seq, -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, -+ "incorrect sequence number (wrong btree node)"); -+ } -+ -+ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, -+ "incorrect btree id"); -+ -+ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, -+ "incorrect level"); -+ -+ if (!write) -+ compat_btree_node(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, bn); -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ if (BTREE_PTR_RANGE_UPDATED(bp)) { -+ b->data->min_key = bp->min_key; -+ b->data->max_key = b->key.k.p; -+ } -+ -+ btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, -+ "incorrect min_key: got %s should be %s", -+ (printbuf_reset(&buf1), -+ bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), -+ (printbuf_reset(&buf2), -+ bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); -+ } -+ -+ btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, i, -+ "incorrect max key %s", -+ (printbuf_reset(&buf1), -+ bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); -+ -+ if (write) -+ compat_btree_node(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, bn); -+ -+ btree_err_on(bch2_bkey_format_invalid(c, &bn->format, write, &buf1), -+ -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i, -+ "invalid bkey format: %s\n %s", buf1.buf, -+ (printbuf_reset(&buf2), -+ bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); -+ printbuf_reset(&buf1); -+ -+ compat_bformat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &bn->format); -+ } -+out: -+fsck_err: -+ printbuf_exit(&buf2); -+ printbuf_exit(&buf1); -+ return ret; -+} -+ -+static int bset_key_invalid(struct bch_fs *c, struct btree *b, -+ struct bkey_s_c k, -+ bool updated_range, int rw, -+ struct printbuf *err) -+{ -+ return 
__bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: -+ (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: -+ (rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0); -+} -+ -+static int validate_bset_keys(struct bch_fs *c, struct btree *b, -+ struct bset *i, int write, -+ bool have_retry, bool *saw_error) -+{ -+ unsigned version = le16_to_cpu(i->version); -+ struct bkey_packed *k, *prev = NULL; -+ struct printbuf buf = PRINTBUF; -+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && -+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); -+ int ret = 0; -+ -+ for (k = i->start; -+ k != vstruct_last(i);) { -+ struct bkey_s u; -+ struct bkey tmp; -+ -+ if (btree_err_on(bkey_p_next(k) > vstruct_last(i), -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, -+ "key extends past end of bset")) { -+ i->u64s = cpu_to_le16((u64 *) k - i->_data); -+ break; -+ } -+ -+ if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, -+ "invalid bkey format %u", k->format)) { -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_p_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ continue; -+ } -+ -+ /* XXX: validate k->u64s */ -+ if (!write) -+ bch2_bkey_compat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &b->format, k); -+ -+ u = __bkey_disassemble(b, k, &tmp); -+ -+ printbuf_reset(&buf); -+ if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { -+ printbuf_reset(&buf); -+ prt_printf(&buf, "invalid bkey: "); -+ bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); -+ prt_printf(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, u.s_c); -+ -+ btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); -+ -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_p_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ continue; -+ } -+ -+ if (write) -+ bch2_bkey_compat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &b->format, k); -+ -+ if (prev && bkey_iter_cmp(b, prev, k) > 0) { -+ struct bkey up = bkey_unpack_key(b, prev); -+ -+ printbuf_reset(&buf); -+ prt_printf(&buf, "keys out of order: "); -+ bch2_bkey_to_text(&buf, &up); -+ prt_printf(&buf, " > "); -+ bch2_bkey_to_text(&buf, u.k); -+ -+ bch2_dump_bset(c, b, i, 0); -+ -+ if (btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf)) { -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_p_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ continue; -+ } -+ } -+ -+ prev = k; -+ k = bkey_p_next(k); -+ } -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, -+ struct btree *b, bool have_retry, bool *saw_error) -+{ -+ struct btree_node_entry *bne; -+ struct sort_iter *iter; -+ struct btree_node *sorted; -+ struct bkey_packed *k; -+ struct bch_extent_ptr *ptr; -+ struct bset *i; -+ bool used_mempool, blacklisted; -+ bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && -+ BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); -+ unsigned u64s; -+ unsigned blacklisted_written, nonblacklisted_written = 0; -+ unsigned ptr_written = btree_ptr_sectors_written(&b->key); -+ struct printbuf buf = PRINTBUF; -+ int ret = 0, retry_read = 0, write = READ; -+ -+ b->version_ondisk = U16_MAX; -+ /* We might get called multiple times on read retry: */ -+ b->written = 0; -+ -+ iter = 
mempool_alloc(&c->fill_iter, GFP_NOFS); -+ sort_iter_init(iter, b); -+ iter->size = (btree_blocks(c) + 1) * 2; -+ -+ if (bch2_meta_read_fault("btree")) -+ btree_err(-BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, -+ "dynamic fault"); -+ -+ btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, -+ "bad magic: want %llx, got %llx", -+ bset_magic(c), le64_to_cpu(b->data->magic)); -+ -+ btree_err_on(!b->data->keys.seq, -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, -+ "bad btree header: seq 0"); -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ btree_err_on(b->data->keys.seq != bp->seq, -+ -BCH_ERR_btree_node_read_err_must_retry, c, ca, b, NULL, -+ "got wrong btree node (seq %llx want %llx)", -+ b->data->keys.seq, bp->seq); -+ } -+ -+ while (b->written < (ptr_written ?: btree_sectors(c))) { -+ unsigned sectors; -+ struct nonce nonce; -+ struct bch_csum csum; -+ bool first = !b->written; -+ -+ if (!b->written) { -+ i = &b->data->keys; -+ -+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, -+ "unknown checksum type %llu", -+ BSET_CSUM_TYPE(i)); -+ -+ nonce = btree_nonce(i, b->written << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); -+ -+ btree_err_on(bch2_crc_cmp(csum, b->data->csum), -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, -+ "invalid checksum"); -+ -+ ret = bset_encrypt(c, i, b->written << 9); -+ if (bch2_fs_fatal_err_on(ret, c, -+ "error decrypting btree node: %i", ret)) -+ goto fsck_err; -+ -+ btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && -+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), -+ -BCH_ERR_btree_node_read_err_incompatible, c, NULL, b, NULL, -+ "btree node does not have NEW_EXTENT_OVERWRITE set"); -+ -+ sectors = vstruct_sectors(b->data, c->block_bits); -+ } else { -+ bne = write_block(b); -+ i = &bne->keys; -+ -+ if (i->seq != b->data->keys.seq) -+ break; -+ -+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, -+ "unknown checksum type %llu", -+ BSET_CSUM_TYPE(i)); -+ -+ nonce = btree_nonce(i, b->written << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ -+ btree_err_on(bch2_crc_cmp(csum, bne->csum), -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, i, -+ "invalid checksum"); -+ -+ ret = bset_encrypt(c, i, b->written << 9); -+ if (bch2_fs_fatal_err_on(ret, c, -+ "error decrypting btree node: %i\n", ret)) -+ goto fsck_err; -+ -+ sectors = vstruct_sectors(bne, c->block_bits); -+ } -+ -+ b->version_ondisk = min(b->version_ondisk, -+ le16_to_cpu(i->version)); -+ -+ ret = validate_bset(c, ca, b, i, b->written, sectors, -+ READ, have_retry, saw_error); -+ if (ret) -+ goto fsck_err; -+ -+ if (!b->written) -+ btree_node_set_format(b, b->data->format); -+ -+ ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); -+ if (ret) -+ goto fsck_err; -+ -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ blacklisted = bch2_journal_seq_is_blacklisted(c, -+ le64_to_cpu(i->journal_seq), -+ true); -+ -+ btree_err_on(blacklisted && first, -+ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, -+ "first btree node bset has blacklisted journal seq (%llu)", -+ le64_to_cpu(i->journal_seq)); -+ -+ btree_err_on(blacklisted && ptr_written, -+ -BCH_ERR_btree_node_read_err_fixable, c, ca, b, i, -+ "found blacklisted bset (journal seq 
%llu) in btree node at offset %u-%u/%u", -+ le64_to_cpu(i->journal_seq), -+ b->written, b->written + sectors, ptr_written); -+ -+ b->written += sectors; -+ -+ if (blacklisted && !first) -+ continue; -+ -+ sort_iter_add(iter, -+ vstruct_idx(i, 0), -+ vstruct_last(i)); -+ -+ nonblacklisted_written = b->written; -+ } -+ -+ if (ptr_written) { -+ btree_err_on(b->written < ptr_written, -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, -+ "btree node data missing: expected %u sectors, found %u", -+ ptr_written, b->written); -+ } else { -+ for (bne = write_block(b); -+ bset_byte_offset(b, bne) < btree_bytes(c); -+ bne = (void *) bne + block_bytes(c)) -+ btree_err_on(bne->keys.seq == b->data->keys.seq && -+ !bch2_journal_seq_is_blacklisted(c, -+ le64_to_cpu(bne->keys.journal_seq), -+ true), -+ -BCH_ERR_btree_node_read_err_want_retry, c, ca, b, NULL, -+ "found bset signature after last bset"); -+ -+ /* -+ * Blacklisted bsets are those that were written after the most recent -+ * (flush) journal write. Since there wasn't a flush, they may not have -+ * made it to all devices - which means we shouldn't write new bsets -+ * after them, as that could leave a gap and then reads from that device -+ * wouldn't find all the bsets in that btree node - which means it's -+ * important that we start writing new bsets after the most recent _non_ -+ * blacklisted bset: -+ */ -+ blacklisted_written = b->written; -+ b->written = nonblacklisted_written; -+ } -+ -+ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); -+ sorted->keys.u64s = 0; -+ -+ set_btree_bset(b, b->set, &b->data->keys); -+ -+ b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); -+ -+ u64s = le16_to_cpu(sorted->keys.u64s); -+ *sorted = *b->data; -+ sorted->keys.u64s = cpu_to_le16(u64s); -+ swap(sorted, b->data); -+ set_btree_bset(b, b->set, &b->data->keys); -+ b->nsets = 1; -+ -+ BUG_ON(b->nr.live_u64s != u64s); -+ -+ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); -+ -+ if (updated_range) -+ bch2_btree_node_drop_keys_outside_node(b); -+ -+ i = &b->data->keys; -+ for (k = i->start; k != vstruct_last(i);) { -+ struct bkey tmp; -+ struct bkey_s u = __bkey_disassemble(b, k, &tmp); -+ -+ printbuf_reset(&buf); -+ -+ if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || -+ (bch2_inject_invalid_keys && -+ !bversion_cmp(u.k->version, MAX_VERSION))) { -+ printbuf_reset(&buf); -+ -+ prt_printf(&buf, "invalid bkey: "); -+ bch2_bkey_val_invalid(c, u.s_c, READ, &buf); -+ prt_printf(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, u.s_c); -+ -+ btree_err(-BCH_ERR_btree_node_read_err_fixable, c, NULL, b, i, "%s", buf.buf); -+ -+ btree_keys_account_key_drop(&b->nr, 0, k); -+ -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_p_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ set_btree_bset_end(b, b->set); -+ continue; -+ } -+ -+ if (u.k->type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); -+ -+ bp.v->mem_ptr = 0; -+ } -+ -+ k = bkey_p_next(k); -+ } -+ -+ bch2_bset_build_aux_tree(b, b->set, false); -+ -+ set_needs_whiteout(btree_bset_first(b), true); -+ -+ btree_node_reset_sib_u64s(b); -+ -+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (ca->mi.state != BCH_MEMBER_STATE_rw) -+ set_btree_node_need_rewrite(b); -+ } -+ -+ if (!ptr_written) -+ set_btree_node_need_rewrite(b); -+out: -+ mempool_free(iter, &c->fill_iter); -+ printbuf_exit(&buf); -+ return retry_read; 
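The read loop above advances b->written by vstruct_sectors() for each bset it accepts, i.e. the bset's u64s count converted to bytes and rounded up to whole 512-byte sectors. Below is a minimal standalone sketch of that size arithmetic; the names (vstruct_example, example_sectors) are hypothetical and not part of the patch.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: a variable-length struct whose payload length is
 * carried as a count of u64s, as the bset/btree_node_entry headers do.
 */
struct vstruct_example {
	uint16_t u64s;		/* number of 8-byte words that follow */
	uint64_t data[];
};

static size_t example_bytes(const struct vstruct_example *v)
{
	return sizeof(*v) + (size_t) v->u64s * sizeof(uint64_t);
}

/*
 * Round bytes up to the block size (assumed to be a power of two),
 * then convert to 512-byte sectors.
 */
static unsigned example_sectors(size_t bytes, size_t block_bytes)
{
	size_t rounded = (bytes + block_bytes - 1) & ~(block_bytes - 1);

	return rounded >> 9;
}

int main(void)
{
	struct vstruct_example v = { .u64s = 100 };

	/* with 4 KiB blocks, 100 u64s of keys still rounds up to one block */
	printf("%u sectors\n", example_sectors(example_bytes(&v), 4096));
	return 0;
}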
-+fsck_err: -+ if (ret == -BCH_ERR_btree_node_read_err_want_retry || -+ ret == -BCH_ERR_btree_node_read_err_must_retry) -+ retry_read = 1; -+ else -+ set_btree_node_read_error(b); -+ goto out; -+} -+ -+static void btree_node_read_work(struct work_struct *work) -+{ -+ struct btree_read_bio *rb = -+ container_of(work, struct btree_read_bio, work); -+ struct bch_fs *c = rb->c; -+ struct btree *b = rb->b; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ struct bio *bio = &rb->bio; -+ struct bch_io_failures failed = { .nr = 0 }; -+ struct printbuf buf = PRINTBUF; -+ bool saw_error = false; -+ bool retry = false; -+ bool can_retry; -+ -+ goto start; -+ while (1) { -+ retry = true; -+ bch_info(c, "retrying read"); -+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ rb->have_ioref = bch2_dev_get_ioref(ca, READ); -+ bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); -+ bio->bi_iter.bi_sector = rb->pick.ptr.offset; -+ bio->bi_iter.bi_size = btree_bytes(c); -+ -+ if (rb->have_ioref) { -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ submit_bio_wait(bio); -+ } else { -+ bio->bi_status = BLK_STS_REMOVED; -+ } -+start: -+ printbuf_reset(&buf); -+ btree_pos_to_text(&buf, c, b); -+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", -+ bch2_blk_status_to_str(bio->bi_status), buf.buf); -+ if (rb->have_ioref) -+ percpu_ref_put(&ca->io_ref); -+ rb->have_ioref = false; -+ -+ bch2_mark_io_failure(&failed, &rb->pick); -+ -+ can_retry = bch2_bkey_pick_read_device(c, -+ bkey_i_to_s_c(&b->key), -+ &failed, &rb->pick) > 0; -+ -+ if (!bio->bi_status && -+ !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { -+ if (retry) -+ bch_info(c, "retry success"); -+ break; -+ } -+ -+ saw_error = true; -+ -+ if (!can_retry) { -+ set_btree_node_read_error(b); -+ break; -+ } -+ } -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], -+ rb->start_time); -+ bio_put(&rb->bio); -+ printbuf_exit(&buf); -+ -+ if (saw_error && !btree_node_read_error(b)) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bpos_to_text(&buf, b->key.k.p); -+ bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", -+ __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf); -+ printbuf_exit(&buf); -+ -+ bch2_btree_node_rewrite_async(c, b); -+ } -+ -+ clear_btree_node_read_in_flight(b); -+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -+} -+ -+static void btree_node_read_endio(struct bio *bio) -+{ -+ struct btree_read_bio *rb = -+ container_of(bio, struct btree_read_bio, bio); -+ struct bch_fs *c = rb->c; -+ -+ if (rb->have_ioref) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ -+ bch2_latency_acct(ca, rb->start_time, READ); -+ } -+ -+ queue_work(c->io_complete_wq, &rb->work); -+} -+ -+struct btree_node_read_all { -+ struct closure cl; -+ struct bch_fs *c; -+ struct btree *b; -+ unsigned nr; -+ void *buf[BCH_REPLICAS_MAX]; -+ struct bio *bio[BCH_REPLICAS_MAX]; -+ blk_status_t err[BCH_REPLICAS_MAX]; -+}; -+ -+static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) -+{ -+ struct btree_node *bn = data; -+ struct btree_node_entry *bne; -+ unsigned offset = 0; -+ -+ if (le64_to_cpu(bn->magic) != bset_magic(c)) -+ return 0; -+ -+ while (offset < btree_sectors(c)) { -+ if (!offset) { -+ offset += vstruct_sectors(bn, c->block_bits); -+ } else { -+ bne = data + (offset << 9); -+ if (bne->keys.seq != bn->keys.seq) -+ break; -+ offset += vstruct_sectors(bne, c->block_bits); -+ } -+ } -+ -+ return offset; -+} -+ -+static bool 
btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data) -+{ -+ struct btree_node *bn = data; -+ struct btree_node_entry *bne; -+ -+ if (!offset) -+ return false; -+ -+ while (offset < btree_sectors(c)) { -+ bne = data + (offset << 9); -+ if (bne->keys.seq == bn->keys.seq) -+ return true; -+ offset++; -+ } -+ -+ return false; -+ return offset; -+} -+ -+static void btree_node_read_all_replicas_done(struct closure *cl) -+{ -+ struct btree_node_read_all *ra = -+ container_of(cl, struct btree_node_read_all, cl); -+ struct bch_fs *c = ra->c; -+ struct btree *b = ra->b; -+ struct printbuf buf = PRINTBUF; -+ bool dump_bset_maps = false; -+ bool have_retry = false; -+ int ret = 0, best = -1, write = READ; -+ unsigned i, written = 0, written2 = 0; -+ __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 -+ ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; -+ bool _saw_error = false, *saw_error = &_saw_error; -+ -+ for (i = 0; i < ra->nr; i++) { -+ struct btree_node *bn = ra->buf[i]; -+ -+ if (ra->err[i]) -+ continue; -+ -+ if (le64_to_cpu(bn->magic) != bset_magic(c) || -+ (seq && seq != bn->keys.seq)) -+ continue; -+ -+ if (best < 0) { -+ best = i; -+ written = btree_node_sectors_written(c, bn); -+ continue; -+ } -+ -+ written2 = btree_node_sectors_written(c, ra->buf[i]); -+ if (btree_err_on(written2 != written, -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, -+ "btree node sectors written mismatch: %u != %u", -+ written, written2) || -+ btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, -+ "found bset signature after last bset") || -+ btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), -+ -BCH_ERR_btree_node_read_err_fixable, c, NULL, b, NULL, -+ "btree node replicas content mismatch")) -+ dump_bset_maps = true; -+ -+ if (written2 > written) { -+ written = written2; -+ best = i; -+ } -+ } -+fsck_err: -+ if (dump_bset_maps) { -+ for (i = 0; i < ra->nr; i++) { -+ struct btree_node *bn = ra->buf[i]; -+ struct btree_node_entry *bne = NULL; -+ unsigned offset = 0, sectors; -+ bool gap = false; -+ -+ if (ra->err[i]) -+ continue; -+ -+ printbuf_reset(&buf); -+ -+ while (offset < btree_sectors(c)) { -+ if (!offset) { -+ sectors = vstruct_sectors(bn, c->block_bits); -+ } else { -+ bne = ra->buf[i] + (offset << 9); -+ if (bne->keys.seq != bn->keys.seq) -+ break; -+ sectors = vstruct_sectors(bne, c->block_bits); -+ } -+ -+ prt_printf(&buf, " %u-%u", offset, offset + sectors); -+ if (bne && bch2_journal_seq_is_blacklisted(c, -+ le64_to_cpu(bne->keys.journal_seq), false)) -+ prt_printf(&buf, "*"); -+ offset += sectors; -+ } -+ -+ while (offset < btree_sectors(c)) { -+ bne = ra->buf[i] + (offset << 9); -+ if (bne->keys.seq == bn->keys.seq) { -+ if (!gap) -+ prt_printf(&buf, " GAP"); -+ gap = true; -+ -+ sectors = vstruct_sectors(bne, c->block_bits); -+ prt_printf(&buf, " %u-%u", offset, offset + sectors); -+ if (bch2_journal_seq_is_blacklisted(c, -+ le64_to_cpu(bne->keys.journal_seq), false)) -+ prt_printf(&buf, "*"); -+ } -+ offset++; -+ } -+ -+ bch_err(c, "replica %u:%s", i, buf.buf); -+ } -+ } -+ -+ if (best >= 0) { -+ memcpy(b->data, ra->buf[best], btree_bytes(c)); -+ ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); -+ } else { -+ ret = -1; -+ } -+ -+ if (ret) -+ set_btree_node_read_error(b); -+ else if (*saw_error) -+ bch2_btree_node_rewrite_async(c, b); -+ -+ for (i = 0; i < ra->nr; i++) { -+ mempool_free(ra->buf[i], &c->btree_bounce_pool); -+ bio_put(ra->bio[i]); -+ } -+ -+ 
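btree_node_read_all_replicas_done() above keeps the replica with the most valid sectors written and flags replicas whose contents disagree. A simplified standalone sketch of that selection follows, using a hypothetical replica_result array rather than the real btree_node_read_all state; it is not part of the patch.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical per-replica read result, for illustration only. */
struct replica_result {
	bool		io_error;	/* the read failed entirely */
	unsigned	sectors;	/* valid sectors found in this copy */
};

/*
 * Pick the replica with the most valid data; return -1 if none are usable.
 * A real implementation would also compare the contents of the copies and
 * schedule a rewrite when they disagree.
 */
static int pick_best_replica(const struct replica_result *r, unsigned nr)
{
	int best = -1;

	for (unsigned i = 0; i < nr; i++) {
		if (r[i].io_error || !r[i].sectors)
			continue;
		if (best < 0 || r[i].sectors > r[best].sectors)
			best = i;
	}
	return best;
}

int main(void)
{
	struct replica_result r[] = {
		{ .io_error = false, .sectors = 24 },
		{ .io_error = true,  .sectors = 0  },
		{ .io_error = false, .sectors = 32 },
	};

	printf("best replica: %d\n", pick_best_replica(r, 3));	/* prints 2 */
	return 0;
}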
closure_debug_destroy(&ra->cl); -+ kfree(ra); -+ printbuf_exit(&buf); -+ -+ clear_btree_node_read_in_flight(b); -+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -+} -+ -+static void btree_node_read_all_replicas_endio(struct bio *bio) -+{ -+ struct btree_read_bio *rb = -+ container_of(bio, struct btree_read_bio, bio); -+ struct bch_fs *c = rb->c; -+ struct btree_node_read_all *ra = rb->ra; -+ -+ if (rb->have_ioref) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ -+ bch2_latency_acct(ca, rb->start_time, READ); -+ } -+ -+ ra->err[rb->idx] = bio->bi_status; -+ closure_put(&ra->cl); -+} -+ -+/* -+ * XXX This allocates multiple times from the same mempools, and can deadlock -+ * under sufficient memory pressure (but is only a debug path) -+ */ -+static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) -+{ -+ struct bkey_s_c k = bkey_i_to_s_c(&b->key); -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded pick; -+ struct btree_node_read_all *ra; -+ unsigned i; -+ -+ ra = kzalloc(sizeof(*ra), GFP_NOFS); -+ if (!ra) -+ return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; -+ -+ closure_init(&ra->cl, NULL); -+ ra->c = c; -+ ra->b = b; -+ ra->nr = bch2_bkey_nr_ptrs(k); -+ -+ for (i = 0; i < ra->nr; i++) { -+ ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); -+ ra->bio[i] = bio_alloc_bioset(NULL, -+ buf_pages(ra->buf[i], btree_bytes(c)), -+ REQ_OP_READ|REQ_SYNC|REQ_META, -+ GFP_NOFS, -+ &c->btree_bio); -+ } -+ -+ i = 0; -+ bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ struct btree_read_bio *rb = -+ container_of(ra->bio[i], struct btree_read_bio, bio); -+ rb->c = c; -+ rb->b = b; -+ rb->ra = ra; -+ rb->start_time = local_clock(); -+ rb->have_ioref = bch2_dev_get_ioref(ca, READ); -+ rb->idx = i; -+ rb->pick = pick; -+ rb->bio.bi_iter.bi_sector = pick.ptr.offset; -+ rb->bio.bi_end_io = btree_node_read_all_replicas_endio; -+ bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); -+ -+ if (rb->have_ioref) { -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], -+ bio_sectors(&rb->bio)); -+ bio_set_dev(&rb->bio, ca->disk_sb.bdev); -+ -+ closure_get(&ra->cl); -+ submit_bio(&rb->bio); -+ } else { -+ ra->err[i] = BLK_STS_REMOVED; -+ } -+ -+ i++; -+ } -+ -+ if (sync) { -+ closure_sync(&ra->cl); -+ btree_node_read_all_replicas_done(&ra->cl); -+ } else { -+ continue_at(&ra->cl, btree_node_read_all_replicas_done, -+ c->io_complete_wq); -+ } -+ -+ return 0; -+} -+ -+void bch2_btree_node_read(struct bch_fs *c, struct btree *b, -+ bool sync) -+{ -+ struct extent_ptr_decoded pick; -+ struct btree_read_bio *rb; -+ struct bch_dev *ca; -+ struct bio *bio; -+ int ret; -+ -+ trace_and_count(c, btree_node_read, c, b); -+ -+ if (bch2_verify_all_btree_replicas && -+ !btree_node_read_all_replicas(c, b, sync)) -+ return; -+ -+ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), -+ NULL, &pick); -+ -+ if (ret <= 0) { -+ struct printbuf buf = PRINTBUF; -+ -+ prt_str(&buf, "btree node read error: no device to read from\n at "); -+ btree_pos_to_text(&buf, c, b); -+ bch_err(c, "%s", buf.buf); -+ -+ if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && -+ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) -+ bch2_fatal_error(c); -+ -+ set_btree_node_read_error(b); -+ clear_btree_node_read_in_flight(b); -+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -+ printbuf_exit(&buf); -+ return; -+ 
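btree_node_read_all_replicas() above takes one closure reference per submitted bio; each endio drops its reference, and the "all replicas done" function runs only when the last reference is released. The sketch below shows the same reference-counted completion pattern with C11 atomics; it is an illustration of the idea, not the kernel closure API used in the patch.

#include <stdatomic.h>
#include <stdio.h>

/* Illustrative completion object: outstanding I/O count plus a callback. */
struct completion {
	atomic_int	remaining;
	void		(*done)(void *);
	void		*arg;
};

static void completion_get(struct completion *c)
{
	atomic_fetch_add(&c->remaining, 1);
}

static void completion_put(struct completion *c)
{
	/* whoever drops the last reference runs the callback */
	if (atomic_fetch_sub(&c->remaining, 1) == 1)
		c->done(c->arg);
}

static void all_reads_done(void *arg)
{
	printf("all %u replica reads completed\n", *(unsigned *) arg);
}

int main(void)
{
	unsigned nr = 3;
	struct completion c = { .done = all_reads_done, .arg = &nr };

	atomic_init(&c.remaining, 1);		/* the submitter's own reference */

	for (unsigned i = 0; i < nr; i++) {
		completion_get(&c);		/* one reference per submitted read */
		/* ...submit the read; its end_io would call completion_put()... */
		completion_put(&c);		/* here: simulate the read finishing */
	}

	completion_put(&c);			/* drop the submitter's reference */
	return 0;
}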
} -+ -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ -+ bio = bio_alloc_bioset(NULL, -+ buf_pages(b->data, btree_bytes(c)), -+ REQ_OP_READ|REQ_SYNC|REQ_META, -+ GFP_NOFS, -+ &c->btree_bio); -+ rb = container_of(bio, struct btree_read_bio, bio); -+ rb->c = c; -+ rb->b = b; -+ rb->ra = NULL; -+ rb->start_time = local_clock(); -+ rb->have_ioref = bch2_dev_get_ioref(ca, READ); -+ rb->pick = pick; -+ INIT_WORK(&rb->work, btree_node_read_work); -+ bio->bi_iter.bi_sector = pick.ptr.offset; -+ bio->bi_end_io = btree_node_read_endio; -+ bch2_bio_map(bio, b->data, btree_bytes(c)); -+ -+ if (rb->have_ioref) { -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], -+ bio_sectors(bio)); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ -+ if (sync) { -+ submit_bio_wait(bio); -+ -+ btree_node_read_work(&rb->work); -+ } else { -+ submit_bio(bio); -+ } -+ } else { -+ bio->bi_status = BLK_STS_REMOVED; -+ -+ if (sync) -+ btree_node_read_work(&rb->work); -+ else -+ queue_work(c->io_complete_wq, &rb->work); -+ } -+} -+ -+static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, -+ const struct bkey_i *k, unsigned level) -+{ -+ struct bch_fs *c = trans->c; -+ struct closure cl; -+ struct btree *b; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ -+ b = bch2_btree_node_mem_alloc(trans, level != 0); -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ BUG_ON(IS_ERR(b)); -+ -+ bkey_copy(&b->key, k); -+ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); -+ -+ set_btree_node_read_in_flight(b); -+ -+ bch2_btree_node_read(c, b, true); -+ -+ if (btree_node_read_error(b)) { -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ mutex_lock(&c->btree_cache.lock); -+ list_move(&b->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+ -+ ret = -EIO; -+ goto err; -+ } -+ -+ bch2_btree_set_root_for_read(c, b); -+err: -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ -+ return ret; -+} -+ -+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, -+ const struct bkey_i *k, unsigned level) -+{ -+ return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level)); -+ -+} -+ -+void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, -+ struct btree_write *w) -+{ -+ unsigned long old, new, v = READ_ONCE(b->will_make_reachable); -+ -+ do { -+ old = new = v; -+ if (!(old & 1)) -+ break; -+ -+ new &= ~1UL; -+ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); -+ -+ if (old & 1) -+ closure_put(&((struct btree_update *) new)->cl); -+ -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+} -+ -+static void __btree_node_write_done(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_write *w = btree_prev_write(b); -+ unsigned long old, new, v; -+ unsigned type = 0; -+ -+ bch2_btree_complete_write(c, b, w); -+ -+ v = READ_ONCE(b->flags); -+ do { -+ old = new = v; -+ -+ if ((old & (1U << BTREE_NODE_dirty)) && -+ (old & (1U << BTREE_NODE_need_write)) && -+ !(old & (1U << BTREE_NODE_never_write)) && -+ !(old & (1U << BTREE_NODE_write_blocked)) && -+ !(old & (1U << BTREE_NODE_will_make_reachable))) { -+ new &= ~(1U << BTREE_NODE_dirty); -+ new &= ~(1U << BTREE_NODE_need_write); -+ new |= (1U << BTREE_NODE_write_in_flight); -+ new |= (1U << BTREE_NODE_write_in_flight_inner); -+ new |= (1U << BTREE_NODE_just_written); -+ new ^= (1U << BTREE_NODE_write_idx); -+ -+ type = new & BTREE_WRITE_TYPE_MASK; -+ new &= ~BTREE_WRITE_TYPE_MASK; -+ } 
else { -+ new &= ~(1U << BTREE_NODE_write_in_flight); -+ new &= ~(1U << BTREE_NODE_write_in_flight_inner); -+ } -+ } while ((v = cmpxchg(&b->flags, old, new)) != old); -+ -+ if (new & (1U << BTREE_NODE_write_in_flight)) -+ __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); -+ else -+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -+} -+ -+static void btree_node_write_done(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_trans trans; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); -+ __btree_node_write_done(c, b); -+ six_unlock_read(&b->c.lock); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void btree_node_write_work(struct work_struct *work) -+{ -+ struct btree_write_bio *wbio = -+ container_of(work, struct btree_write_bio, work); -+ struct bch_fs *c = wbio->wbio.c; -+ struct btree *b = wbio->wbio.bio.bi_private; -+ struct bch_extent_ptr *ptr; -+ int ret = 0; -+ -+ btree_bounce_free(c, -+ wbio->data_bytes, -+ wbio->wbio.used_mempool, -+ wbio->data); -+ -+ bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, -+ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); -+ -+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) -+ goto err; -+ -+ if (wbio->wbio.first_btree_write) { -+ if (wbio->wbio.failed.nr) { -+ -+ } -+ } else { -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key, -+ BCH_WATERMARK_reclaim| -+ BTREE_INSERT_JOURNAL_RECLAIM| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_NOCHECK_RW, -+ !wbio->wbio.failed.nr)); -+ if (ret) -+ goto err; -+ } -+out: -+ bio_put(&wbio->wbio.bio); -+ btree_node_write_done(c, b); -+ return; -+err: -+ set_btree_node_noevict(b); -+ if (!bch2_err_matches(ret, EROFS)) -+ bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret)); -+ goto out; -+} -+ -+static void btree_node_write_endio(struct bio *bio) -+{ -+ struct bch_write_bio *wbio = to_wbio(bio); -+ struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; -+ struct bch_write_bio *orig = parent ?: wbio; -+ struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); -+ struct bch_fs *c = wbio->c; -+ struct btree *b = wbio->bio.bi_private; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); -+ unsigned long flags; -+ -+ if (wbio->have_ioref) -+ bch2_latency_acct(ca, wbio->submit_time, WRITE); -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", -+ bch2_blk_status_to_str(bio->bi_status)) || -+ bch2_meta_write_fault("btree")) { -+ spin_lock_irqsave(&c->btree_write_error_lock, flags); -+ bch2_dev_list_add_dev(&orig->failed, wbio->dev); -+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); -+ } -+ -+ if (wbio->have_ioref) -+ percpu_ref_put(&ca->io_ref); -+ -+ if (parent) { -+ bio_put(bio); -+ bio_endio(&parent->bio); -+ return; -+ } -+ -+ clear_btree_node_write_in_flight_inner(b); -+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); -+ INIT_WORK(&wb->work, btree_node_write_work); -+ queue_work(c->btree_io_complete_wq, &wb->work); -+} -+ -+static int validate_bset_for_write(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned sectors) -+{ -+ struct printbuf buf = PRINTBUF; -+ bool saw_error; -+ int ret; -+ -+ ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), -+ BKEY_TYPE_btree, WRITE, &buf); -+ -+ if (ret) -+ bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); -+ printbuf_exit(&buf); -+ if (ret) -+ return ret; -+ -+ ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: -+ validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); -+ if (ret) { -+ bch2_inconsistent_error(c); -+ dump_stack(); -+ } -+ -+ return ret; -+} -+ -+static void btree_write_submit(struct work_struct *work) -+{ -+ struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); -+ struct bch_extent_ptr *ptr; -+ BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; -+ -+ bkey_copy(&tmp.k, &wbio->key); -+ -+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) -+ ptr->offset += wbio->sector_offset; -+ -+ bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, -+ &tmp.k, false); -+} -+ -+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) -+{ -+ struct btree_write_bio *wbio; -+ struct bset_tree *t; -+ struct bset *i; -+ struct btree_node *bn = NULL; -+ struct btree_node_entry *bne = NULL; -+ struct sort_iter sort_iter; -+ struct nonce nonce; -+ unsigned bytes_to_write, sectors_to_write, bytes, u64s; -+ u64 seq = 0; -+ bool used_mempool; -+ unsigned long old, new; -+ bool validate_before_checksum = false; -+ enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; -+ void *data; -+ int ret; -+ -+ if (flags & BTREE_WRITE_ALREADY_STARTED) -+ goto do_write; -+ -+ /* -+ * We may only have a read lock on the btree node - the dirty bit is our -+ * "lock" against racing with other threads that may be trying to start -+ * a write, we do a write iff we clear the dirty bit. 
Since setting the -+ * dirty bit requires a write lock, we can't race with other threads -+ * redirtying it: -+ */ -+ do { -+ old = new = READ_ONCE(b->flags); -+ -+ if (!(old & (1 << BTREE_NODE_dirty))) -+ return; -+ -+ if ((flags & BTREE_WRITE_ONLY_IF_NEED) && -+ !(old & (1 << BTREE_NODE_need_write))) -+ return; -+ -+ if (old & -+ ((1 << BTREE_NODE_never_write)| -+ (1 << BTREE_NODE_write_blocked))) -+ return; -+ -+ if (b->written && -+ (old & (1 << BTREE_NODE_will_make_reachable))) -+ return; -+ -+ if (old & (1 << BTREE_NODE_write_in_flight)) -+ return; -+ -+ if (flags & BTREE_WRITE_ONLY_IF_NEED) -+ type = new & BTREE_WRITE_TYPE_MASK; -+ new &= ~BTREE_WRITE_TYPE_MASK; -+ -+ new &= ~(1 << BTREE_NODE_dirty); -+ new &= ~(1 << BTREE_NODE_need_write); -+ new |= (1 << BTREE_NODE_write_in_flight); -+ new |= (1 << BTREE_NODE_write_in_flight_inner); -+ new |= (1 << BTREE_NODE_just_written); -+ new ^= (1 << BTREE_NODE_write_idx); -+ } while (cmpxchg_acquire(&b->flags, old, new) != old); -+ -+ if (new & (1U << BTREE_NODE_need_write)) -+ return; -+do_write: -+ BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); -+ -+ atomic_dec(&c->btree_cache.dirty); -+ -+ BUG_ON(btree_node_fake(b)); -+ BUG_ON((b->will_make_reachable != 0) != !b->written); -+ -+ BUG_ON(b->written >= btree_sectors(c)); -+ BUG_ON(b->written & (block_sectors(c) - 1)); -+ BUG_ON(bset_written(b, btree_bset_last(b))); -+ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); -+ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); -+ -+ bch2_sort_whiteouts(c, b); -+ -+ sort_iter_init(&sort_iter, b); -+ -+ bytes = !b->written -+ ? sizeof(struct btree_node) -+ : sizeof(struct btree_node_entry); -+ -+ bytes += b->whiteout_u64s * sizeof(u64); -+ -+ for_each_bset(b, t) { -+ i = bset(b, t); -+ -+ if (bset_written(b, i)) -+ continue; -+ -+ bytes += le16_to_cpu(i->u64s) * sizeof(u64); -+ sort_iter_add(&sort_iter, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ seq = max(seq, le64_to_cpu(i->journal_seq)); -+ } -+ -+ BUG_ON(b->written && !seq); -+ -+ /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ -+ bytes += 8; -+ -+ /* buffer must be a multiple of the block size */ -+ bytes = round_up(bytes, block_bytes(c)); -+ -+ data = btree_bounce_alloc(c, bytes, &used_mempool); -+ -+ if (!b->written) { -+ bn = data; -+ *bn = *b->data; -+ i = &bn->keys; -+ } else { -+ bne = data; -+ bne->keys = b->data->keys; -+ i = &bne->keys; -+ } -+ -+ i->journal_seq = cpu_to_le64(seq); -+ i->u64s = 0; -+ -+ sort_iter_add(&sort_iter, -+ unwritten_whiteouts_start(c, b), -+ unwritten_whiteouts_end(c, b)); -+ SET_BSET_SEPARATE_WHITEOUTS(i, false); -+ -+ b->whiteout_u64s = 0; -+ -+ u64s = bch2_sort_keys(i->start, &sort_iter, false); -+ le16_add_cpu(&i->u64s, u64s); -+ -+ BUG_ON(!b->written && i->u64s != b->data->keys.u64s); -+ -+ set_needs_whiteout(i, false); -+ -+ /* do we have data to write? 
*/ -+ if (b->written && !i->u64s) -+ goto nowrite; -+ -+ bytes_to_write = vstruct_end(i) - data; -+ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; -+ -+ if (!b->written && -+ b->key.k.type == KEY_TYPE_btree_ptr_v2) -+ BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write); -+ -+ memset(data + bytes_to_write, 0, -+ (sectors_to_write << 9) - bytes_to_write); -+ -+ BUG_ON(b->written + sectors_to_write > btree_sectors(c)); -+ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); -+ BUG_ON(i->seq != b->data->keys.seq); -+ -+ i->version = cpu_to_le16(c->sb.version); -+ SET_BSET_OFFSET(i, b->written); -+ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); -+ -+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) -+ validate_before_checksum = true; -+ -+ /* validate_bset will be modifying: */ -+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) -+ validate_before_checksum = true; -+ -+ /* if we're going to be encrypting, check metadata validity first: */ -+ if (validate_before_checksum && -+ validate_bset_for_write(c, b, i, sectors_to_write)) -+ goto err; -+ -+ ret = bset_encrypt(c, i, b->written << 9); -+ if (bch2_fs_fatal_err_on(ret, c, -+ "error encrypting btree node: %i\n", ret)) -+ goto err; -+ -+ nonce = btree_nonce(i, b->written << 9); -+ -+ if (bn) -+ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); -+ else -+ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ -+ /* if we're not encrypting, check metadata after checksumming: */ -+ if (!validate_before_checksum && -+ validate_bset_for_write(c, b, i, sectors_to_write)) -+ goto err; -+ -+ /* -+ * We handle btree write errors by immediately halting the journal - -+ * after we've done that, we can't issue any subsequent btree writes -+ * because they might have pointers to new nodes that failed to write. 
-+ * -+ * Furthermore, there's no point in doing any more btree writes because -+ * with the journal stopped, we're never going to update the journal to -+ * reflect that those writes were done and the data flushed from the -+ * journal: -+ * -+ * Also on journal error, the pending write may have updates that were -+ * never journalled (interior nodes, see btree_update_nodes_written()) - -+ * it's critical that we don't do the write in that case otherwise we -+ * will have updates visible that weren't in the journal: -+ * -+ * Make sure to update b->written so bch2_btree_init_next() doesn't -+ * break: -+ */ -+ if (bch2_journal_error(&c->journal) || -+ c->opts.nochanges) -+ goto err; -+ -+ trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); -+ -+ wbio = container_of(bio_alloc_bioset(NULL, -+ buf_pages(data, sectors_to_write << 9), -+ REQ_OP_WRITE|REQ_META, -+ GFP_NOFS, -+ &c->btree_bio), -+ struct btree_write_bio, wbio.bio); -+ wbio_init(&wbio->wbio.bio); -+ wbio->data = data; -+ wbio->data_bytes = bytes; -+ wbio->sector_offset = b->written; -+ wbio->wbio.c = c; -+ wbio->wbio.used_mempool = used_mempool; -+ wbio->wbio.first_btree_write = !b->written; -+ wbio->wbio.bio.bi_end_io = btree_node_write_endio; -+ wbio->wbio.bio.bi_private = b; -+ -+ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); -+ -+ bkey_copy(&wbio->key, &b->key); -+ -+ b->written += sectors_to_write; -+ -+ if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) -+ bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = -+ cpu_to_le16(b->written); -+ -+ atomic64_inc(&c->btree_write_stats[type].nr); -+ atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); -+ -+ INIT_WORK(&wbio->work, btree_write_submit); -+ queue_work(c->io_complete_wq, &wbio->work); -+ return; -+err: -+ set_btree_node_noevict(b); -+ b->written += sectors_to_write; -+nowrite: -+ btree_bounce_free(c, bytes, used_mempool, data); -+ __btree_node_write_done(c, b); -+} -+ -+/* -+ * Work that must be done with write lock held: -+ */ -+bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) -+{ -+ bool invalidated_iter = false; -+ struct btree_node_entry *bne; -+ struct bset_tree *t; -+ -+ if (!btree_node_just_written(b)) -+ return false; -+ -+ BUG_ON(b->whiteout_u64s); -+ -+ clear_btree_node_just_written(b); -+ -+ /* -+ * Note: immediately after write, bset_written() doesn't work - the -+ * amount of data we had to write after compaction might have been -+ * smaller than the offset of the last bset. 
-+ * -+ * However, we know that all bsets have been written here, as long as -+ * we're still holding the write lock: -+ */ -+ -+ /* -+ * XXX: decide if we really want to unconditionally sort down to a -+ * single bset: -+ */ -+ if (b->nsets > 1) { -+ btree_node_sort(c, b, 0, b->nsets, true); -+ invalidated_iter = true; -+ } else { -+ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); -+ } -+ -+ for_each_bset(b, t) -+ set_needs_whiteout(bset(b, t), true); -+ -+ bch2_btree_verify(c, b); -+ -+ /* -+ * If later we don't unconditionally sort down to a single bset, we have -+ * to ensure this is still true: -+ */ -+ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); -+ -+ bne = want_new_bset(c, b); -+ if (bne) -+ bch2_bset_init_next(c, b, bne); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ return invalidated_iter; -+} -+ -+/* -+ * Use this one if the node is intent locked: -+ */ -+void bch2_btree_node_write(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_type_held, -+ unsigned flags) -+{ -+ if (lock_type_held == SIX_LOCK_intent || -+ (lock_type_held == SIX_LOCK_read && -+ six_lock_tryupgrade(&b->c.lock))) { -+ __bch2_btree_node_write(c, b, flags); -+ -+ /* don't cycle lock unnecessarily: */ -+ if (btree_node_just_written(b) && -+ six_trylock_write(&b->c.lock)) { -+ bch2_btree_post_write_cleanup(c, b); -+ six_unlock_write(&b->c.lock); -+ } -+ -+ if (lock_type_held == SIX_LOCK_read) -+ six_lock_downgrade(&b->c.lock); -+ } else { -+ __bch2_btree_node_write(c, b, flags); -+ if (lock_type_held == SIX_LOCK_write && -+ btree_node_just_written(b)) -+ bch2_btree_post_write_cleanup(c, b); -+ } -+} -+ -+static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) -+{ -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+ bool ret = false; -+restart: -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) -+ if (test_bit(flag, &b->flags)) { -+ rcu_read_unlock(); -+ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); -+ ret = true; -+ goto restart; -+ } -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+bool bch2_btree_flush_all_reads(struct bch_fs *c) -+{ -+ return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); -+} -+ -+bool bch2_btree_flush_all_writes(struct bch_fs *c) -+{ -+ return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); -+} -+ -+static const char * const bch2_btree_write_types[] = { -+#define x(t, n) [n] = #t, -+ BCH_BTREE_WRITE_TYPES() -+ NULL -+}; -+ -+void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ printbuf_tabstop_push(out, 20); -+ printbuf_tabstop_push(out, 10); -+ -+ prt_tab(out); -+ prt_str(out, "nr"); -+ prt_tab(out); -+ prt_str(out, "size"); -+ prt_newline(out); -+ -+ for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { -+ u64 nr = atomic64_read(&c->btree_write_stats[i].nr); -+ u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); -+ -+ prt_printf(out, "%s:", bch2_btree_write_types[i]); -+ prt_tab(out); -+ prt_u64(out, nr); -+ prt_tab(out); -+ prt_human_readable_u64(out, nr ? 
div64_u64(bytes, nr) : 0); -+ prt_newline(out); -+ } -+} -diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h -new file mode 100644 -index 000000000..cd99bbb00 ---- /dev/null -+++ b/fs/bcachefs/btree_io.h -@@ -0,0 +1,228 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_IO_H -+#define _BCACHEFS_BTREE_IO_H -+ -+#include "bkey_methods.h" -+#include "bset.h" -+#include "btree_locking.h" -+#include "checksum.h" -+#include "extents.h" -+#include "io_types.h" -+ -+struct bch_fs; -+struct btree_write; -+struct btree; -+struct btree_iter; -+struct btree_node_read_all; -+ -+static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) -+{ -+ if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) -+ atomic_inc(&c->btree_cache.dirty); -+} -+ -+static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) -+{ -+ if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) -+ atomic_dec(&c->btree_cache.dirty); -+} -+ -+static inline unsigned btree_ptr_sectors_written(struct bkey_i *k) -+{ -+ return k->k.type == KEY_TYPE_btree_ptr_v2 -+ ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written) -+ : 0; -+} -+ -+struct btree_read_bio { -+ struct bch_fs *c; -+ struct btree *b; -+ struct btree_node_read_all *ra; -+ u64 start_time; -+ unsigned have_ioref:1; -+ unsigned idx:7; -+ struct extent_ptr_decoded pick; -+ struct work_struct work; -+ struct bio bio; -+}; -+ -+struct btree_write_bio { -+ struct work_struct work; -+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -+ void *data; -+ unsigned data_bytes; -+ unsigned sector_offset; -+ struct bch_write_bio wbio; -+}; -+ -+void bch2_btree_node_io_unlock(struct btree *); -+void bch2_btree_node_io_lock(struct btree *); -+void __bch2_btree_node_wait_on_read(struct btree *); -+void __bch2_btree_node_wait_on_write(struct btree *); -+void bch2_btree_node_wait_on_read(struct btree *); -+void bch2_btree_node_wait_on_write(struct btree *); -+ -+enum compact_mode { -+ COMPACT_LAZY, -+ COMPACT_ALL, -+}; -+ -+bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, -+ enum compact_mode); -+ -+static inline bool should_compact_bset_lazy(struct btree *b, -+ struct bset_tree *t) -+{ -+ unsigned total_u64s = bset_u64s(t); -+ unsigned dead_u64s = bset_dead_u64s(b, t); -+ -+ return dead_u64s > 64 && dead_u64s * 3 > total_u64s; -+} -+ -+static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) -+ if (should_compact_bset_lazy(b, t)) -+ return bch2_compact_whiteouts(c, b, COMPACT_LAZY); -+ -+ return false; -+} -+ -+static inline struct nonce btree_nonce(struct bset *i, unsigned offset) -+{ -+ return (struct nonce) {{ -+ [0] = cpu_to_le32(offset), -+ [1] = ((__le32 *) &i->seq)[0], -+ [2] = ((__le32 *) &i->seq)[1], -+ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, -+ }}; -+} -+ -+static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) -+{ -+ struct nonce nonce = btree_nonce(i, offset); -+ int ret; -+ -+ if (!offset) { -+ struct btree_node *bn = container_of(i, struct btree_node, keys); -+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; -+ -+ ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, -+ &bn->flags, bytes); -+ if (ret) -+ return ret; -+ -+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); -+ } -+ -+ return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, -+ vstruct_end(i) - (void *) i->_data); -+} -+ -+void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); -+ 
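btree_nonce() above derives a per-bset nonce from the bset's byte offset within the node and its sequence numbers, so no two encrypted bsets share a nonce. Below is a simplified standalone sketch of building such a 16-byte nonce; the word layout and the EXAMPLE_NONCE_BTREE constant are illustrative stand-ins, not the values used by the patch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical domain-separation constant, standing in for BCH_NONCE_BTREE. */
#define EXAMPLE_NONCE_BTREE 0x10000000u

/*
 * Build a 16-byte nonce from the bset's byte offset inside the node and
 * its sequence numbers.  The real code stores little-endian 32-bit words;
 * this sketch just copies host-order words for illustration.
 */
static void example_btree_nonce(uint8_t out[16], uint32_t offset,
				uint64_t seq, uint64_t journal_seq)
{
	uint32_t words[4] = {
		offset,
		(uint32_t) seq,
		(uint32_t) (seq >> 32),
		(uint32_t) journal_seq ^ EXAMPLE_NONCE_BTREE,
	};

	memcpy(out, words, sizeof(words));
}

int main(void)
{
	uint8_t nonce[16];

	example_btree_nonce(nonce, 512, 0x1122334455667788ull, 42);

	for (int i = 0; i < 16; i++)
		printf("%02x", nonce[i]);
	printf("\n");
	return 0;
}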
-+void bch2_btree_node_drop_keys_outside_node(struct btree *); -+ -+void bch2_btree_build_aux_trees(struct btree *); -+void bch2_btree_init_next(struct btree_trans *, struct btree *); -+ -+int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, -+ struct btree *, bool, bool *); -+void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); -+int bch2_btree_root_read(struct bch_fs *, enum btree_id, -+ const struct bkey_i *, unsigned); -+ -+void bch2_btree_complete_write(struct bch_fs *, struct btree *, -+ struct btree_write *); -+ -+bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); -+ -+enum btree_write_flags { -+ __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, -+ __BTREE_WRITE_ALREADY_STARTED, -+}; -+#define BTREE_WRITE_ONLY_IF_NEED BIT(__BTREE_WRITE_ONLY_IF_NEED) -+#define BTREE_WRITE_ALREADY_STARTED BIT(__BTREE_WRITE_ALREADY_STARTED) -+ -+void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); -+void bch2_btree_node_write(struct bch_fs *, struct btree *, -+ enum six_lock_type, unsigned); -+ -+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_held) -+{ -+ bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); -+} -+ -+bool bch2_btree_flush_all_reads(struct bch_fs *); -+bool bch2_btree_flush_all_writes(struct bch_fs *); -+ -+static inline void compat_bformat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, struct bkey_format *f) -+{ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_inodes) { -+ swap(f->bits_per_field[BKEY_FIELD_INODE], -+ f->bits_per_field[BKEY_FIELD_OFFSET]); -+ swap(f->field_offset[BKEY_FIELD_INODE], -+ f->field_offset[BKEY_FIELD_OFFSET]); -+ } -+ -+ if (version < bcachefs_metadata_version_snapshot && -+ (level || btree_type_has_snapshots(btree_id))) { -+ u64 max_packed = -+ ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); -+ -+ f->field_offset[BKEY_FIELD_SNAPSHOT] = write -+ ? 
0 -+ : cpu_to_le64(U32_MAX - max_packed); -+ } -+} -+ -+static inline void compat_bpos(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, struct bpos *p) -+{ -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bpos_swab(p); -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_inodes) -+ swap(p->inode, p->offset); -+} -+ -+static inline void compat_btree_node(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct btree_node *bn) -+{ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id_is_extents(btree_id) && -+ !bpos_eq(bn->min_key, POS_MIN) && -+ write) -+ bn->min_key = bpos_nosnap_predecessor(bn->min_key); -+ -+ if (version < bcachefs_metadata_version_snapshot && -+ write) -+ bn->max_key.snapshot = 0; -+ -+ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); -+ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); -+ -+ if (version < bcachefs_metadata_version_snapshot && -+ !write) -+ bn->max_key.snapshot = U32_MAX; -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id_is_extents(btree_id) && -+ !bpos_eq(bn->min_key, POS_MIN) && -+ !write) -+ bn->min_key = bpos_nosnap_successor(bn->min_key); -+} -+ -+void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *); -+ -+#endif /* _BCACHEFS_BTREE_IO_H */ -diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c -new file mode 100644 -index 000000000..21c2bc8a8 ---- /dev/null -+++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,3194 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "bkey_buf.h" -+#include "btree_cache.h" -+#include "btree_iter.h" -+#include "btree_journal_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "debug.h" -+#include "error.h" -+#include "extents.h" -+#include "journal.h" -+#include "replicas.h" -+#include "snapshot.h" -+#include "trace.h" -+ -+#include -+#include -+ -+static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); -+static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, -+ struct btree_path *); -+ -+static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) -+{ -+#ifdef TRACK_PATH_ALLOCATED -+ return iter->ip_allocated; -+#else -+ return 0; -+#endif -+} -+ -+static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); -+ -+static inline int __btree_path_cmp(const struct btree_path *l, -+ enum btree_id r_btree_id, -+ bool r_cached, -+ struct bpos r_pos, -+ unsigned r_level) -+{ -+ /* -+ * Must match lock ordering as defined by __bch2_btree_node_lock: -+ */ -+ return cmp_int(l->btree_id, r_btree_id) ?: -+ cmp_int((int) l->cached, (int) r_cached) ?: -+ bpos_cmp(l->pos, r_pos) ?: -+ -cmp_int(l->level, r_level); -+} -+ -+static inline int btree_path_cmp(const struct btree_path *l, -+ const struct btree_path *r) -+{ -+ return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); -+} -+ -+static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) -+{ -+ /* Are we iterating over keys in all snapshots? 
*/ -+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { -+ p = bpos_successor(p); -+ } else { -+ p = bpos_nosnap_successor(p); -+ p.snapshot = iter->snapshot; -+ } -+ -+ return p; -+} -+ -+static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) -+{ -+ /* Are we iterating over keys in all snapshots? */ -+ if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { -+ p = bpos_predecessor(p); -+ } else { -+ p = bpos_nosnap_predecessor(p); -+ p.snapshot = iter->snapshot; -+ } -+ -+ return p; -+} -+ -+static inline struct bpos btree_iter_search_key(struct btree_iter *iter) -+{ -+ struct bpos pos = iter->pos; -+ -+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && -+ !bkey_eq(pos, POS_MAX)) -+ pos = bkey_successor(iter, pos); -+ return pos; -+} -+ -+static inline bool btree_path_pos_before_node(struct btree_path *path, -+ struct btree *b) -+{ -+ return bpos_lt(path->pos, b->data->min_key); -+} -+ -+static inline bool btree_path_pos_after_node(struct btree_path *path, -+ struct btree *b) -+{ -+ return bpos_gt(path->pos, b->key.k.p); -+} -+ -+static inline bool btree_path_pos_in_node(struct btree_path *path, -+ struct btree *b) -+{ -+ return path->btree_id == b->c.btree_id && -+ !btree_path_pos_before_node(path, b) && -+ !btree_path_pos_after_node(path, b); -+} -+ -+/* Btree iterator: */ -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+static void bch2_btree_path_verify_cached(struct btree_trans *trans, -+ struct btree_path *path) -+{ -+ struct bkey_cached *ck; -+ bool locked = btree_node_locked(path, 0); -+ -+ if (!bch2_btree_node_relock(trans, path, 0)) -+ return; -+ -+ ck = (void *) path->l[0].b; -+ BUG_ON(ck->key.btree_id != path->btree_id || -+ !bkey_eq(ck->key.pos, path->pos)); -+ -+ if (!locked) -+ btree_node_unlock(trans, path, 0); -+} -+ -+static void bch2_btree_path_verify_level(struct btree_trans *trans, -+ struct btree_path *path, unsigned level) -+{ -+ struct btree_path_level *l; -+ struct btree_node_iter tmp; -+ bool locked; -+ struct bkey_packed *p, *k; -+ struct printbuf buf1 = PRINTBUF; -+ struct printbuf buf2 = PRINTBUF; -+ struct printbuf buf3 = PRINTBUF; -+ const char *msg; -+ -+ if (!bch2_debug_check_iterators) -+ return; -+ -+ l = &path->l[level]; -+ tmp = l->iter; -+ locked = btree_node_locked(path, level); -+ -+ if (path->cached) { -+ if (!level) -+ bch2_btree_path_verify_cached(trans, path); -+ return; -+ } -+ -+ if (!btree_path_node(path, level)) -+ return; -+ -+ if (!bch2_btree_node_relock_notrace(trans, path, level)) -+ return; -+ -+ BUG_ON(!btree_path_pos_in_node(path, l->b)); -+ -+ bch2_btree_node_iter_verify(&l->iter, l->b); -+ -+ /* -+ * For interior nodes, the iterator will have skipped past deleted keys: -+ */ -+ p = level -+ ? 
bch2_btree_node_iter_prev(&tmp, l->b) -+ : bch2_btree_node_iter_prev_all(&tmp, l->b); -+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ -+ if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { -+ msg = "before"; -+ goto err; -+ } -+ -+ if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { -+ msg = "after"; -+ goto err; -+ } -+ -+ if (!locked) -+ btree_node_unlock(trans, path, level); -+ return; -+err: -+ bch2_bpos_to_text(&buf1, path->pos); -+ -+ if (p) { -+ struct bkey uk = bkey_unpack_key(l->b, p); -+ -+ bch2_bkey_to_text(&buf2, &uk); -+ } else { -+ prt_printf(&buf2, "(none)"); -+ } -+ -+ if (k) { -+ struct bkey uk = bkey_unpack_key(l->b, k); -+ -+ bch2_bkey_to_text(&buf3, &uk); -+ } else { -+ prt_printf(&buf3, "(none)"); -+ } -+ -+ panic("path should be %s key at level %u:\n" -+ "path pos %s\n" -+ "prev key %s\n" -+ "cur key %s\n", -+ msg, level, buf1.buf, buf2.buf, buf3.buf); -+} -+ -+static void bch2_btree_path_verify(struct btree_trans *trans, -+ struct btree_path *path) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned i; -+ -+ EBUG_ON(path->btree_id >= BTREE_ID_NR); -+ -+ for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { -+ if (!path->l[i].b) { -+ BUG_ON(!path->cached && -+ bch2_btree_id_root(c, path->btree_id)->b->c.level > i); -+ break; -+ } -+ -+ bch2_btree_path_verify_level(trans, path, i); -+ } -+ -+ bch2_btree_path_verify_locks(path); -+} -+ -+void bch2_trans_verify_paths(struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ -+ trans_for_each_path(trans, path) -+ bch2_btree_path_verify(trans, path); -+} -+ -+static void bch2_btree_iter_verify(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ -+ BUG_ON(iter->btree_id >= BTREE_ID_NR); -+ -+ BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); -+ -+ BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && -+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); -+ -+ BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && -+ (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && -+ !btree_type_has_snapshots(iter->btree_id)); -+ -+ if (iter->update_path) -+ bch2_btree_path_verify(trans, iter->update_path); -+ bch2_btree_path_verify(trans, iter->path); -+} -+ -+static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) -+{ -+ BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && -+ !iter->pos.snapshot); -+ -+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && -+ iter->pos.snapshot != iter->snapshot); -+ -+ BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || -+ bkey_gt(iter->pos, iter->k.p)); -+} -+ -+static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree_iter copy; -+ struct bkey_s_c prev; -+ int ret = 0; -+ -+ if (!bch2_debug_check_iterators) -+ return 0; -+ -+ if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) -+ return 0; -+ -+ if (bkey_err(k) || !k.k) -+ return 0; -+ -+ BUG_ON(!bch2_snapshot_is_ancestor(trans->c, -+ iter->snapshot, -+ k.k->p.snapshot)); -+ -+ bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, -+ BTREE_ITER_NOPRESERVE| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ prev = bch2_btree_iter_prev(©); -+ if (!prev.k) -+ goto out; -+ -+ ret = bkey_err(prev); -+ if (ret) -+ goto out; -+ -+ if (bkey_eq(prev.k->p, k.k->p) && -+ bch2_snapshot_is_ancestor(trans->c, iter->snapshot, -+ prev.k->p.snapshot) > 0) { -+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; -+ -+ bch2_bkey_to_text(&buf1, k.k); -+ bch2_bkey_to_text(&buf2, prev.k); -+ -+ panic("iter snap %u\n" -+ "k %s\n" -+ "prev %s\n", 
-+ iter->snapshot, -+ buf1.buf, buf2.buf); -+ } -+out: -+ bch2_trans_iter_exit(trans, ©); -+ return ret; -+} -+ -+void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, -+ struct bpos pos, bool key_cache) -+{ -+ struct btree_path *path; -+ unsigned idx; -+ struct printbuf buf = PRINTBUF; -+ -+ btree_trans_sort_paths(trans); -+ -+ trans_for_each_path_inorder(trans, path, idx) { -+ int cmp = cmp_int(path->btree_id, id) ?: -+ cmp_int(path->cached, key_cache); -+ -+ if (cmp > 0) -+ break; -+ if (cmp < 0) -+ continue; -+ -+ if (!btree_node_locked(path, 0) || -+ !path->should_be_locked) -+ continue; -+ -+ if (!key_cache) { -+ if (bkey_ge(pos, path->l[0].b->data->min_key) && -+ bkey_le(pos, path->l[0].b->key.k.p)) -+ return; -+ } else { -+ if (bkey_eq(pos, path->pos)) -+ return; -+ } -+ } -+ -+ bch2_dump_trans_paths_updates(trans); -+ bch2_bpos_to_text(&buf, pos); -+ -+ panic("not locked: %s %s%s\n", -+ bch2_btree_ids[id], buf.buf, -+ key_cache ? " cached" : ""); -+} -+ -+#else -+ -+static inline void bch2_btree_path_verify_level(struct btree_trans *trans, -+ struct btree_path *path, unsigned l) {} -+static inline void bch2_btree_path_verify(struct btree_trans *trans, -+ struct btree_path *path) {} -+static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} -+static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} -+static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } -+ -+#endif -+ -+/* Btree path: fixups after btree updates */ -+ -+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct btree_node_iter_set *set; -+ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == t->end_offset) { -+ set->k = __btree_node_key_to_offset(b, k); -+ bch2_btree_node_iter_sort(iter, b); -+ return; -+ } -+ -+ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); -+} -+ -+static void __bch2_btree_path_fix_key_modified(struct btree_path *path, -+ struct btree *b, -+ struct bkey_packed *where) -+{ -+ struct btree_path_level *l = &path->l[b->c.level]; -+ -+ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) -+ return; -+ -+ if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) -+ bch2_btree_node_iter_advance(&l->iter, l->b); -+} -+ -+void bch2_btree_path_fix_key_modified(struct btree_trans *trans, -+ struct btree *b, -+ struct bkey_packed *where) -+{ -+ struct btree_path *path; -+ -+ trans_for_each_path_with_node(trans, b, path) { -+ __bch2_btree_path_fix_key_modified(path, b, where); -+ bch2_btree_path_verify_level(trans, path, b->c.level); -+ } -+} -+ -+static void __bch2_btree_node_iter_fix(struct btree_path *path, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bset_tree *t, -+ struct bkey_packed *where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ const struct bkey_packed *end = btree_bkey_last(b, t); -+ struct btree_node_iter_set *set; -+ unsigned offset = __btree_node_key_to_offset(b, where); -+ int shift = new_u64s - clobber_u64s; -+ unsigned old_end = t->end_offset - shift; -+ unsigned orig_iter_pos = node_iter->data[0].k; -+ bool iter_current_key_modified = -+ orig_iter_pos >= offset && -+ orig_iter_pos <= offset + clobber_u64s; -+ -+ btree_node_iter_for_each(node_iter, set) -+ if (set->end == old_end) -+ goto found; -+ -+ /* didn't find the bset in the iterator - might have to readd it: */ -+ if (new_u64s && -+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { 
-+ bch2_btree_node_iter_push(node_iter, b, where, end); -+ goto fixup_done; -+ } else { -+ /* Iterator is after key that changed */ -+ return; -+ } -+found: -+ set->end = t->end_offset; -+ -+ /* Iterator hasn't gotten to the key that changed yet: */ -+ if (set->k < offset) -+ return; -+ -+ if (new_u64s && -+ bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { -+ set->k = offset; -+ } else if (set->k < offset + clobber_u64s) { -+ set->k = offset + new_u64s; -+ if (set->k == set->end) -+ bch2_btree_node_iter_set_drop(node_iter, set); -+ } else { -+ /* Iterator is after key that changed */ -+ set->k = (int) set->k + shift; -+ return; -+ } -+ -+ bch2_btree_node_iter_sort(node_iter, b); -+fixup_done: -+ if (node_iter->data[0].k != orig_iter_pos) -+ iter_current_key_modified = true; -+ -+ /* -+ * When a new key is added, and the node iterator now points to that -+ * key, the iterator might have skipped past deleted keys that should -+ * come after the key the iterator now points to. We have to rewind to -+ * before those deleted keys - otherwise -+ * bch2_btree_node_iter_prev_all() breaks: -+ */ -+ if (!bch2_btree_node_iter_end(node_iter) && -+ iter_current_key_modified && -+ b->c.level) { -+ struct bset_tree *t; -+ struct bkey_packed *k, *k2, *p; -+ -+ k = bch2_btree_node_iter_peek_all(node_iter, b); -+ -+ for_each_bset(b, t) { -+ bool set_pos = false; -+ -+ if (node_iter->data[0].end == t->end_offset) -+ continue; -+ -+ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); -+ -+ while ((p = bch2_bkey_prev_all(b, t, k2)) && -+ bkey_iter_cmp(b, k, p) < 0) { -+ k2 = p; -+ set_pos = true; -+ } -+ -+ if (set_pos) -+ btree_node_iter_set_set_pos(node_iter, -+ b, t, k2); -+ } -+ } -+} -+ -+void bch2_btree_node_iter_fix(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bkey_packed *where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); -+ struct btree_path *linked; -+ -+ if (node_iter != &path->l[b->c.level].iter) { -+ __bch2_btree_node_iter_fix(path, b, node_iter, t, -+ where, clobber_u64s, new_u64s); -+ -+ if (bch2_debug_check_iterators) -+ bch2_btree_node_iter_verify(node_iter, b); -+ } -+ -+ trans_for_each_path_with_node(trans, b, linked) { -+ __bch2_btree_node_iter_fix(linked, b, -+ &linked->l[b->c.level].iter, t, -+ where, clobber_u64s, new_u64s); -+ bch2_btree_path_verify_level(trans, linked, b->c.level); -+ } -+} -+ -+/* Btree path level: pointer to a particular btree node and node iter */ -+ -+static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, -+ struct btree_path_level *l, -+ struct bkey *u, -+ struct bkey_packed *k) -+{ -+ if (unlikely(!k)) { -+ /* -+ * signal to bch2_btree_iter_peek_slot() that we're currently at -+ * a hole -+ */ -+ u->type = KEY_TYPE_deleted; -+ return bkey_s_c_null; -+ } -+ -+ return bkey_disassemble(l->b, k, u); -+} -+ -+static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, -+ struct btree_path_level *l, -+ struct bkey *u) -+{ -+ return __btree_iter_unpack(c, l, u, -+ bch2_btree_node_iter_peek_all(&l->iter, l->b)); -+} -+ -+static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree_path_level *l, -+ struct bkey *u) -+{ -+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, -+ bch2_btree_node_iter_peek(&l->iter, l->b)); -+ -+ path->pos = k.k ? 
k.k->p : l->b->key.k.p; -+ trans->paths_sorted = false; -+ bch2_btree_path_verify_level(trans, path, l - path->l); -+ return k; -+} -+ -+static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree_path_level *l, -+ struct bkey *u) -+{ -+ struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, -+ bch2_btree_node_iter_prev(&l->iter, l->b)); -+ -+ path->pos = k.k ? k.k->p : l->b->data->min_key; -+ trans->paths_sorted = false; -+ bch2_btree_path_verify_level(trans, path, l - path->l); -+ return k; -+} -+ -+static inline bool btree_path_advance_to_pos(struct btree_path *path, -+ struct btree_path_level *l, -+ int max_advance) -+{ -+ struct bkey_packed *k; -+ int nr_advanced = 0; -+ -+ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && -+ bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { -+ if (max_advance > 0 && nr_advanced >= max_advance) -+ return false; -+ -+ bch2_btree_node_iter_advance(&l->iter, l->b); -+ nr_advanced++; -+ } -+ -+ return true; -+} -+ -+static inline void __btree_path_level_init(struct btree_path *path, -+ unsigned level) -+{ -+ struct btree_path_level *l = &path->l[level]; -+ -+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); -+ -+ /* -+ * Iterators to interior nodes should always be pointed at the first non -+ * whiteout: -+ */ -+ if (level) -+ bch2_btree_node_iter_peek(&l->iter, l->b); -+} -+ -+void bch2_btree_path_level_init(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b) -+{ -+ BUG_ON(path->cached); -+ -+ EBUG_ON(!btree_path_pos_in_node(path, b)); -+ -+ path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); -+ path->l[b->c.level].b = b; -+ __btree_path_level_init(path, b->c.level); -+} -+ -+/* Btree path: fixups after btree node updates: */ -+ -+static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) -+ if (!i->cached && -+ i->level == b->c.level && -+ i->btree_id == b->c.btree_id && -+ bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && -+ bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { -+ i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; -+ -+ if (unlikely(trans->journal_replay_not_finished)) { -+ struct bkey_i *j_k = -+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, -+ i->k->k.p); -+ -+ if (j_k) { -+ i->old_k = j_k->k; -+ i->old_v = &j_k->v; -+ } -+ } -+ } -+} -+ -+/* -+ * A btree node is being replaced - update the iterator to point to the new -+ * node: -+ */ -+void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) -+{ -+ struct btree_path *path; -+ -+ trans_for_each_path(trans, path) -+ if (path->uptodate == BTREE_ITER_UPTODATE && -+ !path->cached && -+ btree_path_pos_in_node(path, b)) { -+ enum btree_node_locked_type t = -+ btree_lock_want(path, b->c.level); -+ -+ if (t != BTREE_NODE_UNLOCKED) { -+ btree_node_unlock(trans, path, b->c.level); -+ six_lock_increment(&b->c.lock, (enum six_lock_type) t); -+ mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); -+ } -+ -+ bch2_btree_path_level_init(trans, path, b); -+ } -+ -+ bch2_trans_revalidate_updates_in_node(trans, b); -+} -+ -+/* -+ * A btree node has been modified in such a way as to invalidate iterators - fix -+ * them: -+ */ -+void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) -+{ -+ struct btree_path *path; -+ -+ trans_for_each_path_with_node(trans, b, path) -+ __btree_path_level_init(path, 
b->c.level); -+ -+ bch2_trans_revalidate_updates_in_node(trans, b); -+} -+ -+/* Btree path: traverse, set_pos: */ -+ -+static inline int btree_path_lock_root(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned depth_want, -+ unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; -+ enum six_lock_type lock_type; -+ unsigned i; -+ int ret; -+ -+ EBUG_ON(path->nodes_locked); -+ -+ while (1) { -+ b = READ_ONCE(*rootp); -+ path->level = READ_ONCE(b->c.level); -+ -+ if (unlikely(path->level < depth_want)) { -+ /* -+ * the root is at a lower depth than the depth we want: -+ * got to the end of the btree, or we're walking nodes -+ * greater than some depth and there are no nodes >= -+ * that depth -+ */ -+ path->level = depth_want; -+ for (i = path->level; i < BTREE_MAX_DEPTH; i++) -+ path->l[i].b = NULL; -+ return 1; -+ } -+ -+ lock_type = __btree_lock_want(path, path->level); -+ ret = btree_node_lock(trans, path, &b->c, -+ path->level, lock_type, trace_ip); -+ if (unlikely(ret)) { -+ if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) -+ continue; -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ return ret; -+ BUG(); -+ } -+ -+ if (likely(b == READ_ONCE(*rootp) && -+ b->c.level == path->level && -+ !race_fault())) { -+ for (i = 0; i < path->level; i++) -+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root); -+ path->l[path->level].b = b; -+ for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) -+ path->l[i].b = NULL; -+ -+ mark_btree_node_locked(trans, path, path->level, lock_type); -+ bch2_btree_path_level_init(trans, path, b); -+ return 0; -+ } -+ -+ six_unlock_type(&b->c.lock, lock_type); -+ } -+} -+ -+noinline -+static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_path_level *l = path_l(path); -+ struct btree_node_iter node_iter = l->iter; -+ struct bkey_packed *k; -+ struct bkey_buf tmp; -+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) -+ ? (path->level > 1 ? 0 : 2) -+ : (path->level > 1 ? 1 : 16); -+ bool was_locked = btree_node_locked(path, path->level); -+ int ret = 0; -+ -+ bch2_bkey_buf_init(&tmp); -+ -+ while (nr-- && !ret) { -+ if (!bch2_btree_node_relock(trans, path, path->level)) -+ break; -+ -+ bch2_btree_node_iter_advance(&node_iter, l->b); -+ k = bch2_btree_node_iter_peek(&node_iter, l->b); -+ if (!k) -+ break; -+ -+ bch2_bkey_buf_unpack(&tmp, c, l->b, k); -+ ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, -+ path->level - 1); -+ } -+ -+ if (!was_locked) -+ btree_node_unlock(trans, path, path->level); -+ -+ bch2_bkey_buf_exit(&tmp, c); -+ return ret; -+} -+ -+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, -+ struct btree_and_journal_iter *jiter) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k; -+ struct bkey_buf tmp; -+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) -+ ? (path->level > 1 ? 0 : 2) -+ : (path->level > 1 ? 
1 : 16); -+ bool was_locked = btree_node_locked(path, path->level); -+ int ret = 0; -+ -+ bch2_bkey_buf_init(&tmp); -+ -+ while (nr-- && !ret) { -+ if (!bch2_btree_node_relock(trans, path, path->level)) -+ break; -+ -+ bch2_btree_and_journal_iter_advance(jiter); -+ k = bch2_btree_and_journal_iter_peek(jiter); -+ if (!k.k) -+ break; -+ -+ bch2_bkey_buf_reassemble(&tmp, c, k); -+ ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, -+ path->level - 1); -+ } -+ -+ if (!was_locked) -+ btree_node_unlock(trans, path, path->level); -+ -+ bch2_bkey_buf_exit(&tmp, c); -+ return ret; -+} -+ -+static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned plevel, struct btree *b) -+{ -+ struct btree_path_level *l = &path->l[plevel]; -+ bool locked = btree_node_locked(path, plevel); -+ struct bkey_packed *k; -+ struct bch_btree_ptr_v2 *bp; -+ -+ if (!bch2_btree_node_relock(trans, path, plevel)) -+ return; -+ -+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); -+ -+ bp = (void *) bkeyp_val(&l->b->format, k); -+ bp->mem_ptr = (unsigned long)b; -+ -+ if (!locked) -+ btree_node_unlock(trans, path, plevel); -+} -+ -+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned flags, -+ struct bkey_buf *out) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_path_level *l = path_l(path); -+ struct btree_and_journal_iter jiter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); -+ -+ k = bch2_btree_and_journal_iter_peek(&jiter); -+ -+ bch2_bkey_buf_reassemble(out, c, k); -+ -+ if (flags & BTREE_ITER_PREFETCH) -+ ret = btree_path_prefetch_j(trans, path, &jiter); -+ -+ bch2_btree_and_journal_iter_exit(&jiter); -+ return ret; -+} -+ -+static __always_inline int btree_path_down(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned flags, -+ unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_path_level *l = path_l(path); -+ struct btree *b; -+ unsigned level = path->level - 1; -+ enum six_lock_type lock_type = __btree_lock_want(path, level); -+ struct bkey_buf tmp; -+ int ret; -+ -+ EBUG_ON(!btree_node_locked(path, path->level)); -+ -+ bch2_bkey_buf_init(&tmp); -+ -+ if (unlikely(trans->journal_replay_not_finished)) { -+ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); -+ if (ret) -+ goto err; -+ } else { -+ bch2_bkey_buf_unpack(&tmp, c, l->b, -+ bch2_btree_node_iter_peek(&l->iter, l->b)); -+ -+ if (flags & BTREE_ITER_PREFETCH) { -+ ret = btree_path_prefetch(trans, path); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); -+ ret = PTR_ERR_OR_ZERO(b); -+ if (unlikely(ret)) -+ goto err; -+ -+ if (likely(!trans->journal_replay_not_finished && -+ tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && -+ unlikely(b != btree_node_mem_ptr(tmp.k))) -+ btree_node_mem_ptr_set(trans, path, level + 1, b); -+ -+ if (btree_node_read_locked(path, level + 1)) -+ btree_node_unlock(trans, path, level + 1); -+ -+ mark_btree_node_locked(trans, path, level, lock_type); -+ path->level = level; -+ bch2_btree_path_level_init(trans, path, b); -+ -+ bch2_btree_path_verify_locks(path); -+err: -+ bch2_bkey_buf_exit(&tmp, c); -+ return ret; -+} -+ -+ -+static int bch2_btree_path_traverse_all(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_path *path; -+ unsigned long 
trace_ip = _RET_IP_; -+ int i, ret = 0; -+ -+ if (trans->in_traverse_all) -+ return -BCH_ERR_transaction_restart_in_traverse_all; -+ -+ trans->in_traverse_all = true; -+retry_all: -+ trans->restarted = 0; -+ trans->last_restarted_ip = 0; -+ -+ trans_for_each_path(trans, path) -+ path->should_be_locked = false; -+ -+ btree_trans_sort_paths(trans); -+ -+ bch2_trans_unlock(trans); -+ cond_resched(); -+ -+ if (unlikely(trans->memory_allocation_failure)) { -+ struct closure cl; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ } -+ -+ /* Now, redo traversals in correct order: */ -+ i = 0; -+ while (i < trans->nr_sorted) { -+ path = trans->paths + trans->sorted[i]; -+ -+ /* -+ * Traversing a path can cause another path to be added at about -+ * the same position: -+ */ -+ if (path->uptodate) { -+ __btree_path_get(path, false); -+ ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_); -+ __btree_path_put(path, false); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || -+ bch2_err_matches(ret, ENOMEM)) -+ goto retry_all; -+ if (ret) -+ goto err; -+ } else { -+ i++; -+ } -+ } -+ -+ /* -+ * We used to assert that all paths had been traversed here -+ * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since -+ * path->should_be_locked is not set yet, we might have unlocked and -+ * then failed to relock a path - that's fine. -+ */ -+err: -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ trans->in_traverse_all = false; -+ -+ trace_and_count(c, trans_traverse_all, trans, trace_ip); -+ return ret; -+} -+ -+static inline bool btree_path_check_pos_in_node(struct btree_path *path, -+ unsigned l, int check_pos) -+{ -+ if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) -+ return false; -+ if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) -+ return false; -+ return true; -+} -+ -+static inline bool btree_path_good_node(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned l, int check_pos) -+{ -+ return is_btree_node(path, l) && -+ bch2_btree_node_relock(trans, path, l) && -+ btree_path_check_pos_in_node(path, l, check_pos); -+} -+ -+static void btree_path_set_level_down(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned new_level) -+{ -+ unsigned l; -+ -+ path->level = new_level; -+ -+ for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) -+ if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(trans, path, l); -+ -+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); -+ bch2_btree_path_verify(trans, path); -+} -+ -+static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans, -+ struct btree_path *path, -+ int check_pos) -+{ -+ unsigned i, l = path->level; -+again: -+ while (btree_path_node(path, l) && -+ !btree_path_good_node(trans, path, l, check_pos)) -+ __btree_path_set_level_up(trans, path, l++); -+ -+ /* If we need intent locks, take them too: */ -+ for (i = l + 1; -+ i < path->locks_want && btree_path_node(path, i); -+ i++) -+ if (!bch2_btree_node_relock(trans, path, i)) { -+ while (l <= i) -+ __btree_path_set_level_up(trans, path, l++); -+ goto again; -+ } -+ -+ return l; -+} -+ -+static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, -+ struct btree_path *path, -+ int check_pos) -+{ -+ return likely(btree_node_locked(path, path->level) && -+ btree_path_check_pos_in_node(path, path->level, check_pos)) -+ ? 
path->level -+ : __btree_path_up_until_good_node(trans, path, check_pos); -+} -+ -+/* -+ * This is the main state machine for walking down the btree - walks down to a -+ * specified depth -+ * -+ * Returns 0 on success, -EIO on error (error reading in a btree node). -+ * -+ * On error, caller (peek_node()/peek_key()) must return NULL; the error is -+ * stashed in the iterator and returned from bch2_trans_exit(). -+ */ -+int bch2_btree_path_traverse_one(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned flags, -+ unsigned long trace_ip) -+{ -+ unsigned depth_want = path->level; -+ int ret = -((int) trans->restarted); -+ -+ if (unlikely(ret)) -+ goto out; -+ -+ /* -+ * Ensure we obey path->should_be_locked: if it's set, we can't unlock -+ * and re-traverse the path without a transaction restart: -+ */ -+ if (path->should_be_locked) { -+ ret = bch2_btree_path_relock(trans, path, trace_ip); -+ goto out; -+ } -+ -+ if (path->cached) { -+ ret = bch2_btree_path_traverse_cached(trans, path, flags); -+ goto out; -+ } -+ -+ if (unlikely(path->level >= BTREE_MAX_DEPTH)) -+ goto out; -+ -+ path->level = btree_path_up_until_good_node(trans, path, 0); -+ -+ EBUG_ON(btree_path_node(path, path->level) && -+ !btree_node_locked(path, path->level)); -+ -+ /* -+ * Note: path->nodes[path->level] may be temporarily NULL here - that -+ * would indicate to other code that we got to the end of the btree, -+ * here it indicates that relocking the root failed - it's critical that -+ * btree_path_lock_root() comes next and that it can't fail -+ */ -+ while (path->level > depth_want) { -+ ret = btree_path_node(path, path->level) -+ ? btree_path_down(trans, path, flags, trace_ip) -+ : btree_path_lock_root(trans, path, depth_want, trace_ip); -+ if (unlikely(ret)) { -+ if (ret == 1) { -+ /* -+ * No nodes at this level - got to the end of -+ * the btree: -+ */ -+ ret = 0; -+ goto out; -+ } -+ -+ __bch2_btree_path_unlock(trans, path); -+ path->level = depth_want; -+ path->l[path->level].b = ERR_PTR(ret); -+ goto out; -+ } -+ } -+ -+ path->uptodate = BTREE_ITER_UPTODATE; -+out: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) -+ panic("ret %s (%i) trans->restarted %s (%i)\n", -+ bch2_err_str(ret), ret, -+ bch2_err_str(trans->restarted), trans->restarted); -+ bch2_btree_path_verify(trans, path); -+ return ret; -+} -+ -+static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, -+ struct btree_path *src) -+{ -+ unsigned i, offset = offsetof(struct btree_path, pos); -+ -+ memcpy((void *) dst + offset, -+ (void *) src + offset, -+ sizeof(struct btree_path) - offset); -+ -+ for (i = 0; i < BTREE_MAX_DEPTH; i++) { -+ unsigned t = btree_node_locked_type(dst, i); -+ -+ if (t != BTREE_NODE_UNLOCKED) -+ six_lock_increment(&dst->l[i].b->c.lock, t); -+ } -+} -+ -+static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, -+ bool intent) -+{ -+ struct btree_path *new = btree_path_alloc(trans, src); -+ -+ btree_path_copy(trans, new, src); -+ __btree_path_get(new, intent); -+ return new; -+} -+ -+__flatten -+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, -+ struct btree_path *path, bool intent, -+ unsigned long ip) -+{ -+ __btree_path_put(path, intent); -+ path = btree_path_clone(trans, path, intent); -+ path->preserve = false; -+ return path; -+} -+ -+struct btree_path * __must_check -+__bch2_btree_path_set_pos(struct btree_trans *trans, -+ struct btree_path *path, struct bpos new_pos, -+ bool intent, 
unsigned long ip, int cmp) -+{ -+ unsigned level = path->level; -+ -+ bch2_trans_verify_not_in_restart(trans); -+ EBUG_ON(!path->ref); -+ -+ path = bch2_btree_path_make_mut(trans, path, intent, ip); -+ -+ path->pos = new_pos; -+ trans->paths_sorted = false; -+ -+ if (unlikely(path->cached)) { -+ btree_node_unlock(trans, path, 0); -+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); -+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); -+ goto out; -+ } -+ -+ level = btree_path_up_until_good_node(trans, path, cmp); -+ -+ if (btree_path_node(path, level)) { -+ struct btree_path_level *l = &path->l[level]; -+ -+ BUG_ON(!btree_node_locked(path, level)); -+ /* -+ * We might have to skip over many keys, or just a few: try -+ * advancing the node iterator, and if we have to skip over too -+ * many keys just reinit it (or if we're rewinding, since that -+ * is expensive). -+ */ -+ if (cmp < 0 || -+ !btree_path_advance_to_pos(path, l, 8)) -+ bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); -+ -+ /* -+ * Iterators to interior nodes should always be pointed at the first non -+ * whiteout: -+ */ -+ if (unlikely(level)) -+ bch2_btree_node_iter_peek(&l->iter, l->b); -+ } -+ -+ if (unlikely(level != path->level)) { -+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); -+ __bch2_btree_path_unlock(trans, path); -+ } -+out: -+ bch2_btree_path_verify(trans, path); -+ return path; -+} -+ -+/* Btree path: main interface: */ -+ -+static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) -+{ -+ struct btree_path *sib; -+ -+ sib = prev_btree_path(trans, path); -+ if (sib && !btree_path_cmp(sib, path)) -+ return sib; -+ -+ sib = next_btree_path(trans, path); -+ if (sib && !btree_path_cmp(sib, path)) -+ return sib; -+ -+ return NULL; -+} -+ -+static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) -+{ -+ struct btree_path *sib; -+ -+ sib = prev_btree_path(trans, path); -+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) -+ return sib; -+ -+ sib = next_btree_path(trans, path); -+ if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) -+ return sib; -+ -+ return NULL; -+} -+ -+static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) -+{ -+ __bch2_btree_path_unlock(trans, path); -+ btree_path_list_remove(trans, path); -+ trans->paths_allocated &= ~(1ULL << path->idx); -+} -+ -+void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) -+{ -+ struct btree_path *dup; -+ -+ EBUG_ON(trans->paths + path->idx != path); -+ EBUG_ON(!path->ref); -+ -+ if (!__btree_path_put(path, intent)) -+ return; -+ -+ dup = path->preserve -+ ? 
have_path_at_pos(trans, path) -+ : have_node_at_pos(trans, path); -+ -+ if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) -+ return; -+ -+ if (path->should_be_locked && -+ !trans->restarted && -+ (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) -+ return; -+ -+ if (dup) { -+ dup->preserve |= path->preserve; -+ dup->should_be_locked |= path->should_be_locked; -+ } -+ -+ __bch2_path_free(trans, path); -+} -+ -+static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, -+ bool intent) -+{ -+ EBUG_ON(trans->paths + path->idx != path); -+ EBUG_ON(!path->ref); -+ -+ if (!__btree_path_put(path, intent)) -+ return; -+ -+ __bch2_path_free(trans, path); -+} -+ -+void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) -+{ -+ panic("trans->restart_count %u, should be %u, last restarted by %pS\n", -+ trans->restart_count, restart_count, -+ (void *) trans->last_begin_ip); -+} -+ -+void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans) -+{ -+ panic("in transaction restart: %s, last restarted by %pS\n", -+ bch2_err_str(trans->restarted), -+ (void *) trans->last_restarted_ip); -+} -+ -+noinline __cold -+void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i; -+ struct btree_write_buffered_key *wb; -+ -+ prt_printf(buf, "transaction updates for %s journal seq %llu", -+ trans->fn, trans->journal_res.seq); -+ prt_newline(buf); -+ printbuf_indent_add(buf, 2); -+ -+ trans_for_each_update(trans, i) { -+ struct bkey_s_c old = { &i->old_k, i->old_v }; -+ -+ prt_printf(buf, "update: btree=%s cached=%u %pS", -+ bch2_btree_ids[i->btree_id], -+ i->cached, -+ (void *) i->ip_allocated); -+ prt_newline(buf); -+ -+ prt_printf(buf, " old "); -+ bch2_bkey_val_to_text(buf, trans->c, old); -+ prt_newline(buf); -+ -+ prt_printf(buf, " new "); -+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); -+ prt_newline(buf); -+ } -+ -+ trans_for_each_wb_update(trans, wb) { -+ prt_printf(buf, "update: btree=%s wb=1 %pS", -+ bch2_btree_ids[wb->btree], -+ (void *) i->ip_allocated); -+ prt_newline(buf); -+ -+ prt_printf(buf, " new "); -+ bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k)); -+ prt_newline(buf); -+ } -+ -+ printbuf_indent_sub(buf, 2); -+} -+ -+noinline __cold -+void bch2_dump_trans_updates(struct btree_trans *trans) -+{ -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_trans_updates_to_text(&buf, trans); -+ bch2_print_string_as_lines(KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+} -+ -+noinline __cold -+void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) -+{ -+ prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", -+ path->idx, path->ref, path->intent_ref, -+ path->preserve ? 'P' : ' ', -+ path->should_be_locked ? 
'S' : ' ', -+ bch2_btree_ids[path->btree_id], -+ path->level); -+ bch2_bpos_to_text(out, path->pos); -+ -+ prt_printf(out, " locks %u", path->nodes_locked); -+#ifdef TRACK_PATH_ALLOCATED -+ prt_printf(out, " %pS", (void *) path->ip_allocated); -+#endif -+ prt_newline(out); -+} -+ -+static noinline __cold -+void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, -+ bool nosort) -+{ -+ struct btree_path *path; -+ unsigned idx; -+ -+ if (!nosort) -+ btree_trans_sort_paths(trans); -+ -+ trans_for_each_path_inorder(trans, path, idx) -+ bch2_btree_path_to_text(out, path); -+} -+ -+noinline __cold -+void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) -+{ -+ __bch2_trans_paths_to_text(out, trans, false); -+} -+ -+static noinline __cold -+void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) -+{ -+ struct printbuf buf = PRINTBUF; -+ -+ __bch2_trans_paths_to_text(&buf, trans, nosort); -+ bch2_trans_updates_to_text(&buf, trans); -+ -+ bch2_print_string_as_lines(KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+} -+ -+noinline __cold -+void bch2_dump_trans_paths_updates(struct btree_trans *trans) -+{ -+ __bch2_dump_trans_paths_updates(trans, false); -+} -+ -+noinline __cold -+static void bch2_trans_update_max_paths(struct btree_trans *trans) -+{ -+ struct btree_transaction_stats *s = btree_trans_stats(trans); -+ struct printbuf buf = PRINTBUF; -+ -+ if (!s) -+ return; -+ -+ bch2_trans_paths_to_text(&buf, trans); -+ -+ if (!buf.allocation_failure) { -+ mutex_lock(&s->lock); -+ if (s->nr_max_paths < hweight64(trans->paths_allocated)) { -+ s->nr_max_paths = trans->nr_max_paths = -+ hweight64(trans->paths_allocated); -+ swap(s->max_paths_text, buf.buf); -+ } -+ mutex_unlock(&s->lock); -+ } -+ -+ printbuf_exit(&buf); -+ -+ trans->nr_max_paths = hweight64(trans->paths_allocated); -+} -+ -+static noinline void btree_path_overflow(struct btree_trans *trans) -+{ -+ bch2_dump_trans_paths_updates(trans); -+ panic("trans path oveflow\n"); -+} -+ -+static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, -+ struct btree_path *pos) -+{ -+ struct btree_path *path; -+ unsigned idx; -+ -+ if (unlikely(trans->paths_allocated == -+ ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) -+ btree_path_overflow(trans); -+ -+ idx = __ffs64(~trans->paths_allocated); -+ -+ /* -+ * Do this before marking the new path as allocated, since it won't be -+ * initialized yet: -+ */ -+ if (unlikely(idx > trans->nr_max_paths)) -+ bch2_trans_update_max_paths(trans); -+ -+ trans->paths_allocated |= 1ULL << idx; -+ -+ path = &trans->paths[idx]; -+ path->idx = idx; -+ path->ref = 0; -+ path->intent_ref = 0; -+ path->nodes_locked = 0; -+ -+ btree_path_list_add(trans, pos, path); -+ trans->paths_sorted = false; -+ return path; -+} -+ -+struct btree_path *bch2_path_get(struct btree_trans *trans, -+ enum btree_id btree_id, struct bpos pos, -+ unsigned locks_want, unsigned level, -+ unsigned flags, unsigned long ip) -+{ -+ struct btree_path *path, *path_pos = NULL; -+ bool cached = flags & BTREE_ITER_CACHED; -+ bool intent = flags & BTREE_ITER_INTENT; -+ int i; -+ -+ bch2_trans_verify_not_in_restart(trans); -+ bch2_trans_verify_locks(trans); -+ -+ btree_trans_sort_paths(trans); -+ -+ trans_for_each_path_inorder(trans, path, i) { -+ if (__btree_path_cmp(path, -+ btree_id, -+ cached, -+ pos, -+ level) > 0) -+ break; -+ -+ path_pos = path; -+ } -+ -+ if (path_pos && -+ path_pos->cached == cached && -+ path_pos->btree_id == btree_id && -+ path_pos->level == level) { -+ 
__btree_path_get(path_pos, intent); -+ path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); -+ } else { -+ path = btree_path_alloc(trans, path_pos); -+ path_pos = NULL; -+ -+ __btree_path_get(path, intent); -+ path->pos = pos; -+ path->btree_id = btree_id; -+ path->cached = cached; -+ path->uptodate = BTREE_ITER_NEED_TRAVERSE; -+ path->should_be_locked = false; -+ path->level = level; -+ path->locks_want = locks_want; -+ path->nodes_locked = 0; -+ for (i = 0; i < ARRAY_SIZE(path->l); i++) -+ path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); -+#ifdef TRACK_PATH_ALLOCATED -+ path->ip_allocated = ip; -+#endif -+ trans->paths_sorted = false; -+ } -+ -+ if (!(flags & BTREE_ITER_NOPRESERVE)) -+ path->preserve = true; -+ -+ if (path->intent_ref) -+ locks_want = max(locks_want, level + 1); -+ -+ /* -+ * If the path has locks_want greater than requested, we don't downgrade -+ * it here - on transaction restart because btree node split needs to -+ * upgrade locks, we might be putting/getting the iterator again. -+ * Downgrading iterators only happens via bch2_trans_downgrade(), after -+ * a successful transaction commit. -+ */ -+ -+ locks_want = min(locks_want, BTREE_MAX_DEPTH); -+ if (locks_want > path->locks_want) -+ bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want); -+ -+ return path; -+} -+ -+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) -+{ -+ -+ struct btree_path_level *l = path_l(path); -+ struct bkey_packed *_k; -+ struct bkey_s_c k; -+ -+ if (unlikely(!l->b)) -+ return bkey_s_c_null; -+ -+ EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); -+ EBUG_ON(!btree_node_locked(path, path->level)); -+ -+ if (!path->cached) { -+ _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; -+ -+ EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos)); -+ -+ if (!k.k || !bpos_eq(path->pos, k.k->p)) -+ goto hole; -+ } else { -+ struct bkey_cached *ck = (void *) path->l[0].b; -+ -+ EBUG_ON(ck && -+ (path->btree_id != ck->key.btree_id || -+ !bkey_eq(path->pos, ck->key.pos))); -+ if (!ck || !ck->valid) -+ return bkey_s_c_null; -+ -+ *u = ck->k->k; -+ k = bkey_i_to_s_c(ck->k); -+ } -+ -+ return k; -+hole: -+ bkey_init(u); -+ u->p = path->pos; -+ return (struct bkey_s_c) { u, NULL }; -+} -+ -+/* Btree iterators: */ -+ -+int __must_check -+__bch2_btree_iter_traverse(struct btree_iter *iter) -+{ -+ return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); -+} -+ -+int __must_check -+bch2_btree_iter_traverse(struct btree_iter *iter) -+{ -+ int ret; -+ -+ iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, -+ btree_iter_search_key(iter), -+ iter->flags & BTREE_ITER_INTENT, -+ btree_iter_ip_allocated(iter)); -+ -+ ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); -+ if (ret) -+ return ret; -+ -+ btree_path_set_should_be_locked(iter->path); -+ return 0; -+} -+ -+/* Iterate across nodes (leaf and interior nodes) */ -+ -+struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree *b = NULL; -+ int ret; -+ -+ EBUG_ON(iter->path->cached); -+ bch2_btree_iter_verify(iter); -+ -+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); -+ if (ret) -+ goto err; -+ -+ b = btree_path_node(iter->path, iter->path->level); -+ if (!b) -+ goto out; -+ -+ BUG_ON(bpos_lt(b->key.k.p, iter->pos)); -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = b->key.k.p; -+ -+ iter->path = 
bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, -+ iter->flags & BTREE_ITER_INTENT, -+ btree_iter_ip_allocated(iter)); -+ btree_path_set_should_be_locked(iter->path); -+out: -+ bch2_btree_iter_verify_entry_exit(iter); -+ bch2_btree_iter_verify(iter); -+ -+ return b; -+err: -+ b = ERR_PTR(ret); -+ goto out; -+} -+ -+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) -+{ -+ struct btree *b; -+ -+ while (b = bch2_btree_iter_peek_node(iter), -+ bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) -+ bch2_trans_begin(iter->trans); -+ -+ return b; -+} -+ -+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree_path *path = iter->path; -+ struct btree *b = NULL; -+ int ret; -+ -+ bch2_trans_verify_not_in_restart(trans); -+ EBUG_ON(iter->path->cached); -+ bch2_btree_iter_verify(iter); -+ -+ /* already at end? */ -+ if (!btree_path_node(path, path->level)) -+ return NULL; -+ -+ /* got to end? */ -+ if (!btree_path_node(path, path->level + 1)) { -+ btree_path_set_level_up(trans, path); -+ return NULL; -+ } -+ -+ if (!bch2_btree_node_relock(trans, path, path->level + 1)) { -+ __bch2_btree_path_unlock(trans, path); -+ path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); -+ path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); -+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); -+ trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); -+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); -+ goto err; -+ } -+ -+ b = btree_path_node(path, path->level + 1); -+ -+ if (bpos_eq(iter->pos, b->key.k.p)) { -+ __btree_path_set_level_up(trans, path, path->level++); -+ } else { -+ /* -+ * Haven't gotten to the end of the parent node: go back down to -+ * the next child node -+ */ -+ path = iter->path = -+ bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), -+ iter->flags & BTREE_ITER_INTENT, -+ btree_iter_ip_allocated(iter)); -+ -+ btree_path_set_level_down(trans, path, iter->min_depth); -+ -+ ret = bch2_btree_path_traverse(trans, path, iter->flags); -+ if (ret) -+ goto err; -+ -+ b = path->l[path->level].b; -+ } -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = b->key.k.p; -+ -+ iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, -+ iter->flags & BTREE_ITER_INTENT, -+ btree_iter_ip_allocated(iter)); -+ btree_path_set_should_be_locked(iter->path); -+ BUG_ON(iter->path->uptodate); -+out: -+ bch2_btree_iter_verify_entry_exit(iter); -+ bch2_btree_iter_verify(iter); -+ -+ return b; -+err: -+ b = ERR_PTR(ret); -+ goto out; -+} -+ -+/* Iterate across keys (in leaf nodes only) */ -+ -+inline bool bch2_btree_iter_advance(struct btree_iter *iter) -+{ -+ if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { -+ struct bpos pos = iter->k.p; -+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS -+ ? bpos_eq(pos, SPOS_MAX) -+ : bkey_eq(pos, SPOS_MAX)); -+ -+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) -+ pos = bkey_successor(iter, pos); -+ bch2_btree_iter_set_pos(iter, pos); -+ return ret; -+ } else { -+ if (!btree_path_node(iter->path, iter->path->level)) -+ return true; -+ -+ iter->advanced = true; -+ return false; -+ } -+} -+ -+inline bool bch2_btree_iter_rewind(struct btree_iter *iter) -+{ -+ struct bpos pos = bkey_start_pos(&iter->k); -+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS -+ ? 
bpos_eq(pos, POS_MIN) -+ : bkey_eq(pos, POS_MIN)); -+ -+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) -+ pos = bkey_predecessor(iter, pos); -+ bch2_btree_iter_set_pos(iter, pos); -+ return ret; -+} -+ -+static noinline -+struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) -+{ -+ struct btree_insert_entry *i; -+ struct bkey_i *ret = NULL; -+ -+ trans_for_each_update(iter->trans, i) { -+ if (i->btree_id < iter->btree_id) -+ continue; -+ if (i->btree_id > iter->btree_id) -+ break; -+ if (bpos_lt(i->k->k.p, iter->path->pos)) -+ continue; -+ if (i->key_cache_already_flushed) -+ continue; -+ if (!ret || bpos_lt(i->k->k.p, ret->k.p)) -+ ret = i->k; -+ } -+ -+ return ret; -+} -+ -+static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) -+{ -+ return iter->flags & BTREE_ITER_WITH_UPDATES -+ ? __bch2_btree_trans_peek_updates(iter) -+ : NULL; -+} -+ -+static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos end_pos) -+{ -+ struct bkey_i *k; -+ -+ if (bpos_lt(iter->path->pos, iter->journal_pos)) -+ iter->journal_idx = 0; -+ -+ k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, -+ iter->path->level, -+ iter->path->pos, -+ end_pos, -+ &iter->journal_idx); -+ -+ iter->journal_pos = k ? k->k.p : end_pos; -+ return k; -+} -+ -+static noinline -+struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, -+ struct btree_iter *iter) -+{ -+ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); -+ -+ if (k) { -+ iter->k = k->k; -+ return bkey_i_to_s_c(k); -+ } else { -+ return bkey_s_c_null; -+ } -+} -+ -+static noinline -+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bkey_i *next_journal = -+ bch2_btree_journal_peek(trans, iter, -+ k.k ? 
k.k->p : path_l(iter->path)->b->key.k.p); -+ -+ if (next_journal) { -+ iter->k = next_journal->k; -+ k = bkey_i_to_s_c(next_journal); -+ } -+ -+ return k; -+} -+ -+/* -+ * Checks btree key cache for key at iter->pos and returns it if present, or -+ * bkey_s_c_null: -+ */ -+static noinline -+struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct bch_fs *c = trans->c; -+ struct bkey u; -+ struct bkey_s_c k; -+ int ret; -+ -+ if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && -+ bpos_eq(iter->pos, pos)) -+ return bkey_s_c_null; -+ -+ if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) -+ return bkey_s_c_null; -+ -+ if (!iter->key_cache_path) -+ iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, -+ iter->flags & BTREE_ITER_INTENT, 0, -+ iter->flags|BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL, -+ _THIS_IP_); -+ -+ iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, -+ iter->flags & BTREE_ITER_INTENT, -+ btree_iter_ip_allocated(iter)); -+ -+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, -+ iter->flags|BTREE_ITER_CACHED) ?: -+ bch2_btree_path_relock(trans, iter->path, _THIS_IP_); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ btree_path_set_should_be_locked(iter->key_cache_path); -+ -+ k = bch2_btree_path_peek_slot(iter->key_cache_path, &u); -+ if (k.k && !bkey_err(k)) { -+ iter->k = u; -+ k.k = &iter->k; -+ } -+ return k; -+} -+ -+static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct bkey_i *next_update; -+ struct bkey_s_c k, k2; -+ int ret; -+ -+ EBUG_ON(iter->path->cached); -+ bch2_btree_iter_verify(iter); -+ -+ while (1) { -+ struct btree_path_level *l; -+ -+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, -+ iter->flags & BTREE_ITER_INTENT, -+ btree_iter_ip_allocated(iter)); -+ -+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); -+ if (unlikely(ret)) { -+ /* ensure that iter->k is consistent with iter->pos: */ -+ bch2_btree_iter_set_pos(iter, iter->pos); -+ k = bkey_s_c_err(ret); -+ goto out; -+ } -+ -+ l = path_l(iter->path); -+ -+ if (unlikely(!l->b)) { -+ /* No btree nodes at requested level: */ -+ bch2_btree_iter_set_pos(iter, SPOS_MAX); -+ k = bkey_s_c_null; -+ goto out; -+ } -+ -+ btree_path_set_should_be_locked(iter->path); -+ -+ k = btree_path_level_peek_all(trans->c, l, &iter->k); -+ -+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && -+ k.k && -+ (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { -+ k = k2; -+ ret = bkey_err(k); -+ if (ret) { -+ bch2_btree_iter_set_pos(iter, iter->pos); -+ goto out; -+ } -+ } -+ -+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) -+ k = btree_trans_peek_journal(trans, iter, k); -+ -+ next_update = btree_trans_peek_updates(iter); -+ -+ if (next_update && -+ bpos_le(next_update->k.p, -+ k.k ? k.k->p : l->b->key.k.p)) { -+ iter->k = next_update->k; -+ k = bkey_i_to_s_c(next_update); -+ } -+ -+ if (k.k && bkey_deleted(k.k)) { -+ /* -+ * If we've got a whiteout, and it's after the search -+ * key, advance the search key to the whiteout instead -+ * of just after the whiteout - it might be a btree -+ * whiteout, with a real key at the same position, since -+ * in the btree deleted keys sort before non deleted. -+ */ -+ search_key = !bpos_eq(search_key, k.k->p) -+ ? 
k.k->p -+ : bpos_successor(k.k->p); -+ continue; -+ } -+ -+ if (likely(k.k)) { -+ break; -+ } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) { -+ /* Advance to next leaf node: */ -+ search_key = bpos_successor(l->b->key.k.p); -+ } else { -+ /* End of btree: */ -+ bch2_btree_iter_set_pos(iter, SPOS_MAX); -+ k = bkey_s_c_null; -+ goto out; -+ } -+ } -+out: -+ bch2_btree_iter_verify(iter); -+ -+ return k; -+} -+ -+/** -+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's -+ * current position -+ */ -+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct bpos search_key = btree_iter_search_key(iter); -+ struct bkey_s_c k; -+ struct bpos iter_pos; -+ int ret; -+ -+ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); -+ EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); -+ -+ if (iter->update_path) { -+ bch2_path_put_nokeep(trans, iter->update_path, -+ iter->flags & BTREE_ITER_INTENT); -+ iter->update_path = NULL; -+ } -+ -+ bch2_btree_iter_verify_entry_exit(iter); -+ -+ while (1) { -+ k = __bch2_btree_iter_peek(iter, search_key); -+ if (unlikely(!k.k)) -+ goto end; -+ if (unlikely(bkey_err(k))) -+ goto out_no_locked; -+ -+ /* -+ * iter->pos should be mononotically increasing, and always be -+ * equal to the key we just returned - except extents can -+ * straddle iter->pos: -+ */ -+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) -+ iter_pos = k.k->p; -+ else -+ iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); -+ -+ if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? bkey_gt(iter_pos, end) -+ : bkey_ge(iter_pos, end))) -+ goto end; -+ -+ if (iter->update_path && -+ !bkey_eq(iter->update_path->pos, k.k->p)) { -+ bch2_path_put_nokeep(trans, iter->update_path, -+ iter->flags & BTREE_ITER_INTENT); -+ iter->update_path = NULL; -+ } -+ -+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && -+ (iter->flags & BTREE_ITER_INTENT) && -+ !(iter->flags & BTREE_ITER_IS_EXTENTS) && -+ !iter->update_path) { -+ struct bpos pos = k.k->p; -+ -+ if (pos.snapshot < iter->snapshot) { -+ search_key = bpos_successor(k.k->p); -+ continue; -+ } -+ -+ pos.snapshot = iter->snapshot; -+ -+ /* -+ * advance, same as on exit for iter->path, but only up -+ * to snapshot -+ */ -+ __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); -+ iter->update_path = iter->path; -+ -+ iter->update_path = bch2_btree_path_set_pos(trans, -+ iter->update_path, pos, -+ iter->flags & BTREE_ITER_INTENT, -+ _THIS_IP_); -+ ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); -+ if (unlikely(ret)) { -+ k = bkey_s_c_err(ret); -+ goto out_no_locked; -+ } -+ } -+ -+ /* -+ * We can never have a key in a leaf node at POS_MAX, so -+ * we don't have to check these successor() calls: -+ */ -+ if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && -+ !bch2_snapshot_is_ancestor(trans->c, -+ iter->snapshot, -+ k.k->p.snapshot)) { -+ search_key = bpos_successor(k.k->p); -+ continue; -+ } -+ -+ if (bkey_whiteout(k.k) && -+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { -+ search_key = bkey_successor(iter, k.k->p); -+ continue; -+ } -+ -+ break; -+ } -+ -+ iter->pos = iter_pos; -+ -+ iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, -+ iter->flags & BTREE_ITER_INTENT, -+ btree_iter_ip_allocated(iter)); -+ -+ btree_path_set_should_be_locked(iter->path); -+out_no_locked: -+ if (iter->update_path) { -+ ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); -+ if (unlikely(ret)) 
-+ k = bkey_s_c_err(ret); -+ else -+ btree_path_set_should_be_locked(iter->update_path); -+ } -+ -+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) -+ iter->pos.snapshot = iter->snapshot; -+ -+ ret = bch2_btree_iter_verify_ret(iter, k); -+ if (unlikely(ret)) { -+ bch2_btree_iter_set_pos(iter, iter->pos); -+ k = bkey_s_c_err(ret); -+ } -+ -+ bch2_btree_iter_verify_entry_exit(iter); -+ -+ return k; -+end: -+ bch2_btree_iter_set_pos(iter, end); -+ k = bkey_s_c_null; -+ goto out_no_locked; -+} -+ -+/** -+ * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal -+ * to iterator's current position, returning keys from every level of the btree. -+ * For keys at different levels of the btree that compare equal, the key from -+ * the lower level (leaf) is returned first. -+ */ -+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(iter->path->cached); -+ bch2_btree_iter_verify(iter); -+ BUG_ON(iter->path->level < iter->min_depth); -+ BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); -+ EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); -+ -+ while (1) { -+ iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, -+ iter->flags & BTREE_ITER_INTENT, -+ btree_iter_ip_allocated(iter)); -+ -+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); -+ if (unlikely(ret)) { -+ /* ensure that iter->k is consistent with iter->pos: */ -+ bch2_btree_iter_set_pos(iter, iter->pos); -+ k = bkey_s_c_err(ret); -+ goto out_no_locked; -+ } -+ -+ /* Already at end? */ -+ if (!btree_path_node(iter->path, iter->path->level)) { -+ k = bkey_s_c_null; -+ goto out_no_locked; -+ } -+ -+ k = btree_path_level_peek_all(trans->c, -+ &iter->path->l[iter->path->level], &iter->k); -+ -+ /* Check if we should go up to the parent node: */ -+ if (!k.k || -+ (iter->advanced && -+ bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) { -+ iter->pos = path_l(iter->path)->b->key.k.p; -+ btree_path_set_level_up(trans, iter->path); -+ iter->advanced = false; -+ continue; -+ } -+ -+ /* -+ * Check if we should go back down to a leaf: -+ * If we're not in a leaf node, we only return the current key -+ * if it exactly matches iter->pos - otherwise we first have to -+ * go back to the leaf: -+ */ -+ if (iter->path->level != iter->min_depth && -+ (iter->advanced || -+ !k.k || -+ !bpos_eq(iter->pos, k.k->p))) { -+ btree_path_set_level_down(trans, iter->path, iter->min_depth); -+ iter->pos = bpos_successor(iter->pos); -+ iter->advanced = false; -+ continue; -+ } -+ -+ /* Check if we should go to the next key: */ -+ if (iter->path->level == iter->min_depth && -+ iter->advanced && -+ k.k && -+ bpos_eq(iter->pos, k.k->p)) { -+ iter->pos = bpos_successor(iter->pos); -+ iter->advanced = false; -+ continue; -+ } -+ -+ if (iter->advanced && -+ iter->path->level == iter->min_depth && -+ !bpos_eq(k.k->p, iter->pos)) -+ iter->advanced = false; -+ -+ BUG_ON(iter->advanced); -+ BUG_ON(!k.k); -+ break; -+ } -+ -+ iter->pos = k.k->p; -+ btree_path_set_should_be_locked(iter->path); -+out_no_locked: -+ bch2_btree_iter_verify(iter); -+ -+ return k; -+} -+ -+/** -+ * bch2_btree_iter_next: returns first key greater than iterator's current -+ * position -+ */ -+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) -+{ -+ if (!bch2_btree_iter_advance(iter)) -+ return bkey_s_c_null; -+ -+ return bch2_btree_iter_peek(iter); -+} -+ -+/** -+ * bch2_btree_iter_peek_prev: returns first key less than or equal to -+ * 
iterator's current position -+ */ -+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct bpos search_key = iter->pos; -+ struct btree_path *saved_path = NULL; -+ struct bkey_s_c k; -+ struct bkey saved_k; -+ const struct bch_val *saved_v; -+ int ret; -+ -+ EBUG_ON(iter->path->cached || iter->path->level); -+ EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); -+ -+ if (iter->flags & BTREE_ITER_WITH_JOURNAL) -+ return bkey_s_c_err(-EIO); -+ -+ bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify_entry_exit(iter); -+ -+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) -+ search_key.snapshot = U32_MAX; -+ -+ while (1) { -+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, -+ iter->flags & BTREE_ITER_INTENT, -+ btree_iter_ip_allocated(iter)); -+ -+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); -+ if (unlikely(ret)) { -+ /* ensure that iter->k is consistent with iter->pos: */ -+ bch2_btree_iter_set_pos(iter, iter->pos); -+ k = bkey_s_c_err(ret); -+ goto out_no_locked; -+ } -+ -+ k = btree_path_level_peek(trans, iter->path, -+ &iter->path->l[0], &iter->k); -+ if (!k.k || -+ ((iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? bpos_ge(bkey_start_pos(k.k), search_key) -+ : bpos_gt(k.k->p, search_key))) -+ k = btree_path_level_prev(trans, iter->path, -+ &iter->path->l[0], &iter->k); -+ -+ if (likely(k.k)) { -+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { -+ if (k.k->p.snapshot == iter->snapshot) -+ goto got_key; -+ -+ /* -+ * If we have a saved candidate, and we're no -+ * longer at the same _key_ (not pos), return -+ * that candidate -+ */ -+ if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { -+ bch2_path_put_nokeep(trans, iter->path, -+ iter->flags & BTREE_ITER_INTENT); -+ iter->path = saved_path; -+ saved_path = NULL; -+ iter->k = saved_k; -+ k.v = saved_v; -+ goto got_key; -+ } -+ -+ if (bch2_snapshot_is_ancestor(iter->trans->c, -+ iter->snapshot, -+ k.k->p.snapshot)) { -+ if (saved_path) -+ bch2_path_put_nokeep(trans, saved_path, -+ iter->flags & BTREE_ITER_INTENT); -+ saved_path = btree_path_clone(trans, iter->path, -+ iter->flags & BTREE_ITER_INTENT); -+ saved_k = *k.k; -+ saved_v = k.v; -+ } -+ -+ search_key = bpos_predecessor(k.k->p); -+ continue; -+ } -+got_key: -+ if (bkey_whiteout(k.k) && -+ !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { -+ search_key = bkey_predecessor(iter, k.k->p); -+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) -+ search_key.snapshot = U32_MAX; -+ continue; -+ } -+ -+ break; -+ } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) { -+ /* Advance to previous leaf node: */ -+ search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); -+ } else { -+ /* Start of btree: */ -+ bch2_btree_iter_set_pos(iter, POS_MIN); -+ k = bkey_s_c_null; -+ goto out_no_locked; -+ } -+ } -+ -+ EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); -+ -+ /* Extents can straddle iter->pos: */ -+ if (bkey_lt(k.k->p, iter->pos)) -+ iter->pos = k.k->p; -+ -+ if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) -+ iter->pos.snapshot = iter->snapshot; -+ -+ btree_path_set_should_be_locked(iter->path); -+out_no_locked: -+ if (saved_path) -+ bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); -+ -+ bch2_btree_iter_verify_entry_exit(iter); -+ bch2_btree_iter_verify(iter); -+ -+ return k; -+} -+ -+/** -+ * bch2_btree_iter_prev: returns first key less than iterator's current -+ * position -+ */ -+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) -+{ -+ if 
(!bch2_btree_iter_rewind(iter)) -+ return bkey_s_c_null; -+ -+ return bch2_btree_iter_peek_prev(iter); -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct bpos search_key; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_btree_iter_verify(iter); -+ bch2_btree_iter_verify_entry_exit(iter); -+ EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); -+ EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); -+ -+ /* extents can't span inode numbers: */ -+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && -+ unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { -+ if (iter->pos.inode == KEY_INODE_MAX) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); -+ } -+ -+ search_key = btree_iter_search_key(iter); -+ iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, -+ iter->flags & BTREE_ITER_INTENT, -+ btree_iter_ip_allocated(iter)); -+ -+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); -+ if (unlikely(ret)) { -+ k = bkey_s_c_err(ret); -+ goto out_no_locked; -+ } -+ -+ if ((iter->flags & BTREE_ITER_CACHED) || -+ !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { -+ struct bkey_i *next_update; -+ -+ if ((next_update = btree_trans_peek_updates(iter)) && -+ bpos_eq(next_update->k.p, iter->pos)) { -+ iter->k = next_update->k; -+ k = bkey_i_to_s_c(next_update); -+ goto out; -+ } -+ -+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && -+ (k = btree_trans_peek_slot_journal(trans, iter)).k) -+ goto out; -+ -+ if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && -+ (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { -+ if (!bkey_err(k)) -+ iter->k = *k.k; -+ /* We're not returning a key from iter->path: */ -+ goto out_no_locked; -+ } -+ -+ k = bch2_btree_path_peek_slot(iter->path, &iter->k); -+ if (unlikely(!k.k)) -+ goto out_no_locked; -+ } else { -+ struct bpos next; -+ struct bpos end = iter->pos; -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ end.offset = U64_MAX; -+ -+ EBUG_ON(iter->path->level); -+ -+ if (iter->flags & BTREE_ITER_INTENT) { -+ struct btree_iter iter2; -+ -+ bch2_trans_copy_iter(&iter2, iter); -+ k = bch2_btree_iter_peek_upto(&iter2, end); -+ -+ if (k.k && !bkey_err(k)) { -+ iter->k = iter2.k; -+ k.k = &iter->k; -+ } -+ bch2_trans_iter_exit(trans, &iter2); -+ } else { -+ struct bpos pos = iter->pos; -+ -+ k = bch2_btree_iter_peek_upto(iter, end); -+ if (unlikely(bkey_err(k))) -+ bch2_btree_iter_set_pos(iter, pos); -+ else -+ iter->pos = pos; -+ } -+ -+ if (unlikely(bkey_err(k))) -+ goto out_no_locked; -+ -+ next = k.k ? bkey_start_pos(k.k) : POS_MAX; -+ -+ if (bkey_lt(iter->pos, next)) { -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos; -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) { -+ bch2_key_resize(&iter->k, -+ min_t(u64, KEY_SIZE_MAX, -+ (next.inode == iter->pos.inode -+ ? 
next.offset -+ : KEY_OFFSET_MAX) - -+ iter->pos.offset)); -+ EBUG_ON(!iter->k.size); -+ } -+ -+ k = (struct bkey_s_c) { &iter->k, NULL }; -+ } -+ } -+out: -+ btree_path_set_should_be_locked(iter->path); -+out_no_locked: -+ bch2_btree_iter_verify_entry_exit(iter); -+ bch2_btree_iter_verify(iter); -+ ret = bch2_btree_iter_verify_ret(iter, k); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ return k; -+} -+ -+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) -+{ -+ if (!bch2_btree_iter_advance(iter)) -+ return bkey_s_c_null; -+ -+ return bch2_btree_iter_peek_slot(iter); -+} -+ -+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) -+{ -+ if (!bch2_btree_iter_rewind(iter)) -+ return bkey_s_c_null; -+ -+ return bch2_btree_iter_peek_slot(iter); -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) -+{ -+ struct bkey_s_c k; -+ -+ while (btree_trans_too_many_iters(iter->trans) || -+ (k = bch2_btree_iter_peek_type(iter, iter->flags), -+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) -+ bch2_trans_begin(iter->trans); -+ -+ return k; -+} -+ -+/* new transactional stuff: */ -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+static void btree_trans_verify_sorted_refs(struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ unsigned i; -+ -+ BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); -+ -+ trans_for_each_path(trans, path) { -+ BUG_ON(path->sorted_idx >= trans->nr_sorted); -+ BUG_ON(trans->sorted[path->sorted_idx] != path->idx); -+ } -+ -+ for (i = 0; i < trans->nr_sorted; i++) { -+ unsigned idx = trans->sorted[i]; -+ -+ EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); -+ BUG_ON(trans->paths[idx].sorted_idx != i); -+ } -+} -+ -+static void btree_trans_verify_sorted(struct btree_trans *trans) -+{ -+ struct btree_path *path, *prev = NULL; -+ unsigned i; -+ -+ if (!bch2_debug_check_iterators) -+ return; -+ -+ trans_for_each_path_inorder(trans, path, i) { -+ if (prev && btree_path_cmp(prev, path) > 0) { -+ __bch2_dump_trans_paths_updates(trans, true); -+ panic("trans paths out of order!\n"); -+ } -+ prev = path; -+ } -+} -+#else -+static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} -+static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} -+#endif -+ -+void __bch2_btree_trans_sort_paths(struct btree_trans *trans) -+{ -+ int i, l = 0, r = trans->nr_sorted, inc = 1; -+ bool swapped; -+ -+ btree_trans_verify_sorted_refs(trans); -+ -+ if (trans->paths_sorted) -+ goto out; -+ -+ /* -+ * Cocktail shaker sort: this is efficient because iterators will be -+ * mostly sorted. -+ */ -+ do { -+ swapped = false; -+ -+ for (i = inc > 0 ? 
l : r - 2; -+ i + 1 < r && i >= l; -+ i += inc) { -+ if (btree_path_cmp(trans->paths + trans->sorted[i], -+ trans->paths + trans->sorted[i + 1]) > 0) { -+ swap(trans->sorted[i], trans->sorted[i + 1]); -+ trans->paths[trans->sorted[i]].sorted_idx = i; -+ trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; -+ swapped = true; -+ } -+ } -+ -+ if (inc > 0) -+ --r; -+ else -+ l++; -+ inc = -inc; -+ } while (swapped); -+ -+ trans->paths_sorted = true; -+out: -+ btree_trans_verify_sorted(trans); -+} -+ -+static inline void btree_path_list_remove(struct btree_trans *trans, -+ struct btree_path *path) -+{ -+ unsigned i; -+ -+ EBUG_ON(path->sorted_idx >= trans->nr_sorted); -+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -+ trans->nr_sorted--; -+ memmove_u64s_down_small(trans->sorted + path->sorted_idx, -+ trans->sorted + path->sorted_idx + 1, -+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); -+#else -+ array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); -+#endif -+ for (i = path->sorted_idx; i < trans->nr_sorted; i++) -+ trans->paths[trans->sorted[i]].sorted_idx = i; -+ -+ path->sorted_idx = U8_MAX; -+} -+ -+static inline void btree_path_list_add(struct btree_trans *trans, -+ struct btree_path *pos, -+ struct btree_path *path) -+{ -+ unsigned i; -+ -+ path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; -+ -+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS -+ memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, -+ trans->sorted + path->sorted_idx, -+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); -+ trans->nr_sorted++; -+ trans->sorted[path->sorted_idx] = path->idx; -+#else -+ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); -+#endif -+ -+ for (i = path->sorted_idx; i < trans->nr_sorted; i++) -+ trans->paths[trans->sorted[i]].sorted_idx = i; -+ -+ btree_trans_verify_sorted_refs(trans); -+} -+ -+void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) -+{ -+ if (iter->update_path) -+ bch2_path_put_nokeep(trans, iter->update_path, -+ iter->flags & BTREE_ITER_INTENT); -+ if (iter->path) -+ bch2_path_put(trans, iter->path, -+ iter->flags & BTREE_ITER_INTENT); -+ if (iter->key_cache_path) -+ bch2_path_put(trans, iter->key_cache_path, -+ iter->flags & BTREE_ITER_INTENT); -+ iter->path = NULL; -+ iter->update_path = NULL; -+ iter->key_cache_path = NULL; -+} -+ -+void bch2_trans_iter_init_outlined(struct btree_trans *trans, -+ struct btree_iter *iter, -+ enum btree_id btree_id, struct bpos pos, -+ unsigned flags) -+{ -+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, -+ bch2_btree_iter_flags(trans, btree_id, flags), -+ _RET_IP_); -+} -+ -+void bch2_trans_node_iter_init(struct btree_trans *trans, -+ struct btree_iter *iter, -+ enum btree_id btree_id, -+ struct bpos pos, -+ unsigned locks_want, -+ unsigned depth, -+ unsigned flags) -+{ -+ flags |= BTREE_ITER_NOT_EXTENTS; -+ flags |= __BTREE_ITER_ALL_SNAPSHOTS; -+ flags |= BTREE_ITER_ALL_SNAPSHOTS; -+ -+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, -+ __bch2_btree_iter_flags(trans, btree_id, flags), -+ _RET_IP_); -+ -+ iter->min_depth = depth; -+ -+ BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); -+ BUG_ON(iter->path->level != depth); -+ BUG_ON(iter->min_depth != depth); -+} -+ -+void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) -+{ -+ *dst = *src; -+ if (src->path) -+ __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT); -+ if (src->update_path) -+ 
__btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT); -+ dst->key_cache_path = NULL; -+} -+ -+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -+{ -+ unsigned new_top = trans->mem_top + size; -+ size_t old_bytes = trans->mem_bytes; -+ size_t new_bytes = roundup_pow_of_two(new_top); -+ int ret; -+ void *new_mem; -+ void *p; -+ -+ trans->mem_max = max(trans->mem_max, new_top); -+ -+ WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); -+ -+ new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN); -+ if (unlikely(!new_mem)) { -+ bch2_trans_unlock(trans); -+ -+ new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL); -+ if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) { -+ new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL); -+ new_bytes = BTREE_TRANS_MEM_MAX; -+ kfree(trans->mem); -+ } -+ -+ if (!new_mem) -+ return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc); -+ -+ trans->mem = new_mem; -+ trans->mem_bytes = new_bytes; -+ -+ ret = bch2_trans_relock(trans); -+ if (ret) -+ return ERR_PTR(ret); -+ } -+ -+ trans->mem = new_mem; -+ trans->mem_bytes = new_bytes; -+ -+ if (old_bytes) { -+ trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); -+ return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); -+ } -+ -+ p = trans->mem + trans->mem_top; -+ trans->mem_top += size; -+ memset(p, 0, size); -+ return p; -+} -+ -+static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_path *path; -+ -+ trans_for_each_path(trans, path) -+ if (path->cached && !btree_node_locked(path, 0)) -+ path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset); -+ -+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); -+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -+ trans->srcu_lock_time = jiffies; -+} -+ -+/** -+ * bch2_trans_begin() - reset a transaction after a interrupted attempt -+ * @trans: transaction to reset -+ * -+ * While iterating over nodes or updating nodes a attempt to lock a btree node -+ * may return BCH_ERR_transaction_restart when the trylock fails. When this -+ * occurs bch2_trans_begin() should be called and the transaction retried. 
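-+ *
-+ * Illustrative sketch of the retry pattern (do_stuff() stands in for the
-+ * caller's btree work and is not part of this patch; trans is the caller's
-+ * struct btree_trans *, ret an int; most callers use the lockrestart_do()
-+ * helper from btree_iter.h rather than open-coding this):
-+ *
-+ *	do {
-+ *		bch2_trans_begin(trans);
-+ *		ret = do_stuff(trans);
-+ *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));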
-+ */ -+u32 bch2_trans_begin(struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ u64 now; -+ -+ bch2_trans_reset_updates(trans); -+ -+ trans->restart_count++; -+ trans->mem_top = 0; -+ -+ trans_for_each_path(trans, path) { -+ path->should_be_locked = false; -+ -+ /* -+ * If the transaction wasn't restarted, we're presuming to be -+ * doing something new: dont keep iterators excpt the ones that -+ * are in use - except for the subvolumes btree: -+ */ -+ if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes) -+ path->preserve = false; -+ -+ /* -+ * XXX: we probably shouldn't be doing this if the transaction -+ * was restarted, but currently we still overflow transaction -+ * iterators if we do that -+ */ -+ if (!path->ref && !path->preserve) -+ __bch2_path_free(trans, path); -+ else -+ path->preserve = false; -+ } -+ -+ now = local_clock(); -+ if (!trans->restarted && -+ (need_resched() || -+ now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) { -+ drop_locks_do(trans, (cond_resched(), 0)); -+ now = local_clock(); -+ } -+ trans->last_begin_time = now; -+ -+ if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10)))) -+ bch2_trans_reset_srcu_lock(trans); -+ -+ trans->last_begin_ip = _RET_IP_; -+ if (trans->restarted) { -+ bch2_btree_path_traverse_all(trans); -+ trans->notrace_relock_fail = false; -+ } -+ -+ return trans->restart_count; -+} -+ -+static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c) -+{ -+ size_t paths_bytes = sizeof(struct btree_path) * BTREE_ITER_MAX; -+ size_t updates_bytes = sizeof(struct btree_insert_entry) * BTREE_ITER_MAX; -+ void *p = NULL; -+ -+ BUG_ON(trans->used_mempool); -+ -+#ifdef __KERNEL__ -+ p = this_cpu_xchg(c->btree_paths_bufs->path, NULL); -+#endif -+ if (!p) { -+ p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS); -+ /* -+ * paths need to be zeroed, bch2_check_for_deadlock looks at -+ * paths in other threads -+ */ -+ memset(p, 0, paths_bytes); -+ } -+ -+ trans->paths = p; p += paths_bytes; -+ trans->updates = p; p += updates_bytes; -+} -+ -+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; -+ -+unsigned bch2_trans_get_fn_idx(const char *fn) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++) -+ if (!bch2_btree_transaction_fns[i] || -+ bch2_btree_transaction_fns[i] == fn) { -+ bch2_btree_transaction_fns[i] = fn; -+ return i; -+ } -+ -+ pr_warn_once("BCH_TRANSACTIONS_NR not big enough!"); -+ return i; -+} -+ -+void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx) -+ __acquires(&c->btree_trans_barrier) -+{ -+ struct btree_transaction_stats *s; -+ -+ bch2_assert_btree_nodes_not_locked(); -+ -+ memset(trans, 0, sizeof(*trans)); -+ trans->c = c; -+ trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns) -+ ? 
bch2_btree_transaction_fns[fn_idx] : NULL; -+ trans->last_begin_time = local_clock(); -+ trans->fn_idx = fn_idx; -+ trans->locking_wait.task = current; -+ trans->journal_replay_not_finished = -+ !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); -+ closure_init_stack(&trans->ref); -+ -+ bch2_trans_alloc_paths(trans, c); -+ -+ s = btree_trans_stats(trans); -+ if (s && s->max_mem) { -+ unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); -+ -+ trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); -+ -+ if (!unlikely(trans->mem)) { -+ trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); -+ trans->mem_bytes = BTREE_TRANS_MEM_MAX; -+ } else { -+ trans->mem_bytes = expected_mem_bytes; -+ } -+ } -+ -+ if (s) { -+ trans->nr_max_paths = s->nr_max_paths; -+ trans->wb_updates_size = s->wb_updates_size; -+ } -+ -+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -+ trans->srcu_lock_time = jiffies; -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { -+ struct btree_trans *pos; -+ -+ seqmutex_lock(&c->btree_trans_lock); -+ list_for_each_entry(pos, &c->btree_trans_list, list) { -+ /* -+ * We'd much prefer to be stricter here and completely -+ * disallow multiple btree_trans in the same thread - -+ * but the data move path calls bch2_write when we -+ * already have a btree_trans initialized. -+ */ -+ BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && -+ bch2_trans_locked(pos)); -+ -+ if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { -+ list_add_tail(&trans->list, &pos->list); -+ goto list_add_done; -+ } -+ } -+ list_add_tail(&trans->list, &c->btree_trans_list); -+list_add_done: -+ seqmutex_unlock(&c->btree_trans_lock); -+ } -+} -+ -+static void check_btree_paths_leaked(struct btree_trans *trans) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bch_fs *c = trans->c; -+ struct btree_path *path; -+ -+ trans_for_each_path(trans, path) -+ if (path->ref) -+ goto leaked; -+ return; -+leaked: -+ bch_err(c, "btree paths leaked from %s!", trans->fn); -+ trans_for_each_path(trans, path) -+ if (path->ref) -+ printk(KERN_ERR " btree %s %pS\n", -+ bch2_btree_ids[path->btree_id], -+ (void *) path->ip_allocated); -+ /* Be noisy about this: */ -+ bch2_fatal_error(c); -+#endif -+} -+ -+void bch2_trans_exit(struct btree_trans *trans) -+ __releases(&c->btree_trans_barrier) -+{ -+ struct btree_insert_entry *i; -+ struct bch_fs *c = trans->c; -+ struct btree_transaction_stats *s = btree_trans_stats(trans); -+ -+ bch2_trans_unlock(trans); -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { -+ seqmutex_lock(&c->btree_trans_lock); -+ list_del(&trans->list); -+ seqmutex_unlock(&c->btree_trans_lock); -+ } -+ -+ closure_sync(&trans->ref); -+ -+ if (s) -+ s->max_mem = max(s->max_mem, trans->mem_max); -+ -+ trans_for_each_update(trans, i) -+ __btree_path_put(i->path, true); -+ trans->nr_updates = 0; -+ -+ check_btree_paths_leaked(trans); -+ -+ srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); -+ -+ bch2_journal_preres_put(&c->journal, &trans->journal_preres); -+ -+ kfree(trans->extra_journal_entries.data); -+ -+ if (trans->fs_usage_deltas) { -+ if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == -+ REPLICAS_DELTA_LIST_MAX) -+ mempool_free(trans->fs_usage_deltas, -+ &c->replicas_delta_pool); -+ else -+ kfree(trans->fs_usage_deltas); -+ } -+ -+ if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) -+ mempool_free(trans->mem, &c->btree_trans_mem_pool); -+ else -+ kfree(trans->mem); -+ -+#ifdef __KERNEL__ -+ /* -+ * Userspace doesn't have a 
real percpu implementation: -+ */ -+ trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); -+#endif -+ -+ if (trans->paths) -+ mempool_free(trans->paths, &c->btree_paths_pool); -+ -+ trans->mem = (void *) 0x1; -+ trans->paths = (void *) 0x1; -+} -+ -+static void __maybe_unused -+bch2_btree_bkey_cached_common_to_text(struct printbuf *out, -+ struct btree_bkey_cached_common *b) -+{ -+ struct six_lock_count c = six_lock_counts(&b->lock); -+ struct task_struct *owner; -+ pid_t pid; -+ -+ rcu_read_lock(); -+ owner = READ_ONCE(b->lock.owner); -+ pid = owner ? owner->pid : 0; -+ rcu_read_unlock(); -+ -+ prt_tab(out); -+ prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', -+ b->level, bch2_btree_ids[b->btree_id]); -+ bch2_bpos_to_text(out, btree_node_pos(b)); -+ -+ prt_tab(out); -+ prt_printf(out, " locks %u:%u:%u held by pid %u", -+ c.n[0], c.n[1], c.n[2], pid); -+} -+ -+void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ struct btree_bkey_cached_common *b; -+ static char lock_types[] = { 'r', 'i', 'w' }; -+ unsigned l, idx; -+ -+ if (!out->nr_tabstops) { -+ printbuf_tabstop_push(out, 16); -+ printbuf_tabstop_push(out, 32); -+ } -+ -+ prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); -+ -+ trans_for_each_path_safe(trans, path, idx) { -+ if (!path->nodes_locked) -+ continue; -+ -+ prt_printf(out, " path %u %c l=%u %s:", -+ path->idx, -+ path->cached ? 'c' : 'b', -+ path->level, -+ bch2_btree_ids[path->btree_id]); -+ bch2_bpos_to_text(out, path->pos); -+ prt_newline(out); -+ -+ for (l = 0; l < BTREE_MAX_DEPTH; l++) { -+ if (btree_node_locked(path, l) && -+ !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { -+ prt_printf(out, " %c l=%u ", -+ lock_types[btree_node_locked_type(path, l)], l); -+ bch2_btree_bkey_cached_common_to_text(out, b); -+ prt_newline(out); -+ } -+ } -+ } -+ -+ b = READ_ONCE(trans->locking); -+ if (b) { -+ prt_printf(out, " blocked for %lluus on", -+ div_u64(local_clock() - trans->locking_wait.start_time, -+ 1000)); -+ prt_newline(out); -+ prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); -+ bch2_btree_bkey_cached_common_to_text(out, b); -+ prt_newline(out); -+ } -+} -+ -+void bch2_fs_btree_iter_exit(struct bch_fs *c) -+{ -+ struct btree_transaction_stats *s; -+ -+ for (s = c->btree_transaction_stats; -+ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); -+ s++) { -+ kfree(s->max_paths_text); -+ bch2_time_stats_exit(&s->lock_hold_times); -+ } -+ -+ if (c->btree_trans_barrier_initialized) -+ cleanup_srcu_struct(&c->btree_trans_barrier); -+ mempool_exit(&c->btree_trans_mem_pool); -+ mempool_exit(&c->btree_paths_pool); -+} -+ -+int bch2_fs_btree_iter_init(struct bch_fs *c) -+{ -+ struct btree_transaction_stats *s; -+ unsigned nr = BTREE_ITER_MAX; -+ int ret; -+ -+ for (s = c->btree_transaction_stats; -+ s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); -+ s++) { -+ bch2_time_stats_init(&s->lock_hold_times); -+ mutex_init(&s->lock); -+ } -+ -+ INIT_LIST_HEAD(&c->btree_trans_list); -+ seqmutex_init(&c->btree_trans_lock); -+ -+ ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, -+ sizeof(struct btree_path) * nr + -+ sizeof(struct btree_insert_entry) * nr) ?: -+ mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, -+ BTREE_TRANS_MEM_MAX) ?: -+ init_srcu_struct(&c->btree_trans_barrier); -+ if (!ret) -+ c->btree_trans_barrier_initialized = true; -+ return ret; -+} -diff --git a/fs/bcachefs/btree_iter.h 
b/fs/bcachefs/btree_iter.h -new file mode 100644 -index 000000000..4469b2e16 ---- /dev/null -+++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,940 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_ITER_H -+#define _BCACHEFS_BTREE_ITER_H -+ -+#include "bset.h" -+#include "btree_types.h" -+#include "trace.h" -+ -+static inline int __bkey_err(const struct bkey *k) -+{ -+ return PTR_ERR_OR_ZERO(k); -+} -+ -+#define bkey_err(_k) __bkey_err((_k).k) -+ -+static inline void __btree_path_get(struct btree_path *path, bool intent) -+{ -+ path->ref++; -+ path->intent_ref += intent; -+} -+ -+static inline bool __btree_path_put(struct btree_path *path, bool intent) -+{ -+ EBUG_ON(!path->ref); -+ EBUG_ON(!path->intent_ref && intent); -+ path->intent_ref -= intent; -+ return --path->ref == 0; -+} -+ -+static inline void btree_path_set_dirty(struct btree_path *path, -+ enum btree_path_uptodate u) -+{ -+ path->uptodate = max_t(unsigned, path->uptodate, u); -+} -+ -+static inline struct btree *btree_path_node(struct btree_path *path, -+ unsigned level) -+{ -+ return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL; -+} -+ -+static inline bool btree_node_lock_seq_matches(const struct btree_path *path, -+ const struct btree *b, unsigned level) -+{ -+ return path->l[level].lock_seq == six_lock_seq(&b->c.lock); -+} -+ -+static inline struct btree *btree_node_parent(struct btree_path *path, -+ struct btree *b) -+{ -+ return btree_path_node(path, b->c.level + 1); -+} -+ -+/* Iterate over paths within a transaction: */ -+ -+void __bch2_btree_trans_sort_paths(struct btree_trans *); -+ -+static inline void btree_trans_sort_paths(struct btree_trans *trans) -+{ -+ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && -+ trans->paths_sorted) -+ return; -+ __bch2_btree_trans_sort_paths(trans); -+} -+ -+static inline struct btree_path * -+__trans_next_path(struct btree_trans *trans, unsigned idx) -+{ -+ u64 l; -+ -+ if (idx == BTREE_ITER_MAX) -+ return NULL; -+ -+ l = trans->paths_allocated >> idx; -+ if (!l) -+ return NULL; -+ -+ idx += __ffs64(l); -+ EBUG_ON(idx >= BTREE_ITER_MAX); -+ EBUG_ON(trans->paths[idx].idx != idx); -+ return &trans->paths[idx]; -+} -+ -+#define trans_for_each_path_from(_trans, _path, _start) \ -+ for (_path = __trans_next_path((_trans), _start); \ -+ (_path); \ -+ _path = __trans_next_path((_trans), (_path)->idx + 1)) -+ -+#define trans_for_each_path(_trans, _path) \ -+ trans_for_each_path_from(_trans, _path, 0) -+ -+static inline struct btree_path * -+__trans_next_path_safe(struct btree_trans *trans, unsigned *idx) -+{ -+ u64 l; -+ -+ if (*idx == BTREE_ITER_MAX) -+ return NULL; -+ -+ l = trans->paths_allocated >> *idx; -+ if (!l) -+ return NULL; -+ -+ *idx += __ffs64(l); -+ EBUG_ON(*idx >= BTREE_ITER_MAX); -+ return &trans->paths[*idx]; -+} -+ -+/* -+ * This version is intended to be safe for use on a btree_trans that is owned by -+ * another thread, for bch2_btree_trans_to_text(); -+ */ -+#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \ -+ for (_idx = _start; \ -+ (_path = __trans_next_path_safe((_trans), &_idx)); \ -+ _idx++) -+ -+#define trans_for_each_path_safe(_trans, _path, _idx) \ -+ trans_for_each_path_safe_from(_trans, _path, _idx, 0) -+ -+static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) -+{ -+ unsigned idx = path ? path->sorted_idx + 1 : 0; -+ -+ EBUG_ON(idx > trans->nr_sorted); -+ -+ return idx < trans->nr_sorted -+ ? 
trans->paths + trans->sorted[idx] -+ : NULL; -+} -+ -+static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) -+{ -+ unsigned idx = path ? path->sorted_idx : trans->nr_sorted; -+ -+ return idx -+ ? trans->paths + trans->sorted[idx - 1] -+ : NULL; -+} -+ -+#define trans_for_each_path_inorder(_trans, _path, _i) \ -+ for (_i = 0; \ -+ ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ -+ _i++) -+ -+#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ -+ for (_i = trans->nr_sorted - 1; \ -+ ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\ -+ --_i) -+ -+static inline bool __path_has_node(const struct btree_path *path, -+ const struct btree *b) -+{ -+ return path->l[b->c.level].b == b && -+ btree_node_lock_seq_matches(path, b, b->c.level); -+} -+ -+static inline struct btree_path * -+__trans_next_path_with_node(struct btree_trans *trans, struct btree *b, -+ unsigned idx) -+{ -+ struct btree_path *path = __trans_next_path(trans, idx); -+ -+ while (path && !__path_has_node(path, b)) -+ path = __trans_next_path(trans, path->idx + 1); -+ -+ return path; -+} -+ -+#define trans_for_each_path_with_node(_trans, _b, _path) \ -+ for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ -+ (_path); \ -+ _path = __trans_next_path_with_node((_trans), (_b), \ -+ (_path)->idx + 1)) -+ -+struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, -+ bool, unsigned long); -+ -+static inline struct btree_path * __must_check -+bch2_btree_path_make_mut(struct btree_trans *trans, -+ struct btree_path *path, bool intent, -+ unsigned long ip) -+{ -+ if (path->ref > 1 || path->preserve) -+ path = __bch2_btree_path_make_mut(trans, path, intent, ip); -+ path->should_be_locked = false; -+ return path; -+} -+ -+struct btree_path * __must_check -+__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, -+ struct bpos, bool, unsigned long, int); -+ -+static inline struct btree_path * __must_check -+bch2_btree_path_set_pos(struct btree_trans *trans, -+ struct btree_path *path, struct bpos new_pos, -+ bool intent, unsigned long ip) -+{ -+ int cmp = bpos_cmp(new_pos, path->pos); -+ -+ return cmp -+ ? 
__bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp) -+ : path; -+} -+ -+int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *, -+ unsigned, unsigned long); -+ -+static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, -+ struct btree_path *path, unsigned flags) -+{ -+ if (path->uptodate < BTREE_ITER_NEED_RELOCK) -+ return 0; -+ -+ return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); -+} -+ -+int __must_check bch2_btree_path_traverse(struct btree_trans *, -+ struct btree_path *, unsigned); -+struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, -+ unsigned, unsigned, unsigned, unsigned long); -+struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); -+ -+/* -+ * bch2_btree_path_peek_slot() for a cached iterator might return a key in a -+ * different snapshot: -+ */ -+static inline struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) -+{ -+ struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); -+ -+ if (k.k && bpos_eq(path->pos, k.k->p)) -+ return k; -+ -+ bkey_init(u); -+ u->p = path->pos; -+ return (struct bkey_s_c) { u, NULL }; -+} -+ -+struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, -+ struct btree_iter *, struct bpos); -+ -+void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *); -+ -+int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *); -+ -+static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) -+{ -+ return mutex_trylock(lock) -+ ? 0 -+ : __bch2_trans_mutex_lock(trans, lock); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_trans_verify_paths(struct btree_trans *); -+void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, -+ struct bpos, bool); -+#else -+static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} -+static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, -+ struct bpos pos, bool key_cache) {} -+#endif -+ -+void bch2_btree_path_fix_key_modified(struct btree_trans *trans, -+ struct btree *, struct bkey_packed *); -+void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, -+ struct btree *, struct btree_node_iter *, -+ struct bkey_packed *, unsigned, unsigned); -+ -+int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); -+ -+void bch2_path_put(struct btree_trans *, struct btree_path *, bool); -+ -+int bch2_trans_relock(struct btree_trans *); -+int bch2_trans_relock_notrace(struct btree_trans *); -+void bch2_trans_unlock(struct btree_trans *); -+bool bch2_trans_locked(struct btree_trans *); -+ -+static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) -+{ -+ return restart_count != trans->restart_count; -+} -+ -+void __noreturn bch2_trans_restart_error(struct btree_trans *, u32); -+ -+static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, -+ u32 restart_count) -+{ -+ if (trans_was_restarted(trans, restart_count)) -+ bch2_trans_restart_error(trans, restart_count); -+} -+ -+void __noreturn bch2_trans_in_restart_error(struct btree_trans *); -+ -+static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) -+{ -+ if (trans->restarted) -+ bch2_trans_in_restart_error(trans); -+} -+ -+__always_inline -+static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) -+{ -+ BUG_ON(err <= 0); -+ BUG_ON(!bch2_err_matches(-err, 
BCH_ERR_transaction_restart)); -+ -+ trans->restarted = err; -+ trans->last_restarted_ip = _THIS_IP_; -+ return -err; -+} -+ -+__always_inline -+static int btree_trans_restart(struct btree_trans *trans, int err) -+{ -+ btree_trans_restart_nounlock(trans, err); -+ return -err; -+} -+ -+bool bch2_btree_node_upgrade(struct btree_trans *, -+ struct btree_path *, unsigned); -+ -+void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); -+ -+static inline void bch2_btree_path_downgrade(struct btree_trans *trans, -+ struct btree_path *path) -+{ -+ unsigned new_locks_want = path->level + !!path->intent_ref; -+ -+ if (path->locks_want > new_locks_want) -+ __bch2_btree_path_downgrade(trans, path, new_locks_want); -+} -+ -+void bch2_trans_downgrade(struct btree_trans *); -+ -+void bch2_trans_node_add(struct btree_trans *trans, struct btree *); -+void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); -+ -+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); -+int __must_check bch2_btree_iter_traverse(struct btree_iter *); -+ -+struct btree *bch2_btree_iter_peek_node(struct btree_iter *); -+struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); -+struct btree *bch2_btree_iter_next_node(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); -+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); -+ -+static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) -+{ -+ return bch2_btree_iter_peek_upto(iter, SPOS_MAX); -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); -+ -+bool bch2_btree_iter_advance(struct btree_iter *); -+bool bch2_btree_iter_rewind(struct btree_iter *); -+ -+static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) -+{ -+ iter->k.type = KEY_TYPE_deleted; -+ iter->k.p.inode = iter->pos.inode = new_pos.inode; -+ iter->k.p.offset = iter->pos.offset = new_pos.offset; -+ iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; -+ iter->k.size = 0; -+} -+ -+static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) -+{ -+ if (unlikely(iter->update_path)) -+ bch2_path_put(iter->trans, iter->update_path, -+ iter->flags & BTREE_ITER_INTENT); -+ iter->update_path = NULL; -+ -+ if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) -+ new_pos.snapshot = iter->snapshot; -+ -+ __bch2_btree_iter_set_pos(iter, new_pos); -+} -+ -+static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) -+{ -+ BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); -+ iter->pos = bkey_start_pos(&iter->k); -+} -+ -+static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) -+{ -+ struct bpos pos = iter->pos; -+ -+ iter->snapshot = snapshot; -+ pos.snapshot = snapshot; -+ bch2_btree_iter_set_pos(iter, pos); -+} -+ -+void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); -+ -+static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, -+ unsigned btree_id, -+ unsigned flags) -+{ -+ if (flags & BTREE_ITER_ALL_LEVELS) -+ flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; -+ -+ if 
(!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && -+ btree_node_type_is_extents(btree_id)) -+ flags |= BTREE_ITER_IS_EXTENTS; -+ -+ if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && -+ !btree_type_has_snapshots(btree_id)) -+ flags &= ~BTREE_ITER_ALL_SNAPSHOTS; -+ -+ if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && -+ btree_type_has_snapshots(btree_id)) -+ flags |= BTREE_ITER_FILTER_SNAPSHOTS; -+ -+ if (trans->journal_replay_not_finished) -+ flags |= BTREE_ITER_WITH_JOURNAL; -+ -+ return flags; -+} -+ -+static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, -+ unsigned btree_id, -+ unsigned flags) -+{ -+ if (!btree_id_cached(trans->c, btree_id)) { -+ flags &= ~BTREE_ITER_CACHED; -+ flags &= ~BTREE_ITER_WITH_KEY_CACHE; -+ } else if (!(flags & BTREE_ITER_CACHED)) -+ flags |= BTREE_ITER_WITH_KEY_CACHE; -+ -+ return __bch2_btree_iter_flags(trans, btree_id, flags); -+} -+ -+static inline void bch2_trans_iter_init_common(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, -+ unsigned locks_want, -+ unsigned depth, -+ unsigned flags, -+ unsigned long ip) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ iter->trans = trans; -+ iter->btree_id = btree_id; -+ iter->flags = flags; -+ iter->snapshot = pos.snapshot; -+ iter->pos = pos; -+ iter->k.p = pos; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ iter->ip_allocated = ip; -+#endif -+ iter->path = bch2_path_get(trans, btree_id, iter->pos, -+ locks_want, depth, flags, ip); -+} -+ -+void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, -+ enum btree_id, struct bpos, unsigned); -+ -+static inline void bch2_trans_iter_init(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags) -+{ -+ if (__builtin_constant_p(btree_id) && -+ __builtin_constant_p(flags)) -+ bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, -+ bch2_btree_iter_flags(trans, btree_id, flags), -+ _THIS_IP_); -+ else -+ bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); -+} -+ -+void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, -+ enum btree_id, struct bpos, -+ unsigned, unsigned, unsigned); -+void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); -+ -+static inline void set_btree_iter_dontneed(struct btree_iter *iter) -+{ -+ if (!iter->trans->restarted) -+ iter->path->preserve = false; -+} -+ -+void *__bch2_trans_kmalloc(struct btree_trans *, size_t); -+ -+static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -+{ -+ size = roundup(size, 8); -+ -+ if (likely(trans->mem_top + size <= trans->mem_bytes)) { -+ void *p = trans->mem + trans->mem_top; -+ -+ trans->mem_top += size; -+ memset(p, 0, size); -+ return p; -+ } else { -+ return __bch2_trans_kmalloc(trans, size); -+ } -+} -+ -+static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) -+{ -+ size = roundup(size, 8); -+ -+ if (likely(trans->mem_top + size <= trans->mem_bytes)) { -+ void *p = trans->mem + trans->mem_top; -+ -+ trans->mem_top += size; -+ return p; -+ } else { -+ return __bch2_trans_kmalloc(trans, size); -+ } -+} -+ -+static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags, unsigned type) -+{ -+ struct bkey_s_c k; -+ -+ bch2_trans_iter_init(trans, iter, btree_id, pos, flags); -+ k = bch2_btree_iter_peek_slot(iter); -+ -+ if (!bkey_err(k) && type && k.k->type != type) -+ k = 
bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); -+ if (unlikely(bkey_err(k))) -+ bch2_trans_iter_exit(trans, iter); -+ return k; -+} -+ -+static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags) -+{ -+ return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0); -+} -+ -+#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ -+ bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ -+ _btree_id, _pos, _flags, KEY_TYPE_##_type)) -+ -+static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags, unsigned type, -+ unsigned val_size, void *val) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); -+ ret = bkey_err(k); -+ if (!ret) { -+ unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size); -+ -+ memcpy(val, k.v, b); -+ if (unlikely(b < sizeof(*val))) -+ memset((void *) val + b, 0, sizeof(*val) - b); -+ bch2_trans_iter_exit(trans, &iter); -+ } -+ -+ return ret; -+} -+ -+#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\ -+ __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ -+ KEY_TYPE_##_type, sizeof(*_val), _val) -+ -+u32 bch2_trans_begin(struct btree_trans *); -+ -+/* -+ * XXX -+ * this does not handle transaction restarts from bch2_btree_iter_next_node() -+ * correctly -+ */ -+#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ _locks_want, _depth, _flags, _b, _ret) \ -+ for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ -+ _start, _locks_want, _depth, _flags); \ -+ (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \ -+ !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ -+ (_b) = bch2_btree_iter_next_node(&(_iter))) -+ -+#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ _flags, _b, _ret) \ -+ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ 0, 0, _flags, _b, _ret) -+ -+static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, -+ unsigned flags) -+{ -+ BUG_ON(flags & BTREE_ITER_ALL_LEVELS); -+ -+ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : -+ bch2_btree_iter_peek_prev(iter); -+} -+ -+static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, -+ unsigned flags) -+{ -+ return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : -+ flags & BTREE_ITER_SLOTS ? 
bch2_btree_iter_peek_slot(iter) : -+ bch2_btree_iter_peek(iter); -+} -+ -+static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, -+ struct bpos end, -+ unsigned flags) -+{ -+ if (!(flags & BTREE_ITER_SLOTS)) -+ return bch2_btree_iter_peek_upto(iter, end); -+ -+ if (bkey_gt(iter->pos, end)) -+ return bkey_s_c_null; -+ -+ return bch2_btree_iter_peek_slot(iter); -+} -+ -+static inline int btree_trans_too_many_iters(struct btree_trans *trans) -+{ -+ if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) { -+ trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); -+ } -+ -+ return 0; -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); -+ -+static inline struct bkey_s_c -+__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, -+ struct btree_iter *iter, unsigned flags) -+{ -+ struct bkey_s_c k; -+ -+ while (btree_trans_too_many_iters(trans) || -+ (k = bch2_btree_iter_peek_type(iter, flags), -+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) -+ bch2_trans_begin(trans); -+ -+ return k; -+} -+ -+static inline struct bkey_s_c -+__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos end, -+ unsigned flags) -+{ -+ struct bkey_s_c k; -+ -+ while (btree_trans_too_many_iters(trans) || -+ (k = bch2_btree_iter_peek_upto_type(iter, end, flags), -+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) -+ bch2_trans_begin(trans); -+ -+ return k; -+} -+ -+#define lockrestart_do(_trans, _do) \ -+({ \ -+ u32 _restart_count; \ -+ int _ret; \ -+ \ -+ do { \ -+ _restart_count = bch2_trans_begin(_trans); \ -+ _ret = (_do); \ -+ } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ -+ \ -+ if (!_ret) \ -+ bch2_trans_verify_not_restarted(_trans, _restart_count);\ -+ \ -+ _ret; \ -+}) -+ -+/* -+ * nested_lockrestart_do(), nested_commit_do(): -+ * -+ * These are like lockrestart_do() and commit_do(), with two differences: -+ * -+ * - We don't call bch2_trans_begin() unless we had a transaction restart -+ * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a -+ * transaction restart -+ */ -+#define nested_lockrestart_do(_trans, _do) \ -+({ \ -+ u32 _restart_count, _orig_restart_count; \ -+ int _ret; \ -+ \ -+ _restart_count = _orig_restart_count = (_trans)->restart_count; \ -+ \ -+ while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\ -+ _restart_count = bch2_trans_begin(_trans); \ -+ \ -+ if (!_ret) \ -+ bch2_trans_verify_not_restarted(_trans, _restart_count);\ -+ \ -+ if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \ -+ _ret = -BCH_ERR_transaction_restart_nested; \ -+ \ -+ _ret; \ -+}) -+ -+#define for_each_btree_key2(_trans, _iter, _btree_id, \ -+ _start, _flags, _k, _do) \ -+({ \ -+ int _ret = 0; \ -+ \ -+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ -+ (_start), (_flags)); \ -+ \ -+ while (1) { \ -+ u32 _restart_count = bch2_trans_begin(_trans); \ -+ \ -+ _ret = 0; \ -+ (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ -+ if (!(_k).k) \ -+ break; \ -+ \ -+ _ret = bkey_err(_k) ?: (_do); \ -+ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ -+ continue; \ -+ if (_ret) \ -+ break; \ -+ bch2_trans_verify_not_restarted(_trans, _restart_count);\ -+ if (!bch2_btree_iter_advance(&(_iter))) \ -+ break; \ -+ } \ -+ \ -+ bch2_trans_iter_exit((_trans), &(_iter)); \ -+ _ret; \ -+}) -+ 
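-+/*
-+ * Illustrative sketch of for_each_btree_key2() usage: count_subvolumes() and
-+ * its arguments are hypothetical, not part of this patch.  The _do expression
-+ * evaluates to 0 to keep iterating or to an error to stop; transaction
-+ * restarts are handled inside the macro, which calls bch2_trans_begin() and
-+ * retries:
-+ *
-+ *	static int count_subvolumes(struct bch_fs *c, u64 *nr)
-+ *	{
-+ *		struct btree_trans trans;
-+ *		struct btree_iter iter;
-+ *		struct bkey_s_c k;
-+ *		int ret;
-+ *
-+ *		bch2_trans_init(&trans, c, 0, 0);
-+ *		*nr = 0;
-+ *
-+ *		ret = for_each_btree_key2(&trans, iter, BTREE_ID_subvolumes,
-+ *					  POS_MIN, 0, k, ({
-+ *			(*nr)++;
-+ *			0;
-+ *		}));
-+ *
-+ *		bch2_trans_exit(&trans);
-+ *		return ret;
-+ *	}
-+ */
-+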
-+#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \ -+ _start, _end, _flags, _k, _do) \ -+({ \ -+ int _ret = 0; \ -+ \ -+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ -+ (_start), (_flags)); \ -+ \ -+ while (1) { \ -+ u32 _restart_count = bch2_trans_begin(_trans); \ -+ \ -+ _ret = 0; \ -+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ -+ if (!(_k).k) \ -+ break; \ -+ \ -+ _ret = bkey_err(_k) ?: (_do); \ -+ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ -+ continue; \ -+ if (_ret) \ -+ break; \ -+ bch2_trans_verify_not_restarted(_trans, _restart_count);\ -+ if (!bch2_btree_iter_advance(&(_iter))) \ -+ break; \ -+ } \ -+ \ -+ bch2_trans_iter_exit((_trans), &(_iter)); \ -+ _ret; \ -+}) -+ -+#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ -+ _start, _flags, _k, _do) \ -+({ \ -+ int _ret = 0; \ -+ \ -+ bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ -+ (_start), (_flags)); \ -+ \ -+ while (1) { \ -+ u32 _restart_count = bch2_trans_begin(_trans); \ -+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ -+ if (!(_k).k) { \ -+ _ret = 0; \ -+ break; \ -+ } \ -+ \ -+ _ret = bkey_err(_k) ?: (_do); \ -+ if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ -+ continue; \ -+ if (_ret) \ -+ break; \ -+ bch2_trans_verify_not_restarted(_trans, _restart_count);\ -+ if (!bch2_btree_iter_rewind(&(_iter))) \ -+ break; \ -+ } \ -+ \ -+ bch2_trans_iter_exit((_trans), &(_iter)); \ -+ _ret; \ -+}) -+ -+#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ -+ _start, _iter_flags, _k, \ -+ _disk_res, _journal_seq, _commit_flags,\ -+ _do) \ -+ for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ -+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ -+ (_journal_seq), (_commit_flags))) -+ -+#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \ -+ _start, _iter_flags, _k, \ -+ _disk_res, _journal_seq, _commit_flags,\ -+ _do) \ -+ for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ -+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ -+ (_journal_seq), (_commit_flags))) -+ -+#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ -+ _start, _end, _iter_flags, _k, \ -+ _disk_res, _journal_seq, _commit_flags,\ -+ _do) \ -+ for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ -+ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ -+ (_journal_seq), (_commit_flags))) -+ -+#define for_each_btree_key(_trans, _iter, _btree_id, \ -+ _start, _flags, _k, _ret) \ -+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ -+ (_start), (_flags)); \ -+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ -+ !((_ret) = bkey_err(_k)) && (_k).k; \ -+ bch2_btree_iter_advance(&(_iter))) -+ -+#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ -+ _start, _end, _flags, _k, _ret) \ -+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ -+ (_start), (_flags)); \ -+ (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \ -+ &(_iter), _end, _flags),\ -+ !((_ret) = bkey_err(_k)) && (_k).k; \ -+ bch2_btree_iter_advance(&(_iter))) -+ -+#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ -+ _start, _flags, _k, _ret) \ -+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ -+ (_start), (_flags)); \ -+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ -+ !((_ret) = bkey_err(_k)) && (_k).k; \ -+ bch2_btree_iter_advance(&(_iter))) -+ -+#define for_each_btree_key_upto_norestart(_trans, _iter, 
_btree_id, \ -+ _start, _end, _flags, _k, _ret) \ -+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ -+ (_start), (_flags)); \ -+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ -+ !((_ret) = bkey_err(_k)) && (_k).k; \ -+ bch2_btree_iter_advance(&(_iter))) -+ -+#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ -+ for (; \ -+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ -+ !((_ret) = bkey_err(_k)) && (_k).k; \ -+ bch2_btree_iter_advance(&(_iter))) -+ -+#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ -+ for (; \ -+ (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ -+ !((_ret) = bkey_err(_k)) && (_k).k; \ -+ bch2_btree_iter_advance(&(_iter))) -+ -+#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ -+ for (; \ -+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ -+ !((_ret) = bkey_err(_k)) && (_k).k; \ -+ bch2_btree_iter_advance(&(_iter))) -+ -+#define drop_locks_do(_trans, _do) \ -+({ \ -+ bch2_trans_unlock(_trans); \ -+ _do ?: bch2_trans_relock(_trans); \ -+}) -+ -+#define allocate_dropping_locks_errcode(_trans, _do) \ -+({ \ -+ gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ -+ int _ret = _do; \ -+ \ -+ if (bch2_err_matches(_ret, ENOMEM)) { \ -+ _gfp = GFP_KERNEL; \ -+ _ret = drop_locks_do(trans, _do); \ -+ } \ -+ _ret; \ -+}) -+ -+#define allocate_dropping_locks(_trans, _ret, _do) \ -+({ \ -+ gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ -+ typeof(_do) _p = _do; \ -+ \ -+ _ret = 0; \ -+ if (unlikely(!_p)) { \ -+ _gfp = GFP_KERNEL; \ -+ _ret = drop_locks_do(trans, ((_p = _do), 0)); \ -+ } \ -+ _p; \ -+}) -+ -+/* new multiple iterator interface: */ -+ -+void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); -+void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); -+void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); -+void bch2_dump_trans_updates(struct btree_trans *); -+void bch2_dump_trans_paths_updates(struct btree_trans *); -+void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned); -+void bch2_trans_exit(struct btree_trans *); -+ -+extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; -+unsigned bch2_trans_get_fn_idx(const char *); -+ -+#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \ -+do { \ -+ static unsigned trans_fn_idx; \ -+ \ -+ if (unlikely(!trans_fn_idx)) \ -+ trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ -+ \ -+ __bch2_trans_init(_trans, _c, trans_fn_idx); \ -+} while (0) -+ -+void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); -+ -+void bch2_fs_btree_iter_exit(struct bch_fs *); -+int bch2_fs_btree_iter_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_BTREE_ITER_H */ -diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c -new file mode 100644 -index 000000000..58a981bcf ---- /dev/null -+++ b/fs/bcachefs/btree_journal_iter.c -@@ -0,0 +1,531 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bset.h" -+#include "btree_journal_iter.h" -+#include "journal_io.h" -+ -+#include -+ -+/* -+ * For managing keys we read from the journal: until journal replay works normal -+ * btree lookups need to be able to find and return keys from the journal where -+ * they overwrite what's in the btree, so we have a special iterator and -+ * operations for the regular btree iter code to use: -+ */ -+ -+static int __journal_key_cmp(enum btree_id l_btree_id, -+ unsigned l_level, -+ struct bpos l_pos, 
-+ const struct journal_key *r) -+{ -+ return (cmp_int(l_btree_id, r->btree_id) ?: -+ cmp_int(l_level, r->level) ?: -+ bpos_cmp(l_pos, r->k->k.p)); -+} -+ -+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) -+{ -+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); -+} -+ -+static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) -+{ -+ size_t gap_size = keys->size - keys->nr; -+ -+ if (idx >= keys->gap) -+ idx += gap_size; -+ return idx; -+} -+ -+static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) -+{ -+ return keys->d + idx_to_pos(keys, idx); -+} -+ -+static size_t __bch2_journal_key_search(struct journal_keys *keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ size_t l = 0, r = keys->nr, m; -+ -+ while (l < r) { -+ m = l + ((r - l) >> 1); -+ if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) -+ l = m + 1; -+ else -+ r = m; -+ } -+ -+ BUG_ON(l < keys->nr && -+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); -+ -+ BUG_ON(l && -+ __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); -+ -+ return l; -+} -+ -+static size_t bch2_journal_key_search(struct journal_keys *keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); -+} -+ -+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, -+ unsigned level, struct bpos pos, -+ struct bpos end_pos, size_t *idx) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ unsigned iters = 0; -+ struct journal_key *k; -+search: -+ if (!*idx) -+ *idx = __bch2_journal_key_search(keys, btree_id, level, pos); -+ -+ while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) { -+ if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) -+ return NULL; -+ -+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && -+ !k->overwritten) -+ return k->k; -+ -+ (*idx)++; -+ iters++; -+ if (iters == 10) { -+ *idx = 0; -+ goto search; -+ } -+ } -+ -+ return NULL; -+} -+ -+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, -+ unsigned level, struct bpos pos) -+{ -+ size_t idx = 0; -+ -+ return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); -+} -+ -+static void journal_iters_fix(struct bch_fs *c) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ /* The key we just inserted is immediately before the gap: */ -+ size_t gap_end = keys->gap + (keys->size - keys->nr); -+ struct btree_and_journal_iter *iter; -+ -+ /* -+ * If an iterator points one after the key we just inserted, decrement -+ * the iterator so it points at the key we just inserted - if the -+ * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will -+ * handle that: -+ */ -+ list_for_each_entry(iter, &c->journal_iters, journal.list) -+ if (iter->journal.idx == gap_end) -+ iter->journal.idx = keys->gap - 1; -+} -+ -+static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ struct journal_iter *iter; -+ size_t gap_size = keys->size - keys->nr; -+ -+ list_for_each_entry(iter, &c->journal_iters, list) { -+ if (iter->idx > old_gap) -+ iter->idx -= gap_size; -+ if (iter->idx >= new_gap) -+ iter->idx += gap_size; -+ } -+} -+ -+int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_i *k) -+{ -+ struct journal_key n = { -+ .btree_id = id, -+ .level = level, -+ .k 
= k, -+ .allocated = true, -+ /* -+ * Ensure these keys are done last by journal replay, to unblock -+ * journal reclaim: -+ */ -+ .journal_seq = U32_MAX, -+ }; -+ struct journal_keys *keys = &c->journal_keys; -+ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); -+ -+ BUG_ON(test_bit(BCH_FS_RW, &c->flags)); -+ -+ if (idx < keys->size && -+ journal_key_cmp(&n, &keys->d[idx]) == 0) { -+ if (keys->d[idx].allocated) -+ kfree(keys->d[idx].k); -+ keys->d[idx] = n; -+ return 0; -+ } -+ -+ if (idx > keys->gap) -+ idx -= keys->size - keys->nr; -+ -+ if (keys->nr == keys->size) { -+ struct journal_keys new_keys = { -+ .nr = keys->nr, -+ .size = max_t(size_t, keys->size, 8) * 2, -+ }; -+ -+ new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); -+ if (!new_keys.d) { -+ bch_err(c, "%s: error allocating new key array (size %zu)", -+ __func__, new_keys.size); -+ return -BCH_ERR_ENOMEM_journal_key_insert; -+ } -+ -+ /* Since @keys was full, there was no gap: */ -+ memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); -+ kvfree(keys->d); -+ *keys = new_keys; -+ -+ /* And now the gap is at the end: */ -+ keys->gap = keys->nr; -+ } -+ -+ journal_iters_move_gap(c, keys->gap, idx); -+ -+ move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); -+ keys->gap = idx; -+ -+ keys->nr++; -+ keys->d[keys->gap++] = n; -+ -+ journal_iters_fix(c); -+ -+ return 0; -+} -+ -+/* -+ * Can only be used from the recovery thread while we're still RO - can't be -+ * used once we've got RW, as journal_keys is at that point used by multiple -+ * threads: -+ */ -+int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_i *k) -+{ -+ struct bkey_i *n; -+ int ret; -+ -+ n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); -+ if (!n) -+ return -BCH_ERR_ENOMEM_journal_key_insert; -+ -+ bkey_copy(n, k); -+ ret = bch2_journal_key_insert_take(c, id, level, n); -+ if (ret) -+ kfree(n); -+ return ret; -+} -+ -+int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bpos pos) -+{ -+ struct bkey_i whiteout; -+ -+ bkey_init(&whiteout.k); -+ whiteout.k.p = pos; -+ -+ return bch2_journal_key_insert(c, id, level, &whiteout); -+} -+ -+void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, -+ unsigned level, struct bpos pos) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ size_t idx = bch2_journal_key_search(keys, btree, level, pos); -+ -+ if (idx < keys->size && -+ keys->d[idx].btree_id == btree && -+ keys->d[idx].level == level && -+ bpos_eq(keys->d[idx].k->k.p, pos)) -+ keys->d[idx].overwritten = true; -+} -+ -+static void bch2_journal_iter_advance(struct journal_iter *iter) -+{ -+ if (iter->idx < iter->keys->size) { -+ iter->idx++; -+ if (iter->idx == iter->keys->gap) -+ iter->idx += iter->keys->size - iter->keys->nr; -+ } -+} -+ -+static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) -+{ -+ struct journal_key *k = iter->keys->d + iter->idx; -+ -+ while (k < iter->keys->d + iter->keys->size && -+ k->btree_id == iter->btree_id && -+ k->level == iter->level) { -+ if (!k->overwritten) -+ return bkey_i_to_s_c(k->k); -+ -+ bch2_journal_iter_advance(iter); -+ k = iter->keys->d + iter->idx; -+ } -+ -+ return bkey_s_c_null; -+} -+ -+static void bch2_journal_iter_exit(struct journal_iter *iter) -+{ -+ list_del(&iter->list); -+} -+ -+static void bch2_journal_iter_init(struct bch_fs *c, -+ struct journal_iter *iter, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ iter->btree_id = id; -+ 
iter->level = level; -+ iter->keys = &c->journal_keys; -+ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); -+} -+ -+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) -+{ -+ return bch2_btree_node_iter_peek_unpack(&iter->node_iter, -+ iter->b, &iter->unpacked); -+} -+ -+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) -+{ -+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); -+} -+ -+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) -+{ -+ if (bpos_eq(iter->pos, SPOS_MAX)) -+ iter->at_end = true; -+ else -+ iter->pos = bpos_successor(iter->pos); -+} -+ -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) -+{ -+ struct bkey_s_c btree_k, journal_k, ret; -+again: -+ if (iter->at_end) -+ return bkey_s_c_null; -+ -+ while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && -+ bpos_lt(btree_k.k->p, iter->pos)) -+ bch2_journal_iter_advance_btree(iter); -+ -+ while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && -+ bpos_lt(journal_k.k->p, iter->pos)) -+ bch2_journal_iter_advance(&iter->journal); -+ -+ ret = journal_k.k && -+ (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) -+ ? journal_k -+ : btree_k; -+ -+ if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) -+ ret = bkey_s_c_null; -+ -+ if (ret.k) { -+ iter->pos = ret.k->p; -+ if (bkey_deleted(ret.k)) { -+ bch2_btree_and_journal_iter_advance(iter); -+ goto again; -+ } -+ } else { -+ iter->pos = SPOS_MAX; -+ iter->at_end = true; -+ } -+ -+ return ret; -+} -+ -+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) -+{ -+ bch2_journal_iter_exit(&iter->journal); -+} -+ -+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, -+ struct bch_fs *c, -+ struct btree *b, -+ struct btree_node_iter node_iter, -+ struct bpos pos) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->b = b; -+ iter->node_iter = node_iter; -+ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); -+ INIT_LIST_HEAD(&iter->journal.list); -+ iter->pos = b->data->min_key; -+ iter->at_end = false; -+} -+ -+/* -+ * this version is used by btree_gc before filesystem has gone RW and -+ * multithreaded, so uses the journal_iters list: -+ */ -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, -+ struct bch_fs *c, -+ struct btree *b) -+{ -+ struct btree_node_iter node_iter; -+ -+ bch2_btree_node_iter_init_from_start(&node_iter, b); -+ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); -+ list_add(&iter->journal.list, &c->journal_iters); -+} -+ -+/* sort and dedup all keys in the journal: */ -+ -+void bch2_journal_entries_free(struct bch_fs *c) -+{ -+ struct journal_replay **i; -+ struct genradix_iter iter; -+ -+ genradix_for_each(&c->journal_entries, iter, i) -+ if (*i) -+ kvpfree(*i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&(*i)->j)); -+ genradix_free(&c->journal_entries); -+} -+ -+/* -+ * When keys compare equal, oldest compares first: -+ */ -+static int journal_sort_key_cmp(const void *_l, const void *_r) -+{ -+ const struct journal_key *l = _l; -+ const struct journal_key *r = _r; -+ -+ return journal_key_cmp(l, r) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->journal_offset, r->journal_offset); -+} -+ -+void bch2_journal_keys_free(struct journal_keys *keys) -+{ -+ struct journal_key *i; -+ -+ move_gap(keys->d, keys->nr, 
keys->size, keys->gap, keys->nr); -+ keys->gap = keys->nr; -+ -+ for (i = keys->d; i < keys->d + keys->nr; i++) -+ if (i->allocated) -+ kfree(i->k); -+ -+ kvfree(keys->d); -+ keys->d = NULL; -+ keys->nr = keys->gap = keys->size = 0; -+} -+ -+static void __journal_keys_sort(struct journal_keys *keys) -+{ -+ struct journal_key *src, *dst; -+ -+ sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); -+ -+ src = dst = keys->d; -+ while (src < keys->d + keys->nr) { -+ while (src + 1 < keys->d + keys->nr && -+ src[0].btree_id == src[1].btree_id && -+ src[0].level == src[1].level && -+ bpos_eq(src[0].k->k.p, src[1].k->k.p)) -+ src++; -+ -+ *dst++ = *src++; -+ } -+ -+ keys->nr = dst - keys->d; -+} -+ -+int bch2_journal_keys_sort(struct bch_fs *c) -+{ -+ struct genradix_iter iter; -+ struct journal_replay *i, **_i; -+ struct jset_entry *entry; -+ struct bkey_i *k; -+ struct journal_keys *keys = &c->journal_keys; -+ size_t nr_keys = 0, nr_read = 0; -+ -+ genradix_for_each(&c->journal_entries, iter, _i) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ for_each_jset_key(k, entry, &i->j) -+ nr_keys++; -+ } -+ -+ if (!nr_keys) -+ return 0; -+ -+ keys->size = roundup_pow_of_two(nr_keys); -+ -+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); -+ if (!keys->d) { -+ bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", -+ nr_keys); -+ -+ do { -+ keys->size >>= 1; -+ keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); -+ } while (!keys->d && keys->size > nr_keys / 8); -+ -+ if (!keys->d) { -+ bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", -+ keys->size); -+ return -BCH_ERR_ENOMEM_journal_keys_sort; -+ } -+ } -+ -+ genradix_for_each(&c->journal_entries, iter, _i) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ cond_resched(); -+ -+ for_each_jset_key(k, entry, &i->j) { -+ if (keys->nr == keys->size) { -+ __journal_keys_sort(keys); -+ -+ if (keys->nr > keys->size * 7 / 8) { -+ bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", -+ keys->nr, keys->size, nr_read, nr_keys); -+ return -BCH_ERR_ENOMEM_journal_keys_sort; -+ } -+ } -+ -+ keys->d[keys->nr++] = (struct journal_key) { -+ .btree_id = entry->btree_id, -+ .level = entry->level, -+ .k = k, -+ .journal_seq = le64_to_cpu(i->j.seq), -+ .journal_offset = k->_data - i->j._data, -+ }; -+ -+ nr_read++; -+ } -+ } -+ -+ __journal_keys_sort(keys); -+ keys->gap = keys->nr; -+ -+ bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); -+ return 0; -+} -diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h -new file mode 100644 -index 000000000..5d64e7e22 ---- /dev/null -+++ b/fs/bcachefs/btree_journal_iter.h -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H -+#define _BCACHEFS_BTREE_JOURNAL_ITER_H -+ -+struct journal_iter { -+ struct list_head list; -+ enum btree_id btree_id; -+ unsigned level; -+ size_t idx; -+ struct journal_keys *keys; -+}; -+ -+/* -+ * Iterate over keys in the btree, with keys from the journal overlaid on top: -+ */ -+ -+struct btree_and_journal_iter { -+ struct btree *b; -+ struct btree_node_iter node_iter; -+ struct bkey unpacked; -+ -+ struct journal_iter journal; -+ struct bpos pos; -+ bool at_end; -+}; -+ -+struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos, struct bpos, size_t *); 
-+struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos); -+ -+int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, -+ unsigned, struct bkey_i *); -+int bch2_journal_key_insert(struct bch_fs *, enum btree_id, -+ unsigned, struct bkey_i *); -+int bch2_journal_key_delete(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos); -+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, -+ unsigned, struct bpos); -+ -+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); -+ -+void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, -+ struct bch_fs *, struct btree *, -+ struct btree_node_iter, struct bpos); -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, -+ struct bch_fs *, -+ struct btree *); -+ -+void bch2_journal_keys_free(struct journal_keys *); -+void bch2_journal_entries_free(struct bch_fs *); -+ -+int bch2_journal_keys_sort(struct bch_fs *); -+ -+#endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ -diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c -new file mode 100644 -index 000000000..f7c001d42 ---- /dev/null -+++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,1088 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "errcode.h" -+#include "error.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "trace.h" -+ -+#include -+#include -+ -+static inline bool btree_uses_pcpu_readers(enum btree_id id) -+{ -+ return id == BTREE_ID_subvolumes; -+} -+ -+static struct kmem_cache *bch2_key_cache; -+ -+static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, -+ const void *obj) -+{ -+ const struct bkey_cached *ck = obj; -+ const struct bkey_cached_key *key = arg->key; -+ -+ return ck->key.btree_id != key->btree_id || -+ !bpos_eq(ck->key.pos, key->pos); -+} -+ -+static const struct rhashtable_params bch2_btree_key_cache_params = { -+ .head_offset = offsetof(struct bkey_cached, hash), -+ .key_offset = offsetof(struct bkey_cached, key), -+ .key_len = sizeof(struct bkey_cached_key), -+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn, -+}; -+ -+__flatten -+inline struct bkey_cached * -+bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) -+{ -+ struct bkey_cached_key key = { -+ .btree_id = btree_id, -+ .pos = pos, -+ }; -+ -+ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, -+ bch2_btree_key_cache_params); -+} -+ -+static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) -+{ -+ if (!six_trylock_intent(&ck->c.lock)) -+ return false; -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ six_unlock_intent(&ck->c.lock); -+ return false; -+ } -+ -+ if (!six_trylock_write(&ck->c.lock)) { -+ six_unlock_intent(&ck->c.lock); -+ return false; -+ } -+ -+ return true; -+} -+ -+static void bkey_cached_evict(struct btree_key_cache *c, -+ struct bkey_cached *ck) -+{ -+ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, -+ bch2_btree_key_cache_params)); -+ memset(&ck->key, ~0, sizeof(ck->key)); -+ -+ atomic_long_dec(&c->nr_keys); -+} -+ -+static void bkey_cached_free(struct btree_key_cache *bc, -+ struct bkey_cached *ck) -+{ -+ struct bch_fs *c = 
container_of(bc, struct bch_fs, btree_key_cache); -+ -+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); -+ -+ ck->btree_trans_barrier_seq = -+ start_poll_synchronize_srcu(&c->btree_trans_barrier); -+ -+ if (ck->c.lock.readers) -+ list_move_tail(&ck->list, &bc->freed_pcpu); -+ else -+ list_move_tail(&ck->list, &bc->freed_nonpcpu); -+ atomic_long_inc(&bc->nr_freed); -+ -+ kfree(ck->k); -+ ck->k = NULL; -+ ck->u64s = 0; -+ -+ six_unlock_write(&ck->c.lock); -+ six_unlock_intent(&ck->c.lock); -+} -+ -+#ifdef __KERNEL__ -+static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, -+ struct bkey_cached *ck) -+{ -+ struct bkey_cached *pos; -+ -+ list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { -+ if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, -+ pos->btree_trans_barrier_seq)) { -+ list_move(&ck->list, &pos->list); -+ return; -+ } -+ } -+ -+ list_move(&ck->list, &bc->freed_nonpcpu); -+} -+#endif -+ -+static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, -+ struct bkey_cached *ck) -+{ -+ BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); -+ -+ if (!ck->c.lock.readers) { -+#ifdef __KERNEL__ -+ struct btree_key_cache_freelist *f; -+ bool freed = false; -+ -+ preempt_disable(); -+ f = this_cpu_ptr(bc->pcpu_freed); -+ -+ if (f->nr < ARRAY_SIZE(f->objs)) { -+ f->objs[f->nr++] = ck; -+ freed = true; -+ } -+ preempt_enable(); -+ -+ if (!freed) { -+ mutex_lock(&bc->lock); -+ preempt_disable(); -+ f = this_cpu_ptr(bc->pcpu_freed); -+ -+ while (f->nr > ARRAY_SIZE(f->objs) / 2) { -+ struct bkey_cached *ck2 = f->objs[--f->nr]; -+ -+ __bkey_cached_move_to_freelist_ordered(bc, ck2); -+ } -+ preempt_enable(); -+ -+ __bkey_cached_move_to_freelist_ordered(bc, ck); -+ mutex_unlock(&bc->lock); -+ } -+#else -+ mutex_lock(&bc->lock); -+ list_move_tail(&ck->list, &bc->freed_nonpcpu); -+ mutex_unlock(&bc->lock); -+#endif -+ } else { -+ mutex_lock(&bc->lock); -+ list_move_tail(&ck->list, &bc->freed_pcpu); -+ mutex_unlock(&bc->lock); -+ } -+} -+ -+static void bkey_cached_free_fast(struct btree_key_cache *bc, -+ struct bkey_cached *ck) -+{ -+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); -+ -+ ck->btree_trans_barrier_seq = -+ start_poll_synchronize_srcu(&c->btree_trans_barrier); -+ -+ list_del_init(&ck->list); -+ atomic_long_inc(&bc->nr_freed); -+ -+ kfree(ck->k); -+ ck->k = NULL; -+ ck->u64s = 0; -+ -+ bkey_cached_move_to_freelist(bc, ck); -+ -+ six_unlock_write(&ck->c.lock); -+ six_unlock_intent(&ck->c.lock); -+} -+ -+static struct bkey_cached * -+bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, -+ bool *was_new) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_key_cache *bc = &c->btree_key_cache; -+ struct bkey_cached *ck = NULL; -+ bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); -+ int ret; -+ -+ if (!pcpu_readers) { -+#ifdef __KERNEL__ -+ struct btree_key_cache_freelist *f; -+ -+ preempt_disable(); -+ f = this_cpu_ptr(bc->pcpu_freed); -+ if (f->nr) -+ ck = f->objs[--f->nr]; -+ preempt_enable(); -+ -+ if (!ck) { -+ mutex_lock(&bc->lock); -+ preempt_disable(); -+ f = this_cpu_ptr(bc->pcpu_freed); -+ -+ while (!list_empty(&bc->freed_nonpcpu) && -+ f->nr < ARRAY_SIZE(f->objs) / 2) { -+ ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); -+ list_del_init(&ck->list); -+ f->objs[f->nr++] = ck; -+ } -+ -+ ck = f->nr ? 
f->objs[--f->nr] : NULL; -+ preempt_enable(); -+ mutex_unlock(&bc->lock); -+ } -+#else -+ mutex_lock(&bc->lock); -+ if (!list_empty(&bc->freed_nonpcpu)) { -+ ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); -+ list_del_init(&ck->list); -+ } -+ mutex_unlock(&bc->lock); -+#endif -+ } else { -+ mutex_lock(&bc->lock); -+ if (!list_empty(&bc->freed_pcpu)) { -+ ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); -+ list_del_init(&ck->list); -+ } -+ mutex_unlock(&bc->lock); -+ } -+ -+ if (ck) { -+ int ret; -+ -+ ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); -+ if (unlikely(ret)) { -+ bkey_cached_move_to_freelist(bc, ck); -+ return ERR_PTR(ret); -+ } -+ -+ path->l[0].b = (void *) ck; -+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock); -+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); -+ -+ ret = bch2_btree_node_lock_write(trans, path, &ck->c); -+ if (unlikely(ret)) { -+ btree_node_unlock(trans, path, 0); -+ bkey_cached_move_to_freelist(bc, ck); -+ return ERR_PTR(ret); -+ } -+ -+ return ck; -+ } -+ -+ ck = allocate_dropping_locks(trans, ret, -+ kmem_cache_zalloc(bch2_key_cache, _gfp)); -+ if (ret) { -+ kmem_cache_free(bch2_key_cache, ck); -+ return ERR_PTR(ret); -+ } -+ -+ if (!ck) -+ return NULL; -+ -+ INIT_LIST_HEAD(&ck->list); -+ bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); -+ -+ ck->c.cached = true; -+ BUG_ON(!six_trylock_intent(&ck->c.lock)); -+ BUG_ON(!six_trylock_write(&ck->c.lock)); -+ *was_new = true; -+ return ck; -+} -+ -+static struct bkey_cached * -+bkey_cached_reuse(struct btree_key_cache *c) -+{ -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct bkey_cached *ck; -+ unsigned i; -+ -+ mutex_lock(&c->lock); -+ rcu_read_lock(); -+ tbl = rht_dereference_rcu(c->table.tbl, &c->table); -+ for (i = 0; i < tbl->size; i++) -+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && -+ bkey_cached_lock_for_evict(ck)) { -+ bkey_cached_evict(c, ck); -+ goto out; -+ } -+ } -+ ck = NULL; -+out: -+ rcu_read_unlock(); -+ mutex_unlock(&c->lock); -+ return ck; -+} -+ -+static struct bkey_cached * -+btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_key_cache *bc = &c->btree_key_cache; -+ struct bkey_cached *ck; -+ bool was_new = false; -+ -+ ck = bkey_cached_alloc(trans, path, &was_new); -+ if (IS_ERR(ck)) -+ return ck; -+ -+ if (unlikely(!ck)) { -+ ck = bkey_cached_reuse(bc); -+ if (unlikely(!ck)) { -+ bch_err(c, "error allocating memory for key cache item, btree %s", -+ bch2_btree_ids[path->btree_id]); -+ return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); -+ } -+ -+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); -+ } -+ -+ ck->c.level = 0; -+ ck->c.btree_id = path->btree_id; -+ ck->key.btree_id = path->btree_id; -+ ck->key.pos = path->pos; -+ ck->valid = false; -+ ck->flags = 1U << BKEY_CACHED_ACCESSED; -+ -+ if (unlikely(rhashtable_lookup_insert_fast(&bc->table, -+ &ck->hash, -+ bch2_btree_key_cache_params))) { -+ /* We raced with another fill: */ -+ -+ if (likely(was_new)) { -+ six_unlock_write(&ck->c.lock); -+ six_unlock_intent(&ck->c.lock); -+ kfree(ck); -+ } else { -+ bkey_cached_free_fast(bc, ck); -+ } -+ -+ mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); -+ return NULL; -+ } -+ -+ atomic_long_inc(&bc->nr_keys); -+ -+ six_unlock_write(&ck->c.lock); -+ -+ return ck; -+} -+ -+static int btree_key_cache_fill(struct btree_trans *trans, -+ struct 
btree_path *ck_path, -+ struct bkey_cached *ck) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ unsigned new_u64s = 0; -+ struct bkey_i *new_k = NULL; -+ int ret; -+ -+ k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, -+ BTREE_ITER_KEY_CACHE_FILL| -+ BTREE_ITER_CACHED_NOFILL); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!bch2_btree_node_relock(trans, ck_path, 0)) { -+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); -+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); -+ goto err; -+ } -+ -+ /* -+ * bch2_varint_decode can read past the end of the buffer by at -+ * most 7 bytes (it won't be used): -+ */ -+ new_u64s = k.k->u64s + 1; -+ -+ /* -+ * Allocate some extra space so that the transaction commit path is less -+ * likely to have to reallocate, since that requires a transaction -+ * restart: -+ */ -+ new_u64s = min(256U, (new_u64s * 3) / 2); -+ -+ if (new_u64s > ck->u64s) { -+ new_u64s = roundup_pow_of_two(new_u64s); -+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); -+ if (!new_k) { -+ bch2_trans_unlock(trans); -+ -+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); -+ if (!new_k) { -+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", -+ bch2_btree_ids[ck->key.btree_id], new_u64s); -+ ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; -+ goto err; -+ } -+ -+ if (!bch2_btree_node_relock(trans, ck_path, 0)) { -+ kfree(new_k); -+ trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); -+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); -+ goto err; -+ } -+ -+ ret = bch2_trans_relock(trans); -+ if (ret) { -+ kfree(new_k); -+ goto err; -+ } -+ } -+ } -+ -+ ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); -+ if (ret) { -+ kfree(new_k); -+ goto err; -+ } -+ -+ if (new_k) { -+ kfree(ck->k); -+ ck->u64s = new_u64s; -+ ck->k = new_k; -+ } -+ -+ bkey_reassemble(ck->k, k); -+ ck->valid = true; -+ bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); -+ -+ /* We're not likely to need this iterator again: */ -+ set_btree_iter_dontneed(&iter); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static noinline int -+bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck; -+ int ret = 0; -+ -+ BUG_ON(path->level); -+ -+ path->l[1].b = NULL; -+ -+ if (bch2_btree_node_relock_notrace(trans, path, 0)) { -+ ck = (void *) path->l[0].b; -+ goto fill; -+ } -+retry: -+ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); -+ if (!ck) { -+ ck = btree_key_cache_create(trans, path); -+ ret = PTR_ERR_OR_ZERO(ck); -+ if (ret) -+ goto err; -+ if (!ck) -+ goto retry; -+ -+ mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); -+ path->locks_want = 1; -+ } else { -+ enum six_lock_type lock_want = __btree_lock_want(path, 0); -+ -+ ret = btree_node_lock(trans, path, (void *) ck, 0, -+ lock_want, _THIS_IP_); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto err; -+ -+ BUG_ON(ret); -+ -+ if (ck->key.btree_id != path->btree_id || -+ !bpos_eq(ck->key.pos, path->pos)) { -+ six_unlock_type(&ck->c.lock, lock_want); -+ goto retry; -+ } -+ -+ mark_btree_node_locked(trans, path, 0, lock_want); -+ } -+ -+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock); -+ path->l[0].b = (void *) ck; -+fill: -+ path->uptodate = 
BTREE_ITER_UPTODATE; -+ -+ if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { -+ /* -+ * Using the underscore version because we haven't set -+ * path->uptodate yet: -+ */ -+ if (!path->locks_want && -+ !__bch2_btree_path_upgrade(trans, path, 1)) { -+ trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); -+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); -+ goto err; -+ } -+ -+ ret = btree_key_cache_fill(trans, path, ck); -+ if (ret) -+ goto err; -+ -+ ret = bch2_btree_path_relock(trans, path, _THIS_IP_); -+ if (ret) -+ goto err; -+ -+ path->uptodate = BTREE_ITER_UPTODATE; -+ } -+ -+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) -+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags); -+ -+ BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); -+ BUG_ON(path->uptodate); -+ -+ return ret; -+err: -+ path->uptodate = BTREE_ITER_NEED_TRAVERSE; -+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { -+ btree_node_unlock(trans, path, 0); -+ path->l[0].b = ERR_PTR(ret); -+ } -+ return ret; -+} -+ -+int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck; -+ int ret = 0; -+ -+ EBUG_ON(path->level); -+ -+ path->l[1].b = NULL; -+ -+ if (bch2_btree_node_relock_notrace(trans, path, 0)) { -+ ck = (void *) path->l[0].b; -+ goto fill; -+ } -+retry: -+ ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); -+ if (!ck) { -+ return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); -+ } else { -+ enum six_lock_type lock_want = __btree_lock_want(path, 0); -+ -+ ret = btree_node_lock(trans, path, (void *) ck, 0, -+ lock_want, _THIS_IP_); -+ EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); -+ -+ if (ret) -+ return ret; -+ -+ if (ck->key.btree_id != path->btree_id || -+ !bpos_eq(ck->key.pos, path->pos)) { -+ six_unlock_type(&ck->c.lock, lock_want); -+ goto retry; -+ } -+ -+ mark_btree_node_locked(trans, path, 0, lock_want); -+ } -+ -+ path->l[0].lock_seq = six_lock_seq(&ck->c.lock); -+ path->l[0].b = (void *) ck; -+fill: -+ if (!ck->valid) -+ return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); -+ -+ if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) -+ set_bit(BKEY_CACHED_ACCESSED, &ck->flags); -+ -+ path->uptodate = BTREE_ITER_UPTODATE; -+ EBUG_ON(!ck->valid); -+ EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); -+ -+ return ret; -+} -+ -+static int btree_key_cache_flush_pos(struct btree_trans *trans, -+ struct bkey_cached_key key, -+ u64 journal_seq, -+ unsigned commit_flags, -+ bool evict) -+{ -+ struct bch_fs *c = trans->c; -+ struct journal *j = &c->journal; -+ struct btree_iter c_iter, b_iter; -+ struct bkey_cached *ck = NULL; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, -+ BTREE_ITER_SLOTS| -+ BTREE_ITER_INTENT| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_INTENT); -+ b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; -+ -+ ret = bch2_btree_iter_traverse(&c_iter); -+ if (ret) -+ goto out; -+ -+ ck = (void *) c_iter.path->l[0].b; -+ if (!ck) -+ goto out; -+ -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ if (evict) -+ goto evict; -+ goto out; -+ } -+ -+ BUG_ON(!ck->valid); -+ -+ if (journal_seq && ck->journal.seq != journal_seq) -+ goto out; -+ -+ /* -+ * Since journal reclaim depends on us making progress here, and the -+ * 
allocator/copygc depend on journal reclaim making progress, we need -+ * to be using alloc reserves: -+ */ -+ ret = bch2_btree_iter_traverse(&b_iter) ?: -+ bch2_trans_update(trans, &b_iter, ck->k, -+ BTREE_UPDATE_KEY_CACHE_RECLAIM| -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| -+ BTREE_TRIGGER_NORUN) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ (ck->journal.seq == journal_last_seq(j) -+ ? BCH_WATERMARK_reclaim -+ : 0)| -+ commit_flags); -+ -+ bch2_fs_fatal_err_on(ret && -+ !bch2_err_matches(ret, BCH_ERR_transaction_restart) && -+ !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && -+ !bch2_journal_error(j), c, -+ "error flushing key cache: %s", bch2_err_str(ret)); -+ if (ret) -+ goto out; -+ -+ bch2_journal_pin_drop(j, &ck->journal); -+ bch2_journal_preres_put(j, &ck->res); -+ -+ BUG_ON(!btree_node_locked(c_iter.path, 0)); -+ -+ if (!evict) { -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); -+ atomic_long_dec(&c->btree_key_cache.nr_dirty); -+ } -+ } else { -+ struct btree_path *path2; -+evict: -+ trans_for_each_path(trans, path2) -+ if (path2 != c_iter.path) -+ __bch2_btree_path_unlock(trans, path2); -+ -+ bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c); -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); -+ atomic_long_dec(&c->btree_key_cache.nr_dirty); -+ } -+ -+ mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED); -+ bkey_cached_evict(&c->btree_key_cache, ck); -+ bkey_cached_free_fast(&c->btree_key_cache, ck); -+ } -+out: -+ bch2_trans_iter_exit(trans, &b_iter); -+ bch2_trans_iter_exit(trans, &c_iter); -+ return ret; -+} -+ -+int bch2_btree_key_cache_journal_flush(struct journal *j, -+ struct journal_entry_pin *pin, u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bkey_cached *ck = -+ container_of(pin, struct bkey_cached, journal); -+ struct bkey_cached_key key; -+ struct btree_trans trans; -+ int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read); -+ key = ck->key; -+ -+ if (ck->journal.seq != seq || -+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ six_unlock_read(&ck->c.lock); -+ goto unlock; -+ } -+ -+ if (ck->seq != seq) { -+ bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, -+ bch2_btree_key_cache_journal_flush); -+ six_unlock_read(&ck->c.lock); -+ goto unlock; -+ } -+ six_unlock_read(&ck->c.lock); -+ -+ ret = commit_do(&trans, NULL, NULL, 0, -+ btree_key_cache_flush_pos(&trans, key, seq, -+ BTREE_INSERT_JOURNAL_RECLAIM, false)); -+unlock: -+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+/* -+ * Flush and evict a key from the key cache: -+ */ -+int bch2_btree_key_cache_flush(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached_key key = { id, pos }; -+ -+ /* Fastpath - assume it won't be found: */ -+ if (!bch2_btree_key_cache_find(c, id, pos)) -+ return 0; -+ -+ return btree_key_cache_flush_pos(trans, key, 0, 0, true); -+} -+ -+bool bch2_btree_insert_key_cached(struct btree_trans *trans, -+ unsigned flags, -+ struct btree_insert_entry *insert_entry) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck = (void *) insert_entry->path->l[0].b; -+ struct bkey_i *insert = insert_entry->k; -+ bool 
kick_reclaim = false; -+ -+ BUG_ON(insert->k.u64s > ck->u64s); -+ -+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ int difference; -+ -+ BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s); -+ -+ difference = jset_u64s(insert->k.u64s) - ck->res.u64s; -+ if (difference > 0) { -+ trans->journal_preres.u64s -= difference; -+ ck->res.u64s += difference; -+ } -+ } -+ -+ bkey_copy(ck->k, insert); -+ ck->valid = true; -+ -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); -+ set_bit(BKEY_CACHED_DIRTY, &ck->flags); -+ atomic_long_inc(&c->btree_key_cache.nr_dirty); -+ -+ if (bch2_nr_btree_keys_need_flush(c)) -+ kick_reclaim = true; -+ } -+ -+ /* -+ * To minimize lock contention, we only add the journal pin here and -+ * defer pin updates to the flush callback via ->seq. Be careful not to -+ * update ->seq on nojournal commits because we don't want to update the -+ * pin to a seq that doesn't include journal updates on disk. Otherwise -+ * we risk losing the update after a crash. -+ * -+ * The only exception is if the pin is not active in the first place. We -+ * have to add the pin because journal reclaim drives key cache -+ * flushing. The flush callback will not proceed unless ->seq matches -+ * the latest pin, so make sure it starts with a consistent value. -+ */ -+ if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || -+ !journal_pin_active(&ck->journal)) { -+ ck->seq = trans->journal_res.seq; -+ } -+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, -+ &ck->journal, bch2_btree_key_cache_journal_flush); -+ -+ if (kick_reclaim) -+ journal_reclaim_kick(&c->journal); -+ return true; -+} -+ -+void bch2_btree_key_cache_drop(struct btree_trans *trans, -+ struct btree_path *path) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck = (void *) path->l[0].b; -+ -+ BUG_ON(!ck->valid); -+ -+ /* -+ * We just did an update to the btree, bypassing the key cache: the key -+ * cache key is now stale and must be dropped, even if dirty: -+ */ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); -+ atomic_long_dec(&c->btree_key_cache.nr_dirty); -+ bch2_journal_pin_drop(&c->journal, &ck->journal); -+ } -+ -+ ck->valid = false; -+} -+ -+static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, -+ struct shrink_control *sc) -+{ -+ struct bch_fs *c = container_of(shrink, struct bch_fs, -+ btree_key_cache.shrink); -+ struct btree_key_cache *bc = &c->btree_key_cache; -+ struct bucket_table *tbl; -+ struct bkey_cached *ck, *t; -+ size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; -+ unsigned start, flags; -+ int srcu_idx; -+ -+ mutex_lock(&bc->lock); -+ srcu_idx = srcu_read_lock(&c->btree_trans_barrier); -+ flags = memalloc_nofs_save(); -+ -+ /* -+ * Newest freed entries are at the end of the list - once we hit one -+ * that's too new to be freed, we can bail out: -+ */ -+ list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { -+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, -+ ck->btree_trans_barrier_seq)) -+ break; -+ -+ list_del(&ck->list); -+ six_lock_exit(&ck->c.lock); -+ kmem_cache_free(bch2_key_cache, ck); -+ atomic_long_dec(&bc->nr_freed); -+ scanned++; -+ freed++; -+ } -+ -+ if (scanned >= nr) -+ goto out; -+ -+ list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { -+ if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, -+ ck->btree_trans_barrier_seq)) -+ break; -+ -+ list_del(&ck->list); -+ six_lock_exit(&ck->c.lock); -+ 
kmem_cache_free(bch2_key_cache, ck); -+ atomic_long_dec(&bc->nr_freed); -+ scanned++; -+ freed++; -+ } -+ -+ if (scanned >= nr) -+ goto out; -+ -+ rcu_read_lock(); -+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); -+ if (bc->shrink_iter >= tbl->size) -+ bc->shrink_iter = 0; -+ start = bc->shrink_iter; -+ -+ do { -+ struct rhash_head *pos, *next; -+ -+ pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); -+ -+ while (!rht_is_a_nulls(pos)) { -+ next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); -+ ck = container_of(pos, struct bkey_cached, hash); -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) -+ goto next; -+ -+ if (test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) -+ clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); -+ else if (bkey_cached_lock_for_evict(ck)) { -+ bkey_cached_evict(bc, ck); -+ bkey_cached_free(bc, ck); -+ } -+ -+ scanned++; -+ if (scanned >= nr) -+ break; -+next: -+ pos = next; -+ } -+ -+ bc->shrink_iter++; -+ if (bc->shrink_iter >= tbl->size) -+ bc->shrink_iter = 0; -+ } while (scanned < nr && bc->shrink_iter != start); -+ -+ rcu_read_unlock(); -+out: -+ memalloc_nofs_restore(flags); -+ srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); -+ mutex_unlock(&bc->lock); -+ -+ return freed; -+} -+ -+static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, -+ struct shrink_control *sc) -+{ -+ struct bch_fs *c = container_of(shrink, struct bch_fs, -+ btree_key_cache.shrink); -+ struct btree_key_cache *bc = &c->btree_key_cache; -+ long nr = atomic_long_read(&bc->nr_keys) - -+ atomic_long_read(&bc->nr_dirty); -+ -+ return max(0L, nr); -+} -+ -+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) -+{ -+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); -+ struct bucket_table *tbl; -+ struct bkey_cached *ck, *n; -+ struct rhash_head *pos; -+ LIST_HEAD(items); -+ unsigned i; -+#ifdef __KERNEL__ -+ int cpu; -+#endif -+ -+ unregister_shrinker(&bc->shrink); -+ -+ mutex_lock(&bc->lock); -+ -+ /* -+ * The loop is needed to guard against racing with rehash: -+ */ -+ while (atomic_long_read(&bc->nr_keys)) { -+ rcu_read_lock(); -+ tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); -+ if (tbl) -+ for (i = 0; i < tbl->size; i++) -+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { -+ bkey_cached_evict(bc, ck); -+ list_add(&ck->list, &items); -+ } -+ rcu_read_unlock(); -+ } -+ -+#ifdef __KERNEL__ -+ for_each_possible_cpu(cpu) { -+ struct btree_key_cache_freelist *f = -+ per_cpu_ptr(bc->pcpu_freed, cpu); -+ -+ for (i = 0; i < f->nr; i++) { -+ ck = f->objs[i]; -+ list_add(&ck->list, &items); -+ } -+ } -+#endif -+ -+ list_splice(&bc->freed_pcpu, &items); -+ list_splice(&bc->freed_nonpcpu, &items); -+ -+ mutex_unlock(&bc->lock); -+ -+ list_for_each_entry_safe(ck, n, &items, list) { -+ cond_resched(); -+ -+ bch2_journal_pin_drop(&c->journal, &ck->journal); -+ bch2_journal_preres_put(&c->journal, &ck->res); -+ -+ list_del(&ck->list); -+ kfree(ck->k); -+ six_lock_exit(&ck->c.lock); -+ kmem_cache_free(bch2_key_cache, ck); -+ } -+ -+ if (atomic_long_read(&bc->nr_dirty) && -+ !bch2_journal_error(&c->journal) && -+ test_bit(BCH_FS_WAS_RW, &c->flags)) -+ panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", -+ atomic_long_read(&bc->nr_dirty)); -+ -+ if (atomic_long_read(&bc->nr_keys)) -+ panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", -+ atomic_long_read(&bc->nr_keys)); -+ -+ if (bc->table_init_done) -+ rhashtable_destroy(&bc->table); -+ -+ free_percpu(bc->pcpu_freed); -+} -+ -+void 
bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) -+{ -+ mutex_init(&c->lock); -+ INIT_LIST_HEAD(&c->freed_pcpu); -+ INIT_LIST_HEAD(&c->freed_nonpcpu); -+} -+ -+static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) -+{ -+ struct btree_key_cache *bc = -+ container_of(shrink, struct btree_key_cache, shrink); -+ char *cbuf; -+ size_t buflen = seq_buf_get_buf(s, &cbuf); -+ struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); -+ -+ bch2_btree_key_cache_to_text(&out, bc); -+ seq_buf_commit(s, out.pos); -+} -+ -+int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) -+{ -+ struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); -+ -+#ifdef __KERNEL__ -+ bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); -+ if (!bc->pcpu_freed) -+ return -BCH_ERR_ENOMEM_fs_btree_cache_init; -+#endif -+ -+ if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) -+ return -BCH_ERR_ENOMEM_fs_btree_cache_init; -+ -+ bc->table_init_done = true; -+ -+ bc->shrink.seeks = 0; -+ bc->shrink.count_objects = bch2_btree_key_cache_count; -+ bc->shrink.scan_objects = bch2_btree_key_cache_scan; -+ bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text; -+ if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name)) -+ return -BCH_ERR_ENOMEM_fs_btree_cache_init; -+ return 0; -+} -+ -+void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) -+{ -+ prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed)); -+ prt_newline(out); -+ prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); -+ prt_newline(out); -+ prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); -+ prt_newline(out); -+} -+ -+void bch2_btree_key_cache_exit(void) -+{ -+ kmem_cache_destroy(bch2_key_cache); -+} -+ -+int __init bch2_btree_key_cache_init(void) -+{ -+ bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); -+ if (!bch2_key_cache) -+ return -ENOMEM; -+ -+ return 0; -+} -diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h -new file mode 100644 -index 000000000..be3acde2c ---- /dev/null -+++ b/fs/bcachefs/btree_key_cache.h -@@ -0,0 +1,48 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_KEY_CACHE_H -+#define _BCACHEFS_BTREE_KEY_CACHE_H -+ -+static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) -+{ -+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); -+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); -+ size_t max_dirty = 1024 + nr_keys / 2; -+ -+ return max_t(ssize_t, 0, nr_dirty - max_dirty); -+} -+ -+static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) -+{ -+ size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); -+ size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); -+ size_t max_dirty = 4096 + (nr_keys * 3) / 4; -+ -+ return nr_dirty > max_dirty; -+} -+ -+int bch2_btree_key_cache_journal_flush(struct journal *, -+ struct journal_entry_pin *, u64); -+ -+struct bkey_cached * -+bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); -+ -+int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, -+ unsigned); -+ -+bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, -+ struct btree_insert_entry *); -+int bch2_btree_key_cache_flush(struct btree_trans *, -+ enum btree_id, struct bpos); -+void bch2_btree_key_cache_drop(struct btree_trans *, -+ struct btree_path *); -+ -+void bch2_fs_btree_key_cache_exit(struct btree_key_cache 
*); -+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); -+int bch2_fs_btree_key_cache_init(struct btree_key_cache *); -+ -+void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); -+ -+void bch2_btree_key_cache_exit(void); -+int __init bch2_btree_key_cache_init(void); -+ -+#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ -diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c -new file mode 100644 -index 000000000..0b0f9d607 ---- /dev/null -+++ b/fs/bcachefs/btree_locking.c -@@ -0,0 +1,797 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_locking.h" -+#include "btree_types.h" -+ -+static struct lock_class_key bch2_btree_node_lock_key; -+ -+void bch2_btree_lock_init(struct btree_bkey_cached_common *b, -+ enum six_lock_init_flags flags) -+{ -+ __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ lockdep_set_no_check_recursion(&b->lock.dep_map); -+#endif -+} -+ -+#ifdef CONFIG_LOCKDEP -+void bch2_assert_btree_nodes_not_locked(void) -+{ -+ BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); -+} -+#endif -+ -+/* Btree node locking: */ -+ -+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, -+ struct btree_path *skip, -+ struct btree_bkey_cached_common *b, -+ unsigned level) -+{ -+ struct btree_path *path; -+ struct six_lock_count ret; -+ -+ memset(&ret, 0, sizeof(ret)); -+ -+ if (IS_ERR_OR_NULL(b)) -+ return ret; -+ -+ trans_for_each_path(trans, path) -+ if (path != skip && &path->l[level].b->c == b) { -+ int t = btree_node_locked_type(path, level); -+ -+ if (t != BTREE_NODE_UNLOCKED) -+ ret.n[t]++; -+ } -+ -+ return ret; -+} -+ -+/* unlock */ -+ -+void bch2_btree_node_unlock_write(struct btree_trans *trans, -+ struct btree_path *path, struct btree *b) -+{ -+ bch2_btree_node_unlock_write_inlined(trans, path, b); -+} -+ -+/* lock */ -+ -+/* -+ * @trans wants to lock @b with type @type -+ */ -+struct trans_waiting_for_lock { -+ struct btree_trans *trans; -+ struct btree_bkey_cached_common *node_want; -+ enum six_lock_type lock_want; -+ -+ /* for iterating over held locks :*/ -+ u8 path_idx; -+ u8 level; -+ u64 lock_start_time; -+}; -+ -+struct lock_graph { -+ struct trans_waiting_for_lock g[8]; -+ unsigned nr; -+}; -+ -+static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) -+{ -+ struct trans_waiting_for_lock *i; -+ -+ prt_printf(out, "Found lock cycle (%u entries):", g->nr); -+ prt_newline(out); -+ -+ for (i = g->g; i < g->g + g->nr; i++) -+ bch2_btree_trans_to_text(out, i->trans); -+} -+ -+static noinline void print_chain(struct printbuf *out, struct lock_graph *g) -+{ -+ struct trans_waiting_for_lock *i; -+ -+ for (i = g->g; i != g->g + g->nr; i++) { -+ if (i != g->g) -+ prt_str(out, "<- "); -+ prt_printf(out, "%u ", i->trans->locking_wait.task->pid); -+ } -+ prt_newline(out); -+} -+ -+static void lock_graph_up(struct lock_graph *g) -+{ -+ closure_put(&g->g[--g->nr].trans->ref); -+} -+ -+static noinline void lock_graph_pop_all(struct lock_graph *g) -+{ -+ while (g->nr) -+ lock_graph_up(g); -+} -+ -+static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) -+{ -+ g->g[g->nr++] = (struct trans_waiting_for_lock) { -+ .trans = trans, -+ .node_want = trans->locking, -+ .lock_want = trans->locking_wait.lock_want, -+ }; -+} -+ -+static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) -+{ -+ closure_get(&trans->ref); -+ __lock_graph_down(g, trans); -+} -+ -+static bool 
lock_graph_remove_non_waiters(struct lock_graph *g) -+{ -+ struct trans_waiting_for_lock *i; -+ -+ for (i = g->g + 1; i < g->g + g->nr; i++) -+ if (i->trans->locking != i->node_want || -+ i->trans->locking_wait.start_time != i[-1].lock_start_time) { -+ while (g->g + g->nr > i) -+ lock_graph_up(g); -+ return true; -+ } -+ -+ return false; -+} -+ -+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) -+{ -+ if (i == g->g) { -+ trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); -+ return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); -+ } else { -+ i->trans->lock_must_abort = true; -+ wake_up_process(i->trans->locking_wait.task); -+ return 0; -+ } -+} -+ -+static int btree_trans_abort_preference(struct btree_trans *trans) -+{ -+ if (trans->lock_may_not_fail) -+ return 0; -+ if (trans->locking_wait.lock_want == SIX_LOCK_write) -+ return 1; -+ if (!trans->in_traverse_all) -+ return 2; -+ return 3; -+} -+ -+static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) -+{ -+ struct trans_waiting_for_lock *i, *abort = NULL; -+ unsigned best = 0, pref; -+ int ret; -+ -+ if (lock_graph_remove_non_waiters(g)) -+ return 0; -+ -+ /* Only checking, for debugfs: */ -+ if (cycle) { -+ print_cycle(cycle, g); -+ ret = -1; -+ goto out; -+ } -+ -+ for (i = g->g; i < g->g + g->nr; i++) { -+ pref = btree_trans_abort_preference(i->trans); -+ if (pref > best) { -+ abort = i; -+ best = pref; -+ } -+ } -+ -+ if (unlikely(!best)) { -+ struct printbuf buf = PRINTBUF; -+ -+ prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); -+ -+ for (i = g->g; i < g->g + g->nr; i++) { -+ struct btree_trans *trans = i->trans; -+ -+ bch2_btree_trans_to_text(&buf, trans); -+ -+ prt_printf(&buf, "backtrace:"); -+ prt_newline(&buf); -+ printbuf_indent_add(&buf, 2); -+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task); -+ printbuf_indent_sub(&buf, 2); -+ prt_newline(&buf); -+ } -+ -+ bch2_print_string_as_lines(KERN_ERR, buf.buf); -+ printbuf_exit(&buf); -+ BUG(); -+ } -+ -+ ret = abort_lock(g, abort); -+out: -+ if (ret) -+ while (g->nr) -+ lock_graph_up(g); -+ return ret; -+} -+ -+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, -+ struct printbuf *cycle) -+{ -+ struct btree_trans *orig_trans = g->g->trans; -+ struct trans_waiting_for_lock *i; -+ -+ for (i = g->g; i < g->g + g->nr; i++) -+ if (i->trans == trans) { -+ closure_put(&trans->ref); -+ return break_cycle(g, cycle); -+ } -+ -+ if (g->nr == ARRAY_SIZE(g->g)) { -+ closure_put(&trans->ref); -+ -+ if (orig_trans->lock_may_not_fail) -+ return 0; -+ -+ while (g->nr) -+ lock_graph_up(g); -+ -+ if (cycle) -+ return 0; -+ -+ trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); -+ return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); -+ } -+ -+ __lock_graph_down(g, trans); -+ return 0; -+} -+ -+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) -+{ -+ return t1 + t2 > 1; -+} -+ -+int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) -+{ -+ struct lock_graph g; -+ struct trans_waiting_for_lock *top; -+ struct btree_bkey_cached_common *b; -+ struct btree_path *path; -+ unsigned path_idx; -+ int ret; -+ -+ if (trans->lock_must_abort) { -+ if (cycle) -+ return -1; -+ -+ trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_); -+ return btree_trans_restart(trans, 
BCH_ERR_transaction_restart_would_deadlock); -+ } -+ -+ g.nr = 0; -+ lock_graph_down(&g, trans); -+next: -+ if (!g.nr) -+ return 0; -+ -+ top = &g.g[g.nr - 1]; -+ -+ trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) { -+ if (!path->nodes_locked) -+ continue; -+ -+ if (path_idx != top->path_idx) { -+ top->path_idx = path_idx; -+ top->level = 0; -+ top->lock_start_time = 0; -+ } -+ -+ for (; -+ top->level < BTREE_MAX_DEPTH; -+ top->level++, top->lock_start_time = 0) { -+ int lock_held = btree_node_locked_type(path, top->level); -+ -+ if (lock_held == BTREE_NODE_UNLOCKED) -+ continue; -+ -+ b = &READ_ONCE(path->l[top->level].b)->c; -+ -+ if (IS_ERR_OR_NULL(b)) { -+ /* -+ * If we get here, it means we raced with the -+ * other thread updating its btree_path -+ * structures - which means it can't be blocked -+ * waiting on a lock: -+ */ -+ if (!lock_graph_remove_non_waiters(&g)) { -+ /* -+ * If lock_graph_remove_non_waiters() -+ * didn't do anything, it must be -+ * because we're being called by debugfs -+ * checking for lock cycles, which -+ * invokes us on btree_transactions that -+ * aren't actually waiting on anything. -+ * Just bail out: -+ */ -+ lock_graph_pop_all(&g); -+ } -+ -+ goto next; -+ } -+ -+ if (list_empty_careful(&b->lock.wait_list)) -+ continue; -+ -+ raw_spin_lock(&b->lock.wait_lock); -+ list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) { -+ BUG_ON(b != trans->locking); -+ -+ if (top->lock_start_time && -+ time_after_eq64(top->lock_start_time, trans->locking_wait.start_time)) -+ continue; -+ -+ top->lock_start_time = trans->locking_wait.start_time; -+ -+ /* Don't check for self deadlock: */ -+ if (trans == top->trans || -+ !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) -+ continue; -+ -+ closure_get(&trans->ref); -+ raw_spin_unlock(&b->lock.wait_lock); -+ -+ ret = lock_graph_descend(&g, trans, cycle); -+ if (ret) -+ return ret; -+ goto next; -+ -+ } -+ raw_spin_unlock(&b->lock.wait_lock); -+ } -+ } -+ -+ if (g.nr > 1 && cycle) -+ print_chain(cycle, &g); -+ lock_graph_up(&g); -+ goto next; -+} -+ -+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) -+{ -+ struct btree_trans *trans = p; -+ -+ return bch2_check_for_deadlock(trans, NULL); -+} -+ -+int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, -+ struct btree_bkey_cached_common *b, -+ bool lock_may_not_fail) -+{ -+ int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; -+ int ret; -+ -+ /* -+ * Must drop our read locks before calling six_lock_write() - -+ * six_unlock() won't do wakeups until the reader count -+ * goes to 0, and it's safe because we have the node intent -+ * locked: -+ */ -+ six_lock_readers_add(&b->lock, -readers); -+ ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, -+ lock_may_not_fail, _RET_IP_); -+ six_lock_readers_add(&b->lock, readers); -+ -+ if (ret) -+ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_INTENT_LOCKED); -+ -+ return ret; -+} -+ -+void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree_bkey_cached_common *b) -+{ -+ struct btree_path *linked; -+ unsigned i; -+ int ret; -+ -+ /* -+ * XXX BIG FAT NOTICE -+ * -+ * Drop all read locks before taking a write lock: -+ * -+ * This is a hack, because bch2_btree_node_lock_write_nofail() is a -+ * hack - but by dropping read locks first, this should never fail, and -+ * we only use this in code paths where whatever read locks we've -+ * already taken 
are no longer needed: -+ */ -+ -+ trans_for_each_path(trans, linked) { -+ if (!linked->nodes_locked) -+ continue; -+ -+ for (i = 0; i < BTREE_MAX_DEPTH; i++) -+ if (btree_node_read_locked(linked, i)) { -+ btree_node_unlock(trans, linked, i); -+ btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK); -+ } -+ } -+ -+ ret = __btree_node_lock_write(trans, path, b, true); -+ BUG_ON(ret); -+} -+ -+/* relock */ -+ -+static inline bool btree_path_get_locks(struct btree_trans *trans, -+ struct btree_path *path, -+ bool upgrade) -+{ -+ unsigned l = path->level; -+ int fail_idx = -1; -+ -+ do { -+ if (!btree_path_node(path, l)) -+ break; -+ -+ if (!(upgrade -+ ? bch2_btree_node_upgrade(trans, path, l) -+ : bch2_btree_node_relock(trans, path, l))) -+ fail_idx = l; -+ -+ l++; -+ } while (l < path->locks_want); -+ -+ /* -+ * When we fail to get a lock, we have to ensure that any child nodes -+ * can't be relocked so bch2_btree_path_traverse has to walk back up to -+ * the node that we failed to relock: -+ */ -+ if (fail_idx >= 0) { -+ __bch2_btree_path_unlock(trans, path); -+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); -+ -+ do { -+ path->l[fail_idx].b = upgrade -+ ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) -+ : ERR_PTR(-BCH_ERR_no_btree_node_relock); -+ --fail_idx; -+ } while (fail_idx >= 0); -+ } -+ -+ if (path->uptodate == BTREE_ITER_NEED_RELOCK) -+ path->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_trans_verify_locks(trans); -+ -+ return path->uptodate < BTREE_ITER_NEED_RELOCK; -+} -+ -+bool __bch2_btree_node_relock(struct btree_trans *trans, -+ struct btree_path *path, unsigned level, -+ bool trace) -+{ -+ struct btree *b = btree_path_node(path, level); -+ int want = __btree_lock_want(path, level); -+ -+ if (race_fault()) -+ goto fail; -+ -+ if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || -+ (btree_node_lock_seq_matches(path, b, level) && -+ btree_node_lock_increment(trans, &b->c, level, want))) { -+ mark_btree_node_locked(trans, path, level, want); -+ return true; -+ } -+fail: -+ if (trace && !trans->notrace_relock_fail) -+ trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); -+ return false; -+} -+ -+/* upgrade */ -+ -+bool bch2_btree_node_upgrade(struct btree_trans *trans, -+ struct btree_path *path, unsigned level) -+{ -+ struct btree *b = path->l[level].b; -+ struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level); -+ -+ if (!is_btree_node(path, level)) -+ return false; -+ -+ switch (btree_lock_want(path, level)) { -+ case BTREE_NODE_UNLOCKED: -+ BUG_ON(btree_node_locked(path, level)); -+ return true; -+ case BTREE_NODE_READ_LOCKED: -+ BUG_ON(btree_node_intent_locked(path, level)); -+ return bch2_btree_node_relock(trans, path, level); -+ case BTREE_NODE_INTENT_LOCKED: -+ break; -+ case BTREE_NODE_WRITE_LOCKED: -+ BUG(); -+ } -+ -+ if (btree_node_intent_locked(path, level)) -+ return true; -+ -+ if (race_fault()) -+ return false; -+ -+ if (btree_node_locked(path, level)) { -+ bool ret; -+ -+ six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]); -+ ret = six_lock_tryupgrade(&b->c.lock); -+ six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]); -+ -+ if (ret) -+ goto success; -+ } else { -+ if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) -+ goto success; -+ } -+ -+ /* -+ * Do we already have an intent lock via another path? 
If so, just bump -+ * lock count: -+ */ -+ if (btree_node_lock_seq_matches(path, b, level) && -+ btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { -+ btree_node_unlock(trans, path, level); -+ goto success; -+ } -+ -+ trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); -+ return false; -+success: -+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); -+ return true; -+} -+ -+/* Btree path locking: */ -+ -+/* -+ * Only for btree_cache.c - only relocks intent locks -+ */ -+int bch2_btree_path_relock_intent(struct btree_trans *trans, -+ struct btree_path *path) -+{ -+ unsigned l; -+ -+ for (l = path->level; -+ l < path->locks_want && btree_path_node(path, l); -+ l++) { -+ if (!bch2_btree_node_relock(trans, path, l)) { -+ __bch2_btree_path_unlock(trans, path); -+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); -+ trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); -+ } -+ } -+ -+ return 0; -+} -+ -+__flatten -+bool bch2_btree_path_relock_norestart(struct btree_trans *trans, -+ struct btree_path *path, unsigned long trace_ip) -+{ -+ return btree_path_get_locks(trans, path, false); -+} -+ -+int __bch2_btree_path_relock(struct btree_trans *trans, -+ struct btree_path *path, unsigned long trace_ip) -+{ -+ if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { -+ trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); -+ } -+ -+ return 0; -+} -+ -+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned new_locks_want) -+{ -+ EBUG_ON(path->locks_want >= new_locks_want); -+ -+ path->locks_want = new_locks_want; -+ -+ return btree_path_get_locks(trans, path, true); -+} -+ -+bool __bch2_btree_path_upgrade(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned new_locks_want) -+{ -+ struct btree_path *linked; -+ -+ if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want)) -+ return true; -+ -+ /* -+ * XXX: this is ugly - we'd prefer to not be mucking with other -+ * iterators in the btree_trans here. -+ * -+ * On failure to upgrade the iterator, setting iter->locks_want and -+ * calling get_locks() is sufficient to make bch2_btree_path_traverse() -+ * get the locks we want on transaction restart. -+ * -+ * But if this iterator was a clone, on transaction restart what we did -+ * to this iterator isn't going to be preserved. -+ * -+ * Possibly we could add an iterator field for the parent iterator when -+ * an iterator is a copy - for now, we'll just upgrade any other -+ * iterators with the same btree id. -+ * -+ * The code below used to be needed to ensure ancestor nodes get locked -+ * before interior nodes - now that's handled by -+ * bch2_btree_path_traverse_all(). 
-+ */ -+ if (!path->cached && !trans->in_traverse_all) -+ trans_for_each_path(trans, linked) -+ if (linked != path && -+ linked->cached == path->cached && -+ linked->btree_id == path->btree_id && -+ linked->locks_want < new_locks_want) { -+ linked->locks_want = new_locks_want; -+ btree_path_get_locks(trans, linked, true); -+ } -+ -+ return false; -+} -+ -+void __bch2_btree_path_downgrade(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned new_locks_want) -+{ -+ unsigned l; -+ -+ EBUG_ON(path->locks_want < new_locks_want); -+ -+ path->locks_want = new_locks_want; -+ -+ while (path->nodes_locked && -+ (l = btree_path_highest_level_locked(path)) >= path->locks_want) { -+ if (l > path->level) { -+ btree_node_unlock(trans, path, l); -+ } else { -+ if (btree_node_intent_locked(path, l)) { -+ six_lock_downgrade(&path->l[l].b->c.lock); -+ mark_btree_node_locked_noreset(path, l, BTREE_NODE_READ_LOCKED); -+ } -+ break; -+ } -+ } -+ -+ bch2_btree_path_verify_locks(path); -+} -+ -+/* Btree transaction locking: */ -+ -+void bch2_trans_downgrade(struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ -+ trans_for_each_path(trans, path) -+ bch2_btree_path_downgrade(trans, path); -+} -+ -+int bch2_trans_relock(struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ -+ if (unlikely(trans->restarted)) -+ return -((int) trans->restarted); -+ -+ trans_for_each_path(trans, path) -+ if (path->should_be_locked && -+ !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { -+ trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); -+ } -+ return 0; -+} -+ -+int bch2_trans_relock_notrace(struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ -+ if (unlikely(trans->restarted)) -+ return -((int) trans->restarted); -+ -+ trans_for_each_path(trans, path) -+ if (path->should_be_locked && -+ !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); -+ } -+ return 0; -+} -+ -+void bch2_trans_unlock_noassert(struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ -+ trans_for_each_path(trans, path) -+ __bch2_btree_path_unlock(trans, path); -+} -+ -+void bch2_trans_unlock(struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ -+ trans_for_each_path(trans, path) -+ __bch2_btree_path_unlock(trans, path); -+ -+ /* -+ * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking -+ * btree nodes, it implements its own walking: -+ */ -+ if (!trans->is_initial_gc) -+ bch2_assert_btree_nodes_not_locked(); -+} -+ -+bool bch2_trans_locked(struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ -+ trans_for_each_path(trans, path) -+ if (path->nodes_locked) -+ return true; -+ return false; -+} -+ -+int __bch2_trans_mutex_lock(struct btree_trans *trans, -+ struct mutex *lock) -+{ -+ int ret = drop_locks_do(trans, (mutex_lock(lock), 0)); -+ -+ if (ret) -+ mutex_unlock(lock); -+ return ret; -+} -+ -+/* Debug */ -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void bch2_btree_path_verify_locks(struct btree_path *path) -+{ -+ unsigned l; -+ -+ if (!path->nodes_locked) { -+ BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && -+ btree_path_node(path, path->level)); -+ return; -+ } -+ -+ for (l = 0; l < BTREE_MAX_DEPTH; l++) { -+ int want = btree_lock_want(path, l); -+ int have = btree_node_locked_type(path, l); -+ -+ BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); -+ -+ BUG_ON(is_btree_node(path, l) && -+ (want 
== BTREE_NODE_UNLOCKED || -+ have != BTREE_NODE_WRITE_LOCKED) && -+ want != have); -+ } -+} -+ -+void bch2_trans_verify_locks(struct btree_trans *trans) -+{ -+ struct btree_path *path; -+ -+ trans_for_each_path(trans, path) -+ bch2_btree_path_verify_locks(path); -+} -+ -+#endif -diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h -new file mode 100644 -index 000000000..22e2cd391 ---- /dev/null -+++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,423 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_LOCKING_H -+#define _BCACHEFS_BTREE_LOCKING_H -+ -+/* -+ * Only for internal btree use: -+ * -+ * The btree iterator tracks what locks it wants to take, and what locks it -+ * currently has - here we have wrappers for locking/unlocking btree nodes and -+ * updating the iterator state -+ */ -+ -+#include "btree_iter.h" -+#include "six.h" -+ -+void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); -+ -+#ifdef CONFIG_LOCKDEP -+void bch2_assert_btree_nodes_not_locked(void); -+#else -+static inline void bch2_assert_btree_nodes_not_locked(void) {} -+#endif -+ -+void bch2_trans_unlock_noassert(struct btree_trans *); -+ -+static inline bool is_btree_node(struct btree_path *path, unsigned l) -+{ -+ return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); -+} -+ -+static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) -+{ -+ return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) -+ ? &trans->c->btree_transaction_stats[trans->fn_idx] -+ : NULL; -+} -+ -+/* matches six lock types */ -+enum btree_node_locked_type { -+ BTREE_NODE_UNLOCKED = -1, -+ BTREE_NODE_READ_LOCKED = SIX_LOCK_read, -+ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, -+ BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write, -+}; -+ -+static inline int btree_node_locked_type(struct btree_path *path, -+ unsigned level) -+{ -+ return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); -+} -+ -+static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) -+{ -+ return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; -+} -+ -+static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l) -+{ -+ return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED; -+} -+ -+static inline bool btree_node_read_locked(struct btree_path *path, unsigned l) -+{ -+ return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED; -+} -+ -+static inline bool btree_node_locked(struct btree_path *path, unsigned level) -+{ -+ return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED; -+} -+ -+static inline void mark_btree_node_locked_noreset(struct btree_path *path, -+ unsigned level, -+ enum btree_node_locked_type type) -+{ -+ /* relying on this to avoid a branch */ -+ BUILD_BUG_ON(SIX_LOCK_read != 0); -+ BUILD_BUG_ON(SIX_LOCK_intent != 1); -+ -+ path->nodes_locked &= ~(3U << (level << 1)); -+ path->nodes_locked |= (type + 1) << (level << 1); -+} -+ -+static inline void mark_btree_node_unlocked(struct btree_path *path, -+ unsigned level) -+{ -+ EBUG_ON(btree_node_write_locked(path, level)); -+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); -+} -+ -+static inline void mark_btree_node_locked(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned level, -+ enum six_lock_type type) -+{ -+ mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); -+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS -+ path->l[level].lock_taken_time = local_clock(); 
-+#endif -+} -+ -+static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) -+{ -+ return level < path->locks_want -+ ? SIX_LOCK_intent -+ : SIX_LOCK_read; -+} -+ -+static inline enum btree_node_locked_type -+btree_lock_want(struct btree_path *path, int level) -+{ -+ if (level < path->level) -+ return BTREE_NODE_UNLOCKED; -+ if (level < path->locks_want) -+ return BTREE_NODE_INTENT_LOCKED; -+ if (level == path->level) -+ return BTREE_NODE_READ_LOCKED; -+ return BTREE_NODE_UNLOCKED; -+} -+ -+static void btree_trans_lock_hold_time_update(struct btree_trans *trans, -+ struct btree_path *path, unsigned level) -+{ -+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS -+ struct btree_transaction_stats *s = btree_trans_stats(trans); -+ -+ if (s) -+ __bch2_time_stats_update(&s->lock_hold_times, -+ path->l[level].lock_taken_time, -+ local_clock()); -+#endif -+} -+ -+/* unlock: */ -+ -+static inline void btree_node_unlock(struct btree_trans *trans, -+ struct btree_path *path, unsigned level) -+{ -+ int lock_type = btree_node_locked_type(path, level); -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ if (lock_type != BTREE_NODE_UNLOCKED) { -+ six_unlock_type(&path->l[level].b->c.lock, lock_type); -+ btree_trans_lock_hold_time_update(trans, path, level); -+ } -+ mark_btree_node_unlocked(path, level); -+} -+ -+static inline int btree_path_lowest_level_locked(struct btree_path *path) -+{ -+ return __ffs(path->nodes_locked) >> 1; -+} -+ -+static inline int btree_path_highest_level_locked(struct btree_path *path) -+{ -+ return __fls(path->nodes_locked) >> 1; -+} -+ -+static inline void __bch2_btree_path_unlock(struct btree_trans *trans, -+ struct btree_path *path) -+{ -+ btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); -+ -+ while (path->nodes_locked) -+ btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); -+} -+ -+/* -+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will -+ * succeed: -+ */ -+static inline void -+bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, -+ struct btree *b) -+{ -+ struct btree_path *linked; -+ -+ EBUG_ON(path->l[b->c.level].b != b); -+ EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); -+ EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); -+ -+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); -+ -+ trans_for_each_path_with_node(trans, b, linked) -+ linked->l[b->c.level].lock_seq++; -+ -+ six_unlock_write(&b->c.lock); -+} -+ -+void bch2_btree_node_unlock_write(struct btree_trans *, -+ struct btree_path *, struct btree *); -+ -+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); -+ -+/* lock: */ -+ -+static inline int __btree_node_lock_nopath(struct btree_trans *trans, -+ struct btree_bkey_cached_common *b, -+ enum six_lock_type type, -+ bool lock_may_not_fail, -+ unsigned long ip) -+{ -+ int ret; -+ -+ trans->lock_may_not_fail = lock_may_not_fail; -+ trans->lock_must_abort = false; -+ trans->locking = b; -+ -+ ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, -+ bch2_six_check_for_deadlock, trans, ip); -+ WRITE_ONCE(trans->locking, NULL); -+ WRITE_ONCE(trans->locking_wait.start_time, 0); -+ return ret; -+} -+ -+static inline int __must_check -+btree_node_lock_nopath(struct btree_trans *trans, -+ struct btree_bkey_cached_common *b, -+ enum six_lock_type type, -+ unsigned long ip) -+{ -+ return __btree_node_lock_nopath(trans, b, type, false, ip); -+} -+ -+static inline void 
btree_node_lock_nopath_nofail(struct btree_trans *trans, -+ struct btree_bkey_cached_common *b, -+ enum six_lock_type type) -+{ -+ int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_); -+ -+ BUG_ON(ret); -+} -+ -+/* -+ * Lock a btree node if we already have it locked on one of our linked -+ * iterators: -+ */ -+static inline bool btree_node_lock_increment(struct btree_trans *trans, -+ struct btree_bkey_cached_common *b, -+ unsigned level, -+ enum btree_node_locked_type want) -+{ -+ struct btree_path *path; -+ -+ trans_for_each_path(trans, path) -+ if (&path->l[level].b->c == b && -+ btree_node_locked_type(path, level) >= want) { -+ six_lock_increment(&b->lock, (enum six_lock_type) want); -+ return true; -+ } -+ -+ return false; -+} -+ -+static inline int btree_node_lock(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree_bkey_cached_common *b, -+ unsigned level, -+ enum six_lock_type type, -+ unsigned long ip) -+{ -+ int ret = 0; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); -+ -+ if (likely(six_trylock_type(&b->lock, type)) || -+ btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || -+ !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) { -+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS -+ path->l[b->level].lock_taken_time = local_clock(); -+#endif -+ } -+ -+ return ret; -+} -+ -+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *, -+ struct btree_bkey_cached_common *b, bool); -+ -+static inline int __btree_node_lock_write(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree_bkey_cached_common *b, -+ bool lock_may_not_fail) -+{ -+ EBUG_ON(&path->l[b->level].b->c != b); -+ EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock)); -+ EBUG_ON(!btree_node_intent_locked(path, b->level)); -+ -+ /* -+ * six locks are unfair, and read locks block while a thread wants a -+ * write lock: thus, we need to tell the cycle detector we have a write -+ * lock _before_ taking the lock: -+ */ -+ mark_btree_node_locked_noreset(path, b->level, BTREE_NODE_WRITE_LOCKED); -+ -+ return likely(six_trylock_write(&b->lock)) -+ ? 0 -+ : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); -+} -+ -+static inline int __must_check -+bch2_btree_node_lock_write(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree_bkey_cached_common *b) -+{ -+ return __btree_node_lock_write(trans, path, b, false); -+} -+ -+void bch2_btree_node_lock_write_nofail(struct btree_trans *, -+ struct btree_path *, -+ struct btree_bkey_cached_common *); -+ -+/* relock: */ -+ -+bool bch2_btree_path_relock_norestart(struct btree_trans *, -+ struct btree_path *, unsigned long); -+int __bch2_btree_path_relock(struct btree_trans *, -+ struct btree_path *, unsigned long); -+ -+static inline int bch2_btree_path_relock(struct btree_trans *trans, -+ struct btree_path *path, unsigned long trace_ip) -+{ -+ return btree_node_locked(path, path->level) -+ ? 
0 -+ : __bch2_btree_path_relock(trans, path, trace_ip); -+} -+ -+bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace); -+ -+static inline bool bch2_btree_node_relock(struct btree_trans *trans, -+ struct btree_path *path, unsigned level) -+{ -+ EBUG_ON(btree_node_locked(path, level) && -+ !btree_node_write_locked(path, level) && -+ btree_node_locked_type(path, level) != __btree_lock_want(path, level)); -+ -+ return likely(btree_node_locked(path, level)) || -+ (!IS_ERR_OR_NULL(path->l[level].b) && -+ __bch2_btree_node_relock(trans, path, level, true)); -+} -+ -+static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, -+ struct btree_path *path, unsigned level) -+{ -+ EBUG_ON(btree_node_locked(path, level) && -+ !btree_node_write_locked(path, level) && -+ btree_node_locked_type(path, level) != __btree_lock_want(path, level)); -+ -+ return likely(btree_node_locked(path, level)) || -+ (!IS_ERR_OR_NULL(path->l[level].b) && -+ __bch2_btree_node_relock(trans, path, level, false)); -+} -+ -+/* upgrade */ -+ -+bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, -+ struct btree_path *, unsigned); -+bool __bch2_btree_path_upgrade(struct btree_trans *, -+ struct btree_path *, unsigned); -+ -+static inline int bch2_btree_path_upgrade(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned new_locks_want) -+{ -+ unsigned old_locks_want = path->locks_want; -+ -+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); -+ -+ if (path->locks_want < new_locks_want -+ ? __bch2_btree_path_upgrade(trans, path, new_locks_want) -+ : path->uptodate == BTREE_ITER_UPTODATE) -+ return 0; -+ -+ trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, -+ old_locks_want, new_locks_want); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); -+} -+ -+/* misc: */ -+ -+static inline void btree_path_set_should_be_locked(struct btree_path *path) -+{ -+ EBUG_ON(!btree_node_locked(path, path->level)); -+ EBUG_ON(path->uptodate); -+ -+ path->should_be_locked = true; -+} -+ -+static inline void __btree_path_set_level_up(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned l) -+{ -+ btree_node_unlock(trans, path, l); -+ path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up); -+} -+ -+static inline void btree_path_set_level_up(struct btree_trans *trans, -+ struct btree_path *path) -+{ -+ __btree_path_set_level_up(trans, path, path->level++); -+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); -+} -+ -+/* debug */ -+ -+struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, -+ struct btree_path *, -+ struct btree_bkey_cached_common *b, -+ unsigned); -+ -+int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_path_verify_locks(struct btree_path *); -+void bch2_trans_verify_locks(struct btree_trans *); -+#else -+static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} -+static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} -+#endif -+ -+#endif /* _BCACHEFS_BTREE_LOCKING_H */ -diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c -new file mode 100644 -index 000000000..eafb0388e ---- /dev/null -+++ b/fs/bcachefs/btree_trans_commit.c -@@ -0,0 +1,1156 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_gc.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_journal_iter.h" -+#include "btree_key_cache.h" 
-+#include "btree_update_interior.h" -+#include "btree_write_buffer.h" -+#include "buckets.h" -+#include "errcode.h" -+#include "error.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "replicas.h" -+#include "snapshot.h" -+ -+#include -+ -+static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bch_fs *c = trans->c; -+ struct bkey u; -+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); -+ -+ if (unlikely(trans->journal_replay_not_finished)) { -+ struct bkey_i *j_k = -+ bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); -+ -+ if (j_k) -+ k = bkey_i_to_s_c(j_k); -+ } -+ -+ u = *k.k; -+ u.needs_whiteout = i->old_k.needs_whiteout; -+ -+ BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); -+ BUG_ON(i->old_v != k.v); -+#endif -+} -+ -+static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) -+{ -+ return i->path->l + i->level; -+} -+ -+static inline bool same_leaf_as_prev(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ return i != trans->updates && -+ insert_l(&i[0])->b == insert_l(&i[-1])->b; -+} -+ -+static inline bool same_leaf_as_next(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ return i + 1 < trans->updates + trans->nr_updates && -+ insert_l(&i[0])->b == insert_l(&i[1])->b; -+} -+ -+inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b) -+{ -+ struct bch_fs *c = trans->c; -+ -+ if (unlikely(btree_node_just_written(b)) && -+ bch2_btree_post_write_cleanup(c, b)) -+ bch2_trans_node_reinit_iter(trans, b); -+ -+ /* -+ * If the last bset has been written, or if it's gotten too big - start -+ * a new bset to insert into: -+ */ -+ if (want_new_bset(c, b)) -+ bch2_btree_init_next(trans, b); -+} -+ -+/* Inserting into a given leaf node (last stage of insert): */ -+ -+/* Handle overwrites and do insert, for non extents: */ -+bool bch2_btree_bset_insert_key(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bkey_i *insert) -+{ -+ struct bkey_packed *k; -+ unsigned clobber_u64s = 0, new_u64s = 0; -+ -+ EBUG_ON(btree_node_just_written(b)); -+ EBUG_ON(bset_written(b, btree_bset_last(b))); -+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); -+ EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); -+ EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); -+ EBUG_ON(insert->k.u64s > -+ bch_btree_keys_u64s_remaining(trans->c, b)); -+ EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); -+ -+ k = bch2_btree_node_iter_peek_all(node_iter, b); -+ if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) -+ k = NULL; -+ -+ /* @k is the key being overwritten/deleted, if any: */ -+ EBUG_ON(k && bkey_deleted(k)); -+ -+ /* Deleting, but not found? 
nothing to do: */ -+ if (bkey_deleted(&insert->k) && !k) -+ return false; -+ -+ if (bkey_deleted(&insert->k)) { -+ /* Deleting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ if (k->needs_whiteout) -+ push_whiteout(trans->c, b, insert->k.p); -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ bch2_bset_delete(b, k, clobber_u64s); -+ goto fix_iter; -+ } else { -+ bch2_btree_path_fix_key_modified(trans, b, k); -+ } -+ -+ return true; -+ } -+ -+ if (k) { -+ /* Overwriting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ insert->k.needs_whiteout = k->needs_whiteout; -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ goto overwrite; -+ } else { -+ bch2_btree_path_fix_key_modified(trans, b, k); -+ } -+ } -+ -+ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -+overwrite: -+ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); -+ new_u64s = k->u64s; -+fix_iter: -+ if (clobber_u64s != new_u64s) -+ bch2_btree_node_iter_fix(trans, path, b, node_iter, k, -+ clobber_u64s, new_u64s); -+ return true; -+} -+ -+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, -+ unsigned i, u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct btree_write *w = container_of(pin, struct btree_write, journal); -+ struct btree *b = container_of(w, struct btree, writes[i]); -+ struct btree_trans trans; -+ unsigned long old, new, v; -+ unsigned idx = w - b->writes; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); -+ v = READ_ONCE(b->flags); -+ -+ do { -+ old = new = v; -+ -+ if (!(old & (1 << BTREE_NODE_dirty)) || -+ !!(old & (1 << BTREE_NODE_write_idx)) != idx || -+ w->journal.seq != seq) -+ break; -+ -+ new &= ~BTREE_WRITE_TYPE_MASK; -+ new |= BTREE_WRITE_journal_reclaim; -+ new |= 1 << BTREE_NODE_need_write; -+ } while ((v = cmpxchg(&b->flags, old, new)) != old); -+ -+ btree_node_write_if_need(c, b, SIX_LOCK_read); -+ six_unlock_read(&b->c.lock); -+ -+ bch2_trans_exit(&trans); -+ return 0; -+} -+ -+int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 0, seq); -+} -+ -+int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 1, seq); -+} -+ -+inline void bch2_btree_add_journal_pin(struct bch_fs *c, -+ struct btree *b, u64 seq) -+{ -+ struct btree_write *w = btree_current_write(b); -+ -+ bch2_journal_pin_add(&c->journal, seq, &w->journal, -+ btree_node_write_idx(b) == 0 -+ ? 
bch2_btree_node_flush0 -+ : bch2_btree_node_flush1); -+} -+ -+/** -+ * btree_insert_key - insert a key one key into a leaf node -+ */ -+inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, -+ struct btree_path *path, -+ struct bkey_i *insert, -+ u64 journal_seq) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = path_l(path)->b; -+ struct bset_tree *t = bset_tree_last(b); -+ struct bset *i = bset(b, t); -+ int old_u64s = bset_u64s(t); -+ int old_live_u64s = b->nr.live_u64s; -+ int live_u64s_added, u64s_added; -+ -+ if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, -+ &path_l(path)->iter, insert))) -+ return; -+ -+ i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); -+ -+ bch2_btree_add_journal_pin(c, b, journal_seq); -+ -+ if (unlikely(!btree_node_dirty(b))) { -+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); -+ set_btree_node_dirty_acct(c, b); -+ } -+ -+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; -+ u64s_added = (int) bset_u64s(t) - old_u64s; -+ -+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); -+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); -+ -+ if (u64s_added > live_u64s_added && -+ bch2_maybe_compact_whiteouts(c, b)) -+ bch2_trans_node_reinit_iter(trans, b); -+} -+ -+/* Cached btree updates: */ -+ -+/* Normal update interface: */ -+ -+static inline void btree_insert_entry_checks(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); -+ BUG_ON(i->cached != i->path->cached); -+ BUG_ON(i->level != i->path->level); -+ BUG_ON(i->btree_id != i->path->btree_id); -+ EBUG_ON(!i->level && -+ !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && -+ test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && -+ i->k->k.p.snapshot && -+ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); -+} -+ -+static noinline int -+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, -+ unsigned long trace_ip) -+{ -+ return drop_locks_do(trans, -+ bch2_journal_preres_get(&trans->c->journal, -+ &trans->journal_preres, -+ trans->journal_preres_u64s, -+ (flags & BCH_WATERMARK_MASK))); -+} -+ -+static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, -+ unsigned flags) -+{ -+ return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, -+ trans->journal_u64s, flags); -+} -+ -+#define JSET_ENTRY_LOG_U64s 4 -+ -+static noinline void journal_transaction_name(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct journal *j = &c->journal; -+ struct jset_entry *entry = -+ bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_log, 0, 0, -+ JSET_ENTRY_LOG_U64s); -+ struct jset_entry_log *l = -+ container_of(entry, struct jset_entry_log, entry); -+ -+ strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); -+} -+ -+static inline int btree_key_can_insert(struct btree_trans *trans, -+ struct btree *b, unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ -+ if (!bch2_btree_node_insert_fits(c, b, u64s)) -+ return -BCH_ERR_btree_insert_btree_node_full; -+ -+ return 0; -+} -+ -+static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, -+ struct btree_path *path, unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck = (void *) path->l[0].b; -+ struct btree_insert_entry *i; -+ unsigned new_u64s; -+ struct bkey_i 
*new_k; -+ -+ EBUG_ON(path->level); -+ -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && -+ bch2_btree_key_cache_must_wait(c) && -+ !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) -+ return -BCH_ERR_btree_insert_need_journal_reclaim; -+ -+ /* -+ * bch2_varint_decode can read past the end of the buffer by at most 7 -+ * bytes (it won't be used): -+ */ -+ u64s += 1; -+ -+ if (u64s <= ck->u64s) -+ return 0; -+ -+ new_u64s = roundup_pow_of_two(u64s); -+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); -+ if (!new_k) { -+ bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", -+ bch2_btree_ids[path->btree_id], new_u64s); -+ return -BCH_ERR_ENOMEM_btree_key_cache_insert; -+ } -+ -+ trans_for_each_update(trans, i) -+ if (i->old_v == &ck->k->v) -+ i->old_v = &new_k->v; -+ -+ ck->u64s = new_u64s; -+ ck->k = new_k; -+ return 0; -+} -+ -+/* Triggers: */ -+ -+static int run_one_mem_trigger(struct btree_trans *trans, -+ struct btree_insert_entry *i, -+ unsigned flags) -+{ -+ struct bkey_s_c old = { &i->old_k, i->old_v }; -+ struct bkey_i *new = i->k; -+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); -+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); -+ int ret; -+ -+ verify_update_old_key(trans, i); -+ -+ if (unlikely(flags & BTREE_TRIGGER_NORUN)) -+ return 0; -+ -+ if (!btree_node_type_needs_gc((enum btree_node_type) i->btree_id)) -+ return 0; -+ -+ if (old_ops->atomic_trigger == new_ops->atomic_trigger && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { -+ ret = bch2_mark_key(trans, i->btree_id, i->level, -+ old, bkey_i_to_s_c(new), -+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); -+ } else { -+ struct bkey _deleted = KEY(0, 0, 0); -+ struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; -+ -+ _deleted.p = i->path->pos; -+ -+ ret = bch2_mark_key(trans, i->btree_id, i->level, -+ deleted, bkey_i_to_s_c(new), -+ BTREE_TRIGGER_INSERT|flags) ?: -+ bch2_mark_key(trans, i->btree_id, i->level, -+ old, deleted, -+ BTREE_TRIGGER_OVERWRITE|flags); -+ } -+ -+ return ret; -+} -+ -+static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, -+ bool overwrite) -+{ -+ /* -+ * Transactional triggers create new btree_insert_entries, so we can't -+ * pass them a pointer to a btree_insert_entry, that memory is going to -+ * move: -+ */ -+ struct bkey old_k = i->old_k; -+ struct bkey_s_c old = { &old_k, i->old_v }; -+ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); -+ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); -+ -+ verify_update_old_key(trans, i); -+ -+ if ((i->flags & BTREE_TRIGGER_NORUN) || -+ !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) -+ return 0; -+ -+ if (!i->insert_trigger_run && -+ !i->overwrite_trigger_run && -+ old_ops->trans_trigger == new_ops->trans_trigger && -+ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { -+ i->overwrite_trigger_run = true; -+ i->insert_trigger_run = true; -+ return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, -+ BTREE_TRIGGER_INSERT| -+ BTREE_TRIGGER_OVERWRITE| -+ i->flags) ?: 1; -+ } else if (overwrite && !i->overwrite_trigger_run) { -+ i->overwrite_trigger_run = true; -+ return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; -+ } else if (!overwrite && !i->insert_trigger_run) { -+ i->insert_trigger_run = true; -+ return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; -+ } else { -+ return 0; -+ } -+} -+ -+static int 
run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, -+ struct btree_insert_entry *btree_id_start) -+{ -+ struct btree_insert_entry *i; -+ bool trans_trigger_run; -+ int ret, overwrite; -+ -+ for (overwrite = 1; overwrite >= 0; --overwrite) { -+ -+ /* -+ * Running triggers will append more updates to the list of updates as -+ * we're walking it: -+ */ -+ do { -+ trans_trigger_run = false; -+ -+ for (i = btree_id_start; -+ i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; -+ i++) { -+ if (i->btree_id != btree_id) -+ continue; -+ -+ ret = run_one_trans_trigger(trans, i, overwrite); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ trans_trigger_run = true; -+ } -+ } while (trans_trigger_run); -+ } -+ -+ return 0; -+} -+ -+static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; -+ unsigned btree_id = 0; -+ int ret = 0; -+ -+ /* -+ * -+ * For a given btree, this algorithm runs insert triggers before -+ * overwrite triggers: this is so that when extents are being moved -+ * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before -+ * they are re-added. -+ */ -+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { -+ if (btree_id == BTREE_ID_alloc) -+ continue; -+ -+ while (btree_id_start < trans->updates + trans->nr_updates && -+ btree_id_start->btree_id < btree_id) -+ btree_id_start++; -+ -+ ret = run_btree_triggers(trans, btree_id, btree_id_start); -+ if (ret) -+ return ret; -+ } -+ -+ trans_for_each_update(trans, i) { -+ if (i->btree_id > BTREE_ID_alloc) -+ break; -+ if (i->btree_id == BTREE_ID_alloc) { -+ ret = run_btree_triggers(trans, BTREE_ID_alloc, i); -+ if (ret) -+ return ret; -+ break; -+ } -+ } -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) -+ BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && -+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && -+ (!i->insert_trigger_run || !i->overwrite_trigger_run)); -+#endif -+ return 0; -+} -+ -+static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ int ret = 0; -+ -+ trans_for_each_update(trans, i) { -+ /* -+ * XXX: synchronization of cached update triggers with gc -+ * XXX: synchronization of interior node updates with gc -+ */ -+ BUG_ON(i->cached || i->level); -+ -+ if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { -+ ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); -+ if (ret) -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+static inline int -+bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry **stopped_at, -+ unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ struct btree_write_buffered_key *wb; -+ struct btree_trans_commit_hook *h; -+ unsigned u64s = 0; -+ bool marking = false; -+ int ret; -+ -+ if (race_fault()) { -+ trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); -+ return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); -+ } -+ -+ /* -+ * Check if the insert will fit in the leaf node with the write lock -+ * held, otherwise another thread could write the node changing the -+ * amount of space available: -+ */ -+ -+ prefetch(&trans->c->journal.flags); -+ -+ trans_for_each_update(trans, i) { -+ /* Multiple inserts might go to same leaf: */ -+ if (!same_leaf_as_prev(trans, i)) -+ u64s = 0; -+ -+ u64s += i->k->k.u64s; 
-+ ret = !i->cached -+ ? btree_key_can_insert(trans, insert_l(i)->b, u64s) -+ : btree_key_can_insert_cached(trans, flags, i->path, u64s); -+ if (ret) { -+ *stopped_at = i; -+ return ret; -+ } -+ -+ if (btree_node_type_needs_gc(i->bkey_type)) -+ marking = true; -+ } -+ -+ if (trans->nr_wb_updates && -+ trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) -+ return -BCH_ERR_btree_insert_need_flush_buffer; -+ -+ /* -+ * Don't get journal reservation until after we know insert will -+ * succeed: -+ */ -+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ ret = bch2_trans_journal_res_get(trans, -+ (flags & BCH_WATERMARK_MASK)| -+ JOURNAL_RES_GET_NONBLOCK); -+ if (ret) -+ return ret; -+ -+ if (unlikely(trans->journal_transaction_names)) -+ journal_transaction_name(trans); -+ } else { -+ trans->journal_res.seq = c->journal.replay_journal_seq; -+ } -+ -+ /* -+ * Not allowed to fail after we've gotten our journal reservation - we -+ * have to use it: -+ */ -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && -+ !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { -+ if (bch2_journal_seq_verify) -+ trans_for_each_update(trans, i) -+ i->k->k.version.lo = trans->journal_res.seq; -+ else if (bch2_inject_invalid_keys) -+ trans_for_each_update(trans, i) -+ i->k->k.version = MAX_VERSION; -+ } -+ -+ if (trans->fs_usage_deltas && -+ bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) -+ return -BCH_ERR_btree_insert_need_mark_replicas; -+ -+ if (trans->nr_wb_updates) { -+ EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); -+ -+ ret = bch2_btree_insert_keys_write_buffer(trans); -+ if (ret) -+ goto revert_fs_usage; -+ } -+ -+ h = trans->hooks; -+ while (h) { -+ ret = h->fn(trans, h); -+ if (ret) -+ goto revert_fs_usage; -+ h = h->next; -+ } -+ -+ trans_for_each_update(trans, i) -+ if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { -+ ret = run_one_mem_trigger(trans, i, i->flags); -+ if (ret) -+ goto fatal_err; -+ } -+ -+ if (unlikely(c->gc_pos.phase)) { -+ ret = bch2_trans_commit_run_gc_triggers(trans); -+ if (ret) -+ goto fatal_err; -+ } -+ -+ if (unlikely(trans->extra_journal_entries.nr)) { -+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), -+ trans->extra_journal_entries.data, -+ trans->extra_journal_entries.nr); -+ -+ trans->journal_res.offset += trans->extra_journal_entries.nr; -+ trans->journal_res.u64s -= trans->extra_journal_entries.nr; -+ } -+ -+ if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ struct journal *j = &c->journal; -+ struct jset_entry *entry; -+ -+ trans_for_each_update(trans, i) { -+ if (i->key_cache_already_flushed) -+ continue; -+ -+ if (i->flags & BTREE_UPDATE_NOJOURNAL) -+ continue; -+ -+ verify_update_old_key(trans, i); -+ -+ if (trans->journal_transaction_names) { -+ entry = bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_overwrite, -+ i->btree_id, i->level, -+ i->old_k.u64s); -+ bkey_reassemble(&entry->start[0], -+ (struct bkey_s_c) { &i->old_k, i->old_v }); -+ } -+ -+ entry = bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_btree_keys, -+ i->btree_id, i->level, -+ i->k->k.u64s); -+ bkey_copy(&entry->start[0], i->k); -+ } -+ -+ trans_for_each_wb_update(trans, wb) { -+ entry = bch2_journal_add_entry(j, &trans->journal_res, -+ BCH_JSET_ENTRY_btree_keys, -+ wb->btree, 0, -+ wb->k.k.u64s); -+ bkey_copy(&entry->start[0], &wb->k); -+ } -+ -+ if (trans->journal_seq) -+ *trans->journal_seq = trans->journal_res.seq; -+ } -+ -+ trans_for_each_update(trans, i) { -+ i->k->k.needs_whiteout = false; 
-+ -+ if (!i->cached) { -+ u64 seq = trans->journal_res.seq; -+ -+ if (i->flags & BTREE_UPDATE_PREJOURNAL) -+ seq = i->seq; -+ -+ bch2_btree_insert_key_leaf(trans, i->path, i->k, seq); -+ } else if (!i->key_cache_already_flushed) -+ bch2_btree_insert_key_cached(trans, flags, i); -+ else { -+ bch2_btree_key_cache_drop(trans, i->path); -+ btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); -+ } -+ } -+ -+ return 0; -+fatal_err: -+ bch2_fatal_error(c); -+revert_fs_usage: -+ if (trans->fs_usage_deltas) -+ bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); -+ return ret; -+} -+ -+static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) -+{ -+ while (--i >= trans->updates) { -+ if (same_leaf_as_prev(trans, i)) -+ continue; -+ -+ bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); -+ } -+ -+ trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); -+} -+ -+static inline int trans_lock_write(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) { -+ if (same_leaf_as_prev(trans, i)) -+ continue; -+ -+ if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) -+ return trans_lock_write_fail(trans, i); -+ -+ if (!i->cached) -+ bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); -+ } -+ -+ return 0; -+} -+ -+static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i; -+ struct btree_write_buffered_key *wb; -+ -+ trans_for_each_update(trans, i) -+ bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); -+ -+ trans_for_each_wb_update(trans, wb) -+ bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry *i, -+ struct printbuf *err) -+{ -+ struct bch_fs *c = trans->c; -+ int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? READ : WRITE; -+ -+ printbuf_reset(err); -+ prt_printf(err, "invalid bkey on insert from %s -> %ps", -+ trans->fn, (void *) i->ip_allocated); -+ prt_newline(err); -+ printbuf_indent_add(err, 2); -+ -+ bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); -+ prt_newline(err); -+ -+ bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, rw, err); -+ bch2_print_string_as_lines(KERN_ERR, err->buf); -+ -+ bch2_inconsistent_error(c); -+ bch2_dump_trans_updates(trans); -+ -+ return -EINVAL; -+} -+#endif -+ -+/* -+ * Get journal reservation, take write locks, and attempt to do btree update(s): -+ */ -+static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry **stopped_at, -+ unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ int ret = 0, u64s_delta = 0; -+ -+ trans_for_each_update(trans, i) { -+ if (i->cached) -+ continue; -+ -+ u64s_delta += !bkey_deleted(&i->k->k) ? 
i->k->k.u64s : 0; -+ u64s_delta -= i->old_btree_u64s; -+ -+ if (!same_leaf_as_next(trans, i)) { -+ if (u64s_delta <= 0) { -+ ret = bch2_foreground_maybe_merge(trans, i->path, -+ i->level, flags); -+ if (unlikely(ret)) -+ return ret; -+ } -+ -+ u64s_delta = 0; -+ } -+ } -+ -+ ret = bch2_journal_preres_get(&c->journal, -+ &trans->journal_preres, trans->journal_preres_u64s, -+ (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); -+ if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) -+ ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); -+ if (unlikely(ret)) -+ return ret; -+ -+ ret = trans_lock_write(trans); -+ if (unlikely(ret)) -+ return ret; -+ -+ ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); -+ -+ if (!ret && unlikely(trans->journal_replay_not_finished)) -+ bch2_drop_overwrites_from_journal(trans); -+ -+ trans_for_each_update(trans, i) -+ if (!same_leaf_as_prev(trans, i)) -+ bch2_btree_node_unlock_write_inlined(trans, i->path, -+ insert_l(i)->b); -+ -+ if (!ret && trans->journal_pin) -+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, -+ trans->journal_pin, NULL); -+ -+ /* -+ * Drop journal reservation after dropping write locks, since dropping -+ * the journal reservation may kick off a journal write: -+ */ -+ bch2_journal_res_put(&c->journal, &trans->journal_res); -+ -+ if (unlikely(ret)) -+ return ret; -+ -+ bch2_trans_downgrade(trans); -+ -+ return 0; -+} -+ -+static int journal_reclaim_wait_done(struct bch_fs *c) -+{ -+ int ret = bch2_journal_error(&c->journal) ?: -+ !bch2_btree_key_cache_must_wait(c); -+ -+ if (!ret) -+ journal_reclaim_kick(&c->journal); -+ return ret; -+} -+ -+static noinline -+int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, -+ struct btree_insert_entry *i, -+ int ret, unsigned long trace_ip) -+{ -+ struct bch_fs *c = trans->c; -+ -+ switch (ret) { -+ case -BCH_ERR_btree_insert_btree_node_full: -+ ret = bch2_btree_split_leaf(trans, i->path, flags); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); -+ break; -+ case -BCH_ERR_btree_insert_need_mark_replicas: -+ ret = drop_locks_do(trans, -+ bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); -+ break; -+ case -BCH_ERR_journal_res_get_blocked: -+ /* -+ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK -+ * flag -+ */ -+ if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && -+ (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { -+ ret = -BCH_ERR_journal_reclaim_would_deadlock; -+ break; -+ } -+ -+ ret = drop_locks_do(trans, -+ bch2_trans_journal_res_get(trans, -+ (flags & BCH_WATERMARK_MASK)| -+ JOURNAL_RES_GET_CHECK)); -+ break; -+ case -BCH_ERR_btree_insert_need_journal_reclaim: -+ bch2_trans_unlock(trans); -+ -+ trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); -+ -+ wait_event_freezable(c->journal.reclaim_wait, -+ (ret = journal_reclaim_wait_done(c))); -+ if (ret < 0) -+ break; -+ -+ ret = bch2_trans_relock(trans); -+ break; -+ case -BCH_ERR_btree_insert_need_flush_buffer: { -+ struct btree_write_buffer *wb = &c->btree_write_buffer; -+ -+ ret = 0; -+ -+ if (wb->state.nr > wb->size * 3 / 4) { -+ bch2_trans_unlock(trans); -+ mutex_lock(&wb->flush_lock); -+ -+ if (wb->state.nr > wb->size * 3 / 4) { -+ bch2_trans_begin(trans); -+ ret = __bch2_btree_write_buffer_flush(trans, -+ flags|BTREE_INSERT_NOCHECK_RW, true); -+ if (!ret) { -+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); -+ ret = 
btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); -+ } -+ } else { -+ mutex_unlock(&wb->flush_lock); -+ ret = bch2_trans_relock(trans); -+ } -+ } -+ break; -+ } -+ default: -+ BUG_ON(ret >= 0); -+ break; -+ } -+ -+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); -+ -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && -+ !(flags & BTREE_INSERT_NOWAIT) && -+ (flags & BTREE_INSERT_NOFAIL), c, -+ "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); -+ -+ return ret; -+} -+ -+static noinline int -+bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || -+ test_bit(BCH_FS_STARTED, &c->flags)) -+ return -BCH_ERR_erofs_trans_commit; -+ -+ ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); -+ if (ret) -+ return ret; -+ -+ bch2_write_ref_get(c, BCH_WRITE_REF_trans); -+ return 0; -+} -+ -+/* -+ * This is for updates done in the early part of fsck - btree_gc - before we've -+ * gone RW. we only add the new key to the list of keys for journal replay to -+ * do. -+ */ -+static noinline int -+do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ int ret = 0; -+ -+ trans_for_each_update(trans, i) { -+ ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); -+ if (ret) -+ break; -+ } -+ -+ return ret; -+} -+ -+int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i = NULL; -+ struct btree_write_buffered_key *wb; -+ unsigned u64s; -+ int ret = 0; -+ -+ if (!trans->nr_updates && -+ !trans->nr_wb_updates && -+ !trans->extra_journal_entries.nr) -+ goto out_reset; -+ -+ if (flags & BTREE_INSERT_GC_LOCK_HELD) -+ lockdep_assert_held(&c->gc_lock); -+ -+ ret = bch2_trans_commit_run_triggers(trans); -+ if (ret) -+ goto out_reset; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) { -+ struct printbuf buf = PRINTBUF; -+ enum bkey_invalid_flags invalid_flags = 0; -+ -+ if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) -+ invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; -+ -+ if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, invalid_flags, &buf))) -+ ret = bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); -+ btree_insert_entry_checks(trans, i); -+ printbuf_exit(&buf); -+ -+ if (ret) -+ return ret; -+ } -+#endif -+ -+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { -+ ret = do_bch2_trans_commit_to_journal_replay(trans); -+ goto out_reset; -+ } -+ -+ if (!(flags & BTREE_INSERT_NOCHECK_RW) && -+ unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { -+ ret = bch2_trans_commit_get_rw_cold(trans, flags); -+ if (ret) -+ goto out_reset; -+ } -+ -+ if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && -+ mutex_trylock(&c->btree_write_buffer.flush_lock)) { -+ bch2_trans_begin(trans); -+ bch2_trans_unlock(trans); -+ -+ ret = __bch2_btree_write_buffer_flush(trans, -+ flags|BTREE_INSERT_NOCHECK_RW, true); -+ if (!ret) { -+ trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); -+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); -+ } -+ goto out; -+ } -+ -+ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); -+ -+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); -+ -+ trans->journal_u64s = trans->extra_journal_entries.nr; -+ 
trans->journal_preres_u64s = 0;
-+
-+	trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
-+
-+	if (trans->journal_transaction_names)
-+		trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
-+
-+	trans_for_each_update(trans, i) {
-+		EBUG_ON(!i->path->should_be_locked);
-+
-+		ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
-+		if (unlikely(ret))
-+			goto out;
-+
-+		EBUG_ON(!btree_node_intent_locked(i->path, i->level));
-+
-+		if (i->key_cache_already_flushed)
-+			continue;
-+
-+		/* we're going to journal the key being updated: */
-+		u64s = jset_u64s(i->k->k.u64s);
-+		if (i->cached &&
-+		    likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY)))
-+			trans->journal_preres_u64s += u64s;
-+
-+		if (i->flags & BTREE_UPDATE_NOJOURNAL)
-+			continue;
-+
-+		trans->journal_u64s += u64s;
-+
-+		/* and we're also going to log the overwrite: */
-+		if (trans->journal_transaction_names)
-+			trans->journal_u64s += jset_u64s(i->old_k.u64s);
-+	}
-+
-+	trans_for_each_wb_update(trans, wb)
-+		trans->journal_u64s += jset_u64s(wb->k.k.u64s);
-+
-+	if (trans->extra_journal_res) {
-+		ret = bch2_disk_reservation_add(c, trans->disk_res,
-+				trans->extra_journal_res,
-+				(flags & BTREE_INSERT_NOFAIL)
-+				? BCH_DISK_RESERVATION_NOFAIL : 0);
-+		if (ret)
-+			goto err;
-+	}
-+retry:
-+	bch2_trans_verify_not_in_restart(trans);
-+	memset(&trans->journal_res, 0, sizeof(trans->journal_res));
-+
-+	ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
-+
-+	/* make sure we didn't drop or screw up locks: */
-+	bch2_trans_verify_locks(trans);
-+
-+	if (ret)
-+		goto err;
-+
-+	trace_and_count(c, transaction_commit, trans, _RET_IP_);
-+out:
-+	bch2_journal_preres_put(&c->journal, &trans->journal_preres);
-+
-+	if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
-+		bch2_write_ref_put(c, BCH_WRITE_REF_trans);
-+out_reset:
-+	bch2_trans_reset_updates(trans);
-+
-+	return ret;
-+err:
-+	ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
-+	if (ret)
-+		goto out;
-+
-+	goto retry;
-+}
-diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
-new file mode 100644
-index 000000000..71ad3893e
---- /dev/null
-+++ b/fs/bcachefs/btree_types.h
-@@ -0,0 +1,746 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+#ifndef _BCACHEFS_BTREE_TYPES_H
-+#define _BCACHEFS_BTREE_TYPES_H
-+
-+#include <linux/list.h>
-+#include <linux/rhashtable.h>
-+
-+//#include "bkey_methods.h"
-+#include "buckets_types.h"
-+#include "darray.h"
-+#include "errcode.h"
-+#include "journal_types.h"
-+#include "replicas_types.h"
-+#include "six.h"
-+
-+struct open_bucket;
-+struct btree_update;
-+struct btree_trans;
-+
-+#define MAX_BSETS 3U
-+
-+struct btree_nr_keys {
-+
-+	/*
-+	 * Amount of live metadata (i.e.
size of node after a compaction) in -+ * units of u64s -+ */ -+ u16 live_u64s; -+ u16 bset_u64s[MAX_BSETS]; -+ -+ /* live keys only: */ -+ u16 packed_keys; -+ u16 unpacked_keys; -+}; -+ -+struct bset_tree { -+ /* -+ * We construct a binary tree in an array as if the array -+ * started at 1, so that things line up on the same cachelines -+ * better: see comments in bset.c at cacheline_to_bkey() for -+ * details -+ */ -+ -+ /* size of the binary tree and prev array */ -+ u16 size; -+ -+ /* function of size - precalculated for to_inorder() */ -+ u16 extra; -+ -+ u16 data_offset; -+ u16 aux_data_offset; -+ u16 end_offset; -+}; -+ -+struct btree_write { -+ struct journal_entry_pin journal; -+}; -+ -+struct btree_alloc { -+ struct open_buckets ob; -+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); -+}; -+ -+struct btree_bkey_cached_common { -+ struct six_lock lock; -+ u8 level; -+ u8 btree_id; -+ bool cached; -+}; -+ -+struct btree { -+ struct btree_bkey_cached_common c; -+ -+ struct rhash_head hash; -+ u64 hash_val; -+ -+ unsigned long flags; -+ u16 written; -+ u8 nsets; -+ u8 nr_key_bits; -+ u16 version_ondisk; -+ -+ struct bkey_format format; -+ -+ struct btree_node *data; -+ void *aux_data; -+ -+ /* -+ * Sets of sorted keys - the real btree node - plus a binary search tree -+ * -+ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point -+ * to the memory we have allocated for this btree node. Additionally, -+ * set[0]->data points to the entire btree node as it exists on disk. -+ */ -+ struct bset_tree set[MAX_BSETS]; -+ -+ struct btree_nr_keys nr; -+ u16 sib_u64s[2]; -+ u16 whiteout_u64s; -+ u8 byte_order; -+ u8 unpack_fn_len; -+ -+ struct btree_write writes[2]; -+ -+ /* Key/pointer for this btree node */ -+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -+ -+ /* -+ * XXX: add a delete sequence number, so when bch2_btree_node_relock() -+ * fails because the lock sequence number has changed - i.e. the -+ * contents were modified - we can still relock the node if it's still -+ * the one we want, without redoing the traversal -+ */ -+ -+ /* -+ * For asynchronous splits/interior node updates: -+ * When we do a split, we allocate new child nodes and update the parent -+ * node to point to them: we update the parent in memory immediately, -+ * but then we must wait until the children have been written out before -+ * the update to the parent can be written - this is a list of the -+ * btree_updates that are blocking this node from being -+ * written: -+ */ -+ struct list_head write_blocked; -+ -+ /* -+ * Also for asynchronous splits/interior node updates: -+ * If a btree node isn't reachable yet, we don't want to kick off -+ * another write - because that write also won't yet be reachable and -+ * marking it as completed before it's reachable would be incorrect: -+ */ -+ unsigned long will_make_reachable; -+ -+ struct open_buckets ob; -+ -+ /* lru list */ -+ struct list_head list; -+}; -+ -+struct btree_cache { -+ struct rhashtable table; -+ bool table_init_done; -+ /* -+ * We never free a struct btree, except on shutdown - we just put it on -+ * the btree_cache_freed list and reuse it later. This simplifies the -+ * code, and it doesn't cost us much memory as the memory usage is -+ * dominated by buffers that hold the actual btree node data and those -+ * can be freed - and the number of struct btrees allocated is -+ * effectively bounded. 
-+ * -+ * btree_cache_freeable effectively is a small cache - we use it because -+ * high order page allocations can be rather expensive, and it's quite -+ * common to delete and allocate btree nodes in quick succession. It -+ * should never grow past ~2-3 nodes in practice. -+ */ -+ struct mutex lock; -+ struct list_head live; -+ struct list_head freeable; -+ struct list_head freed_pcpu; -+ struct list_head freed_nonpcpu; -+ -+ /* Number of elements in live + freeable lists */ -+ unsigned used; -+ unsigned reserve; -+ unsigned freed; -+ unsigned not_freed_lock_intent; -+ unsigned not_freed_lock_write; -+ unsigned not_freed_dirty; -+ unsigned not_freed_read_in_flight; -+ unsigned not_freed_write_in_flight; -+ unsigned not_freed_noevict; -+ unsigned not_freed_write_blocked; -+ unsigned not_freed_will_make_reachable; -+ unsigned not_freed_access_bit; -+ atomic_t dirty; -+ struct shrinker shrink; -+ -+ /* -+ * If we need to allocate memory for a new btree node and that -+ * allocation fails, we can cannibalize another node in the btree cache -+ * to satisfy the allocation - lock to guarantee only one thread does -+ * this at a time: -+ */ -+ struct task_struct *alloc_lock; -+ struct closure_waitlist alloc_wait; -+}; -+ -+struct btree_node_iter { -+ struct btree_node_iter_set { -+ u16 k, end; -+ } data[MAX_BSETS]; -+}; -+ -+/* -+ * Iterate over all possible positions, synthesizing deleted keys for holes: -+ */ -+static const u16 BTREE_ITER_SLOTS = 1 << 0; -+static const u16 BTREE_ITER_ALL_LEVELS = 1 << 1; -+/* -+ * Indicates that intent locks should be taken on leaf nodes, because we expect -+ * to be doing updates: -+ */ -+static const u16 BTREE_ITER_INTENT = 1 << 2; -+/* -+ * Causes the btree iterator code to prefetch additional btree nodes from disk: -+ */ -+static const u16 BTREE_ITER_PREFETCH = 1 << 3; -+/* -+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for -+ * @pos or the first key strictly greater than @pos -+ */ -+static const u16 BTREE_ITER_IS_EXTENTS = 1 << 4; -+static const u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; -+static const u16 BTREE_ITER_CACHED = 1 << 6; -+static const u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; -+static const u16 BTREE_ITER_WITH_UPDATES = 1 << 8; -+static const u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; -+static const u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; -+static const u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; -+static const u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; -+static const u16 BTREE_ITER_NOPRESERVE = 1 << 13; -+static const u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; -+static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; -+#define __BTREE_ITER_FLAGS_END 16 -+ -+enum btree_path_uptodate { -+ BTREE_ITER_UPTODATE = 0, -+ BTREE_ITER_NEED_RELOCK = 1, -+ BTREE_ITER_NEED_TRAVERSE = 2, -+}; -+ -+#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG) -+#define TRACK_PATH_ALLOCATED -+#endif -+ -+struct btree_path { -+ u8 idx; -+ u8 sorted_idx; -+ u8 ref; -+ u8 intent_ref; -+ -+ /* btree_iter_copy starts here: */ -+ struct bpos pos; -+ -+ enum btree_id btree_id:5; -+ bool cached:1; -+ bool preserve:1; -+ enum btree_path_uptodate uptodate:2; -+ /* -+ * When true, failing to relock this path will cause the transaction to -+ * restart: -+ */ -+ bool should_be_locked:1; -+ unsigned level:3, -+ locks_want:3; -+ u8 nodes_locked; -+ -+ struct btree_path_level { -+ struct btree *b; -+ struct btree_node_iter iter; -+ u32 lock_seq; -+#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS -+ u64 lock_taken_time; -+#endif -+ } l[BTREE_MAX_DEPTH]; 
-+#ifdef TRACK_PATH_ALLOCATED -+ unsigned long ip_allocated; -+#endif -+}; -+ -+static inline struct btree_path_level *path_l(struct btree_path *path) -+{ -+ return path->l + path->level; -+} -+ -+static inline unsigned long btree_path_ip_allocated(struct btree_path *path) -+{ -+#ifdef TRACK_PATH_ALLOCATED -+ return path->ip_allocated; -+#else -+ return _THIS_IP_; -+#endif -+} -+ -+/* -+ * @pos - iterator's current position -+ * @level - current btree depth -+ * @locks_want - btree level below which we start taking intent locks -+ * @nodes_locked - bitmask indicating which nodes in @nodes are locked -+ * @nodes_intent_locked - bitmask indicating which locks are intent locks -+ */ -+struct btree_iter { -+ struct btree_trans *trans; -+ struct btree_path *path; -+ struct btree_path *update_path; -+ struct btree_path *key_cache_path; -+ -+ enum btree_id btree_id:8; -+ unsigned min_depth:3; -+ unsigned advanced:1; -+ -+ /* btree_iter_copy starts here: */ -+ u16 flags; -+ -+ /* When we're filtering by snapshot, the snapshot ID we're looking for: */ -+ unsigned snapshot; -+ -+ struct bpos pos; -+ /* -+ * Current unpacked key - so that bch2_btree_iter_next()/ -+ * bch2_btree_iter_next_slot() can correctly advance pos. -+ */ -+ struct bkey k; -+ -+ /* BTREE_ITER_WITH_JOURNAL: */ -+ size_t journal_idx; -+ struct bpos journal_pos; -+#ifdef TRACK_PATH_ALLOCATED -+ unsigned long ip_allocated; -+#endif -+}; -+ -+struct btree_key_cache_freelist { -+ struct bkey_cached *objs[16]; -+ unsigned nr; -+}; -+ -+struct btree_key_cache { -+ struct mutex lock; -+ struct rhashtable table; -+ bool table_init_done; -+ struct list_head freed_pcpu; -+ struct list_head freed_nonpcpu; -+ struct shrinker shrink; -+ unsigned shrink_iter; -+ struct btree_key_cache_freelist __percpu *pcpu_freed; -+ -+ atomic_long_t nr_freed; -+ atomic_long_t nr_keys; -+ atomic_long_t nr_dirty; -+}; -+ -+struct bkey_cached_key { -+ u32 btree_id; -+ struct bpos pos; -+} __packed __aligned(4); -+ -+#define BKEY_CACHED_ACCESSED 0 -+#define BKEY_CACHED_DIRTY 1 -+ -+struct bkey_cached { -+ struct btree_bkey_cached_common c; -+ -+ unsigned long flags; -+ u16 u64s; -+ bool valid; -+ u32 btree_trans_barrier_seq; -+ struct bkey_cached_key key; -+ -+ struct rhash_head hash; -+ struct list_head list; -+ -+ struct journal_preres res; -+ struct journal_entry_pin journal; -+ u64 seq; -+ -+ struct bkey_i *k; -+}; -+ -+static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) -+{ -+ return !b->cached -+ ? 
container_of(b, struct btree, c)->key.k.p -+ : container_of(b, struct bkey_cached, c)->key.pos; -+} -+ -+struct btree_insert_entry { -+ unsigned flags; -+ u8 bkey_type; -+ enum btree_id btree_id:8; -+ u8 level:4; -+ bool cached:1; -+ bool insert_trigger_run:1; -+ bool overwrite_trigger_run:1; -+ bool key_cache_already_flushed:1; -+ /* -+ * @old_k may be a key from the journal; @old_btree_u64s always refers -+ * to the size of the key being overwritten in the btree: -+ */ -+ u8 old_btree_u64s; -+ struct bkey_i *k; -+ struct btree_path *path; -+ u64 seq; -+ /* key being overwritten: */ -+ struct bkey old_k; -+ const struct bch_val *old_v; -+ unsigned long ip_allocated; -+}; -+ -+#define BTREE_ITER_MAX 64 -+ -+struct btree_trans_commit_hook; -+typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); -+ -+struct btree_trans_commit_hook { -+ btree_trans_commit_hook_fn *fn; -+ struct btree_trans_commit_hook *next; -+}; -+ -+#define BTREE_TRANS_MEM_MAX (1U << 16) -+ -+#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 -+ -+struct btree_trans { -+ struct bch_fs *c; -+ const char *fn; -+ struct closure ref; -+ struct list_head list; -+ u64 last_begin_time; -+ -+ u8 lock_may_not_fail; -+ u8 lock_must_abort; -+ struct btree_bkey_cached_common *locking; -+ struct six_lock_waiter locking_wait; -+ -+ int srcu_idx; -+ -+ u8 fn_idx; -+ u8 nr_sorted; -+ u8 nr_updates; -+ u8 nr_wb_updates; -+ u8 wb_updates_size; -+ bool used_mempool:1; -+ bool in_traverse_all:1; -+ bool paths_sorted:1; -+ bool memory_allocation_failure:1; -+ bool journal_transaction_names:1; -+ bool journal_replay_not_finished:1; -+ bool is_initial_gc:1; -+ bool notrace_relock_fail:1; -+ enum bch_errcode restarted:16; -+ u32 restart_count; -+ unsigned long last_begin_ip; -+ unsigned long last_restarted_ip; -+ unsigned long srcu_lock_time; -+ -+ /* -+ * For when bch2_trans_update notices we'll be splitting a compressed -+ * extent: -+ */ -+ unsigned extra_journal_res; -+ unsigned nr_max_paths; -+ -+ u64 paths_allocated; -+ -+ unsigned mem_top; -+ unsigned mem_max; -+ unsigned mem_bytes; -+ void *mem; -+ -+ u8 sorted[BTREE_ITER_MAX + 8]; -+ struct btree_path *paths; -+ struct btree_insert_entry *updates; -+ struct btree_write_buffered_key *wb_updates; -+ -+ /* update path: */ -+ struct btree_trans_commit_hook *hooks; -+ darray_u64 extra_journal_entries; -+ struct journal_entry_pin *journal_pin; -+ -+ struct journal_res journal_res; -+ struct journal_preres journal_preres; -+ u64 *journal_seq; -+ struct disk_reservation *disk_res; -+ unsigned journal_u64s; -+ unsigned journal_preres_u64s; -+ struct replicas_delta_list *fs_usage_deltas; -+}; -+ -+#define BCH_BTREE_WRITE_TYPES() \ -+ x(initial, 0) \ -+ x(init_next_bset, 1) \ -+ x(cache_reclaim, 2) \ -+ x(journal_reclaim, 3) \ -+ x(interior, 4) -+ -+enum btree_write_type { -+#define x(t, n) BTREE_WRITE_##t, -+ BCH_BTREE_WRITE_TYPES() -+#undef x -+ BTREE_WRITE_TYPE_NR, -+}; -+ -+#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) -+#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR)) -+ -+#define BTREE_FLAGS() \ -+ x(read_in_flight) \ -+ x(read_error) \ -+ x(dirty) \ -+ x(need_write) \ -+ x(write_blocked) \ -+ x(will_make_reachable) \ -+ x(noevict) \ -+ x(write_idx) \ -+ x(accessed) \ -+ x(write_in_flight) \ -+ x(write_in_flight_inner) \ -+ x(just_written) \ -+ x(dying) \ -+ x(fake) \ -+ x(need_rewrite) \ -+ x(never_write) -+ -+enum btree_flags { -+ /* First bits for btree node write type */ -+ 
BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1, -+#define x(flag) BTREE_NODE_##flag, -+ BTREE_FLAGS() -+#undef x -+}; -+ -+#define x(flag) \ -+static inline bool btree_node_ ## flag(struct btree *b) \ -+{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ -+ \ -+static inline void set_btree_node_ ## flag(struct btree *b) \ -+{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ -+ \ -+static inline void clear_btree_node_ ## flag(struct btree *b) \ -+{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } -+ -+BTREE_FLAGS() -+#undef x -+ -+static inline struct btree_write *btree_current_write(struct btree *b) -+{ -+ return b->writes + btree_node_write_idx(b); -+} -+ -+static inline struct btree_write *btree_prev_write(struct btree *b) -+{ -+ return b->writes + (btree_node_write_idx(b) ^ 1); -+} -+ -+static inline struct bset_tree *bset_tree_last(struct btree *b) -+{ -+ EBUG_ON(!b->nsets); -+ return b->set + b->nsets - 1; -+} -+ -+static inline void * -+__btree_node_offset_to_ptr(const struct btree *b, u16 offset) -+{ -+ return (void *) ((u64 *) b->data + 1 + offset); -+} -+ -+static inline u16 -+__btree_node_ptr_to_offset(const struct btree *b, const void *p) -+{ -+ u16 ret = (u64 *) p - 1 - (u64 *) b->data; -+ -+ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); -+ return ret; -+} -+ -+static inline struct bset *bset(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return __btree_node_offset_to_ptr(b, t->data_offset); -+} -+ -+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) -+{ -+ t->end_offset = -+ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); -+} -+ -+static inline void set_btree_bset(struct btree *b, struct bset_tree *t, -+ const struct bset *i) -+{ -+ t->data_offset = __btree_node_ptr_to_offset(b, i); -+ set_btree_bset_end(b, t); -+} -+ -+static inline struct bset *btree_bset_first(struct btree *b) -+{ -+ return bset(b, b->set); -+} -+ -+static inline struct bset *btree_bset_last(struct btree *b) -+{ -+ return bset(b, bset_tree_last(b)); -+} -+ -+static inline u16 -+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) -+{ -+ return __btree_node_ptr_to_offset(b, k); -+} -+ -+static inline struct bkey_packed * -+__btree_node_offset_to_key(const struct btree *b, u16 k) -+{ -+ return __btree_node_offset_to_ptr(b, k); -+} -+ -+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) -+{ -+ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); -+} -+ -+#define btree_bkey_first(_b, _t) \ -+({ \ -+ EBUG_ON(bset(_b, _t)->start != \ -+ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ -+ \ -+ bset(_b, _t)->start; \ -+}) -+ -+#define btree_bkey_last(_b, _t) \ -+({ \ -+ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ -+ vstruct_last(bset(_b, _t))); \ -+ \ -+ __btree_node_offset_to_key(_b, (_t)->end_offset); \ -+}) -+ -+static inline unsigned bset_u64s(struct bset_tree *t) -+{ -+ return t->end_offset - t->data_offset - -+ sizeof(struct bset) / sizeof(u64); -+} -+ -+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) -+{ -+ return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; -+} -+ -+static inline unsigned bset_byte_offset(struct btree *b, void *i) -+{ -+ return i - (void *) b->data; -+} -+ -+enum btree_node_type { -+#define x(kwd, val, ...) 
BKEY_TYPE_##kwd = val, -+ BCH_BTREE_IDS() -+#undef x -+ BKEY_TYPE_btree, -+}; -+ -+/* Type of a key in btree @id at level @level: */ -+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) -+{ -+ return level ? BKEY_TYPE_btree : (enum btree_node_type) id; -+} -+ -+/* Type of keys @b contains: */ -+static inline enum btree_node_type btree_node_type(struct btree *b) -+{ -+ return __btree_node_type(b->c.level, b->c.btree_id); -+} -+ -+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ -+ (BIT(BKEY_TYPE_extents)| \ -+ BIT(BKEY_TYPE_alloc)| \ -+ BIT(BKEY_TYPE_inodes)| \ -+ BIT(BKEY_TYPE_stripes)| \ -+ BIT(BKEY_TYPE_reflink)| \ -+ BIT(BKEY_TYPE_btree)) -+ -+#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ -+ (BIT(BKEY_TYPE_alloc)| \ -+ BIT(BKEY_TYPE_inodes)| \ -+ BIT(BKEY_TYPE_stripes)| \ -+ BIT(BKEY_TYPE_snapshots)) -+ -+#define BTREE_NODE_TYPE_HAS_TRIGGERS \ -+ (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ -+ BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) -+ -+static inline bool btree_node_type_needs_gc(enum btree_node_type type) -+{ -+ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); -+} -+ -+static inline bool btree_node_type_is_extents(enum btree_node_type type) -+{ -+ const unsigned mask = 0 -+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_EXTENTS)) << nr) -+ BCH_BTREE_IDS() -+#undef x -+ ; -+ -+ return (1U << type) & mask; -+} -+ -+static inline bool btree_id_is_extents(enum btree_id btree) -+{ -+ return btree_node_type_is_extents((enum btree_node_type) btree); -+} -+ -+static inline bool btree_type_has_snapshots(enum btree_id id) -+{ -+ const unsigned mask = 0 -+#define x(name, nr, flags, ...) |((!!((flags) & BTREE_ID_SNAPSHOTS)) << nr) -+ BCH_BTREE_IDS() -+#undef x -+ ; -+ -+ return (1U << id) & mask; -+} -+ -+static inline bool btree_type_has_ptrs(enum btree_id id) -+{ -+ const unsigned mask = 0 -+#define x(name, nr, flags, ...) 
|((!!((flags) & BTREE_ID_DATA)) << nr) -+ BCH_BTREE_IDS() -+#undef x -+ ; -+ -+ return (1U << id) & mask; -+} -+ -+struct btree_root { -+ struct btree *b; -+ -+ /* On disk root - see async splits: */ -+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -+ u8 level; -+ u8 alive; -+ s8 error; -+}; -+ -+enum btree_gc_coalesce_fail_reason { -+ BTREE_GC_COALESCE_FAIL_RESERVE_GET, -+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, -+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS, -+}; -+ -+enum btree_node_sibling { -+ btree_prev_sib, -+ btree_next_sib, -+}; -+ -+#endif /* _BCACHEFS_BTREE_TYPES_H */ -diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c -new file mode 100644 -index 000000000..880ce7431 ---- /dev/null -+++ b/fs/bcachefs/btree_update.c -@@ -0,0 +1,898 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "btree_iter.h" -+#include "btree_journal_iter.h" -+#include "btree_locking.h" -+#include "buckets.h" -+#include "debug.h" -+#include "errcode.h" -+#include "error.h" -+#include "extents.h" -+#include "keylist.h" -+#include "snapshot.h" -+#include "trace.h" -+ -+static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, -+ const struct btree_insert_entry *r) -+{ -+ return cmp_int(l->btree_id, r->btree_id) ?: -+ cmp_int(l->cached, r->cached) ?: -+ -cmp_int(l->level, r->level) ?: -+ bpos_cmp(l->k->k.p, r->k->k.p); -+} -+ -+static int __must_check -+bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, -+ struct bkey_i *, enum btree_update_flags, -+ unsigned long ip); -+ -+static noinline int extent_front_merge(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct bkey_i **insert, -+ enum btree_update_flags flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i *update; -+ int ret; -+ -+ update = bch2_bkey_make_mut_noupdate(trans, k); -+ ret = PTR_ERR_OR_ZERO(update); -+ if (ret) -+ return ret; -+ -+ if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) -+ return 0; -+ -+ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p) ?: -+ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, (*insert)->k.p); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ return 0; -+ -+ ret = bch2_btree_delete_at(trans, iter, flags); -+ if (ret) -+ return ret; -+ -+ *insert = update; -+ return 0; -+} -+ -+static noinline int extent_back_merge(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ ret = bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?: -+ bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ return 0; -+ -+ bch2_bkey_merge(c, bkey_i_to_s(insert), k); -+ return 0; -+} -+ -+/* -+ * When deleting, check if we need to emit a whiteout (because we're overwriting -+ * something in an ancestor snapshot) -+ */ -+static int need_whiteout_for_snapshot(struct btree_trans *trans, -+ enum btree_id btree_id, struct bpos pos) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u32 snapshot = pos.snapshot; -+ int ret; -+ -+ if (!bch2_snapshot_parent(trans->c, pos.snapshot)) -+ return 0; -+ -+ pos.snapshot++; -+ -+ for_each_btree_key_norestart(trans, iter, btree_id, pos, -+ BTREE_ITER_ALL_SNAPSHOTS| -+ BTREE_ITER_NOPRESERVE, k, ret) { -+ if (!bkey_eq(k.k->p, pos)) -+ break; -+ -+ if (bch2_snapshot_is_ancestor(trans->c, snapshot, -+ k.k->p.snapshot)) { -+ ret = !bkey_whiteout(k.k); 
-+ break; -+ } -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, -+ enum btree_id id, -+ struct bpos old_pos, -+ struct bpos new_pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter old_iter, new_iter = { NULL }; -+ struct bkey_s_c old_k, new_k; -+ snapshot_id_list s; -+ struct bkey_i *update; -+ int ret; -+ -+ if (!bch2_snapshot_has_children(c, old_pos.snapshot)) -+ return 0; -+ -+ darray_init(&s); -+ -+ bch2_trans_iter_init(trans, &old_iter, id, old_pos, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ while ((old_k = bch2_btree_iter_prev(&old_iter)).k && -+ !(ret = bkey_err(old_k)) && -+ bkey_eq(old_pos, old_k.k->p)) { -+ struct bpos whiteout_pos = -+ SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; -+ -+ if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || -+ snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) -+ continue; -+ -+ new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_INTENT); -+ ret = bkey_err(new_k); -+ if (ret) -+ break; -+ -+ if (new_k.k->type == KEY_TYPE_deleted) { -+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); -+ ret = PTR_ERR_OR_ZERO(update); -+ if (ret) -+ break; -+ -+ bkey_init(&update->k); -+ update->k.p = whiteout_pos; -+ update->k.type = KEY_TYPE_whiteout; -+ -+ ret = bch2_trans_update(trans, &new_iter, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ } -+ bch2_trans_iter_exit(trans, &new_iter); -+ -+ ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(trans, &new_iter); -+ bch2_trans_iter_exit(trans, &old_iter); -+ darray_exit(&s); -+ -+ return ret; -+} -+ -+int bch2_trans_update_extent_overwrite(struct btree_trans *trans, -+ struct btree_iter *iter, -+ enum btree_update_flags flags, -+ struct bkey_s_c old, -+ struct bkey_s_c new) -+{ -+ enum btree_id btree_id = iter->btree_id; -+ struct bkey_i *update; -+ struct bpos new_start = bkey_start_pos(new.k); -+ bool front_split = bkey_lt(bkey_start_pos(old.k), new_start); -+ bool back_split = bkey_gt(old.k->p, new.k->p); -+ int ret = 0, compressed_sectors; -+ -+ /* -+ * If we're going to be splitting a compressed extent, note it -+ * so that __bch2_trans_commit() can increase our disk -+ * reservation: -+ */ -+ if (((front_split && back_split) || -+ ((front_split || back_split) && old.k->p.snapshot != new.k->p.snapshot)) && -+ (compressed_sectors = bch2_bkey_sectors_compressed(old))) -+ trans->extra_journal_res += compressed_sectors; -+ -+ if (front_split) { -+ update = bch2_bkey_make_mut_noupdate(trans, old); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ return ret; -+ -+ bch2_cut_back(new_start, update); -+ -+ ret = bch2_insert_snapshot_whiteouts(trans, btree_id, -+ old.k->p, update->k.p) ?: -+ bch2_btree_insert_nonextent(trans, btree_id, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); -+ if (ret) -+ return ret; -+ } -+ -+ /* If we're overwriting in a different snapshot - middle split: */ -+ if (old.k->p.snapshot != new.k->p.snapshot && -+ (front_split || back_split)) { -+ update = bch2_bkey_make_mut_noupdate(trans, old); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ return ret; -+ -+ bch2_cut_front(new_start, update); -+ bch2_cut_back(new.k->p, update); -+ -+ ret = bch2_insert_snapshot_whiteouts(trans, btree_id, -+ old.k->p, update->k.p) ?: -+ bch2_btree_insert_nonextent(trans, btree_id, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); -+ if (ret) 
-+ return ret; -+ } -+ -+ if (bkey_le(old.k->p, new.k->p)) { -+ update = bch2_trans_kmalloc(trans, sizeof(*update)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ return ret; -+ -+ bkey_init(&update->k); -+ update->k.p = old.k->p; -+ update->k.p.snapshot = new.k->p.snapshot; -+ -+ if (new.k->p.snapshot != old.k->p.snapshot) { -+ update->k.type = KEY_TYPE_whiteout; -+ } else if (btree_type_has_snapshots(btree_id)) { -+ ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); -+ if (ret < 0) -+ return ret; -+ if (ret) -+ update->k.type = KEY_TYPE_whiteout; -+ } -+ -+ ret = bch2_btree_insert_nonextent(trans, btree_id, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); -+ if (ret) -+ return ret; -+ } -+ -+ if (back_split) { -+ update = bch2_bkey_make_mut_noupdate(trans, old); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ return ret; -+ -+ bch2_cut_front(new.k->p, update); -+ -+ ret = bch2_trans_update_by_path(trans, iter->path, update, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| -+ flags, _RET_IP_); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int bch2_trans_update_extent(struct btree_trans *trans, -+ struct btree_iter *orig_iter, -+ struct bkey_i *insert, -+ enum btree_update_flags flags) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ enum btree_id btree_id = orig_iter->btree_id; -+ int ret = 0; -+ -+ bch2_trans_iter_init(trans, &iter, btree_id, bkey_start_pos(&insert->k), -+ BTREE_ITER_INTENT| -+ BTREE_ITER_WITH_UPDATES| -+ BTREE_ITER_NOT_EXTENTS); -+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); -+ if ((ret = bkey_err(k))) -+ goto err; -+ if (!k.k) -+ goto out; -+ -+ if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { -+ if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { -+ ret = extent_front_merge(trans, &iter, k, &insert, flags); -+ if (ret) -+ goto err; -+ } -+ -+ goto next; -+ } -+ -+ while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { -+ bool done = bkey_lt(insert->k.p, k.k->p); -+ -+ ret = bch2_trans_update_extent_overwrite(trans, &iter, flags, k, bkey_i_to_s_c(insert)); -+ if (ret) -+ goto err; -+ -+ if (done) -+ goto out; -+next: -+ bch2_btree_iter_advance(&iter); -+ k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); -+ if ((ret = bkey_err(k))) -+ goto err; -+ if (!k.k) -+ goto out; -+ } -+ -+ if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { -+ ret = extent_back_merge(trans, &iter, insert, k); -+ if (ret) -+ goto err; -+ } -+out: -+ if (!bkey_deleted(&insert->k)) -+ ret = bch2_btree_insert_nonextent(trans, btree_id, insert, flags); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+static noinline int flush_new_cached_update(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree_insert_entry *i, -+ enum btree_update_flags flags, -+ unsigned long ip) -+{ -+ struct btree_path *btree_path; -+ struct bkey k; -+ int ret; -+ -+ btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, -+ BTREE_ITER_INTENT, _THIS_IP_); -+ ret = bch2_btree_path_traverse(trans, btree_path, 0); -+ if (ret) -+ goto out; -+ -+ /* -+ * The old key in the insert entry might actually refer to an existing -+ * key in the btree that has been deleted from cache and not yet -+ * flushed. Check for this and skip the flush so we don't run triggers -+ * against a stale key. 
-+ */ -+ bch2_btree_path_peek_slot_exact(btree_path, &k); -+ if (!bkey_deleted(&k)) -+ goto out; -+ -+ i->key_cache_already_flushed = true; -+ i->flags |= BTREE_TRIGGER_NORUN; -+ -+ btree_path_set_should_be_locked(btree_path); -+ ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip); -+out: -+ bch2_path_put(trans, btree_path, true); -+ return ret; -+} -+ -+static int __must_check -+bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, -+ struct bkey_i *k, enum btree_update_flags flags, -+ unsigned long ip) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i, n; -+ u64 seq = 0; -+ int cmp; -+ -+ EBUG_ON(!path->should_be_locked); -+ EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); -+ EBUG_ON(!bpos_eq(k->k.p, path->pos)); -+ -+ /* -+ * The transaction journal res hasn't been allocated at this point. -+ * That occurs at commit time. Reuse the seq field to pass in the seq -+ * of a prejournaled key. -+ */ -+ if (flags & BTREE_UPDATE_PREJOURNAL) -+ seq = trans->journal_res.seq; -+ -+ n = (struct btree_insert_entry) { -+ .flags = flags, -+ .bkey_type = __btree_node_type(path->level, path->btree_id), -+ .btree_id = path->btree_id, -+ .level = path->level, -+ .cached = path->cached, -+ .path = path, -+ .k = k, -+ .seq = seq, -+ .ip_allocated = ip, -+ }; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) -+ BUG_ON(i != trans->updates && -+ btree_insert_entry_cmp(i - 1, i) >= 0); -+#endif -+ -+ /* -+ * Pending updates are kept sorted: first, find position of new update, -+ * then delete/trim any updates the new update overwrites: -+ */ -+ trans_for_each_update(trans, i) { -+ cmp = btree_insert_entry_cmp(&n, i); -+ if (cmp <= 0) -+ break; -+ } -+ -+ if (!cmp && i < trans->updates + trans->nr_updates) { -+ EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); -+ -+ bch2_path_put(trans, i->path, true); -+ i->flags = n.flags; -+ i->cached = n.cached; -+ i->k = n.k; -+ i->path = n.path; -+ i->seq = n.seq; -+ i->ip_allocated = n.ip_allocated; -+ } else { -+ array_insert_item(trans->updates, trans->nr_updates, -+ i - trans->updates, n); -+ -+ i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; -+ i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; -+ -+ if (unlikely(trans->journal_replay_not_finished)) { -+ struct bkey_i *j_k = -+ bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); -+ -+ if (j_k) { -+ i->old_k = j_k->k; -+ i->old_v = &j_k->v; -+ } -+ } -+ } -+ -+ __btree_path_get(i->path, true); -+ -+ /* -+ * If a key is present in the key cache, it must also exist in the -+ * btree - this is necessary for cache coherency. 
When iterating over -+ * a btree that's cached in the key cache, the btree iter code checks -+ * the key cache - but the key has to exist in the btree for that to -+ * work: -+ */ -+ if (path->cached && bkey_deleted(&i->old_k)) -+ return flush_new_cached_update(trans, path, i, flags, ip); -+ -+ return 0; -+} -+ -+int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_i *k, enum btree_update_flags flags) -+{ -+ struct btree_path *path = iter->update_path ?: iter->path; -+ struct bkey_cached *ck; -+ int ret; -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ return bch2_trans_update_extent(trans, iter, k, flags); -+ -+ if (bkey_deleted(&k->k) && -+ !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && -+ (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { -+ ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); -+ if (unlikely(ret < 0)) -+ return ret; -+ -+ if (ret) -+ k->k.type = KEY_TYPE_whiteout; -+ } -+ -+ /* -+ * Ensure that updates to cached btrees go to the key cache: -+ */ -+ if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && -+ !path->cached && -+ !path->level && -+ btree_id_cached(trans->c, path->btree_id)) { -+ if (!iter->key_cache_path || -+ !iter->key_cache_path->should_be_locked || -+ !bpos_eq(iter->key_cache_path->pos, k->k.p)) { -+ if (!iter->key_cache_path) -+ iter->key_cache_path = -+ bch2_path_get(trans, path->btree_id, path->pos, 1, 0, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_CACHED, _THIS_IP_); -+ -+ iter->key_cache_path = -+ bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, -+ iter->flags & BTREE_ITER_INTENT, -+ _THIS_IP_); -+ -+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, -+ BTREE_ITER_CACHED); -+ if (unlikely(ret)) -+ return ret; -+ -+ ck = (void *) iter->key_cache_path->l[0].b; -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); -+ } -+ -+ btree_path_set_should_be_locked(iter->key_cache_path); -+ } -+ -+ path = iter->key_cache_path; -+ } -+ -+ return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_); -+} -+ -+/* -+ * Add a transaction update for a key that has already been journaled. 
-+ */ -+int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq, -+ struct btree_iter *iter, struct bkey_i *k, -+ enum btree_update_flags flags) -+{ -+ trans->journal_res.seq = seq; -+ return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL| -+ BTREE_UPDATE_PREJOURNAL); -+} -+ -+int __must_check bch2_trans_update_buffered(struct btree_trans *trans, -+ enum btree_id btree, -+ struct bkey_i *k) -+{ -+ struct btree_write_buffered_key *i; -+ int ret; -+ -+ EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); -+ EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); -+ -+ trans_for_each_wb_update(trans, i) { -+ if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { -+ bkey_copy(&i->k, k); -+ return 0; -+ } -+ } -+ -+ if (!trans->wb_updates || -+ trans->nr_wb_updates == trans->wb_updates_size) { -+ struct btree_write_buffered_key *u; -+ -+ if (trans->nr_wb_updates == trans->wb_updates_size) { -+ struct btree_transaction_stats *s = btree_trans_stats(trans); -+ -+ BUG_ON(trans->wb_updates_size > U8_MAX / 2); -+ trans->wb_updates_size = max(1, trans->wb_updates_size * 2); -+ if (s) -+ s->wb_updates_size = trans->wb_updates_size; -+ } -+ -+ u = bch2_trans_kmalloc_nomemzero(trans, -+ trans->wb_updates_size * -+ sizeof(struct btree_write_buffered_key)); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ return ret; -+ -+ if (trans->nr_wb_updates) -+ memcpy(u, trans->wb_updates, trans->nr_wb_updates * -+ sizeof(struct btree_write_buffered_key)); -+ trans->wb_updates = u; -+ } -+ -+ trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { -+ .btree = btree, -+ }; -+ -+ bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); -+ trans->nr_wb_updates++; -+ -+ return 0; -+} -+ -+int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, -+ enum btree_id btree, struct bpos end) -+{ -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); -+ k = bch2_btree_iter_prev(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ bch2_btree_iter_advance(iter); -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ BUG_ON(k.k->type != KEY_TYPE_deleted); -+ -+ if (bkey_gt(k.k->p, end)) { -+ ret = -BCH_ERR_ENOSPC_btree_slot; -+ goto err; -+ } -+ -+ return 0; -+err: -+ bch2_trans_iter_exit(trans, iter); -+ return ret; -+} -+ -+void bch2_trans_commit_hook(struct btree_trans *trans, -+ struct btree_trans_commit_hook *h) -+{ -+ h->next = trans->hooks; -+ trans->hooks = h; -+} -+ -+int bch2_btree_insert_nonextent(struct btree_trans *trans, -+ enum btree_id btree, struct bkey_i *k, -+ enum btree_update_flags flags) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, btree, k->k.p, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(trans, &iter, k, flags); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, -+ struct bkey_i *k, enum btree_update_flags flags) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(trans, &iter, k, flags); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/** -+ * bch2_btree_insert - insert keys into the extent btree -+ * @c: pointer to struct bch_fs -+ * @id: btree to insert into -+ * 
@k: key to insert -+ * @disk_res: disk reservation -+ * @journal_seq: if non-NULL, the journal sequence number is returned here -+ * @flags: transaction commit flags -+ */ -+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, -+ struct bkey_i *k, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, int flags) -+{ -+ return bch2_trans_do(c, disk_res, journal_seq, flags, -+ __bch2_btree_insert(&trans, id, k, 0)); -+} -+ -+int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, -+ unsigned len, unsigned update_flags) -+{ -+ struct bkey_i *k; -+ -+ k = bch2_trans_kmalloc(trans, sizeof(*k)); -+ if (IS_ERR(k)) -+ return PTR_ERR(k); -+ -+ bkey_init(&k->k); -+ k->k.p = iter->pos; -+ bch2_key_resize(&k->k, len); -+ return bch2_trans_update(trans, iter, k, update_flags); -+} -+ -+int bch2_btree_delete_at(struct btree_trans *trans, -+ struct btree_iter *iter, unsigned update_flags) -+{ -+ return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); -+} -+ -+int bch2_btree_delete_at_buffered(struct btree_trans *trans, -+ enum btree_id btree, struct bpos pos) -+{ -+ struct bkey_i *k; -+ -+ k = bch2_trans_kmalloc(trans, sizeof(*k)); -+ if (IS_ERR(k)) -+ return PTR_ERR(k); -+ -+ bkey_init(&k->k); -+ k->k.p = pos; -+ return bch2_trans_update_buffered(trans, btree, k); -+} -+ -+int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, -+ struct bpos start, struct bpos end, -+ unsigned update_flags, -+ u64 *journal_seq) -+{ -+ u32 restart_count = trans->restart_count; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); -+ while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(trans->c, 0); -+ struct bkey_i delete; -+ -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ bkey_init(&delete.k); -+ -+ /* -+ * This could probably be more efficient for extents: -+ */ -+ -+ /* -+ * For extents, iter.pos won't necessarily be the same as -+ * bkey_start_pos(k.k) (for non extents they always will be the -+ * same). It's important that we delete starting from iter.pos -+ * because the range we want to delete could start in the middle -+ * of k. -+ * -+ * (bch2_btree_iter_peek() does guarantee that iter.pos >= -+ * bkey_start_pos(k.k)). 
-+ */ -+ delete.k.p = iter.pos; -+ -+ if (iter.flags & BTREE_ITER_IS_EXTENTS) -+ bch2_key_resize(&delete.k, -+ bpos_min(end, k.k->p).offset - -+ iter.pos.offset); -+ -+ ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: -+ bch2_trans_commit(trans, &disk_res, journal_seq, -+ BTREE_INSERT_NOFAIL); -+ bch2_disk_reservation_put(trans->c, &disk_res); -+err: -+ /* -+ * the bch2_trans_begin() call is in a weird place because we -+ * need to call it after every transaction commit, to avoid path -+ * overflow, but don't want to call it if the delete operation -+ * is a no-op and we have no work to do: -+ */ -+ bch2_trans_begin(trans); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ ret = 0; -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (!ret && trans_was_restarted(trans, restart_count)) -+ ret = -BCH_ERR_transaction_restart_nested; -+ return ret; -+} -+ -+/* -+ * bch_btree_delete_range - delete everything within a given range -+ * -+ * Range is a half open interval - [start, end) -+ */ -+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, -+ struct bpos start, struct bpos end, -+ unsigned update_flags, -+ u64 *journal_seq) -+{ -+ int ret = bch2_trans_run(c, -+ bch2_btree_delete_range_trans(&trans, id, start, end, -+ update_flags, journal_seq)); -+ if (ret == -BCH_ERR_transaction_restart_nested) -+ ret = 0; -+ return ret; -+} -+ -+int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, -+ struct bpos pos, bool set) -+{ -+ struct bkey_i *k; -+ int ret = 0; -+ -+ k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); -+ ret = PTR_ERR_OR_ZERO(k); -+ if (unlikely(ret)) -+ return ret; -+ -+ bkey_init(&k->k); -+ k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; -+ k->k.p = pos; -+ -+ return bch2_trans_update_buffered(trans, btree, k); -+} -+ -+static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) -+{ -+ struct printbuf buf = PRINTBUF; -+ struct jset_entry_log *l; -+ unsigned u64s; -+ int ret; -+ -+ prt_vprintf(&buf, fmt, args); -+ ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; -+ if (ret) -+ goto err; -+ -+ u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); -+ -+ ret = darray_make_room(entries, jset_u64s(u64s)); -+ if (ret) -+ goto err; -+ -+ l = (void *) &darray_top(*entries); -+ l->entry.u64s = cpu_to_le16(u64s); -+ l->entry.btree_id = 0; -+ l->entry.level = 1; -+ l->entry.type = BCH_JSET_ENTRY_log; -+ l->entry.pad[0] = 0; -+ l->entry.pad[1] = 0; -+ l->entry.pad[2] = 0; -+ memcpy(l->d, buf.buf, buf.pos); -+ while (buf.pos & 7) -+ l->d[buf.pos++] = '\0'; -+ -+ entries->nr += jset_u64s(u64s); -+err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int -+__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, -+ va_list args) -+{ -+ int ret; -+ -+ if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { -+ ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); -+ } else { -+ ret = bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_LAZY_RW|commit_flags, -+ __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); -+ } -+ -+ return ret; -+} -+ -+int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) -+{ -+ va_list args; -+ int ret; -+ -+ va_start(args, fmt); -+ ret = __bch2_fs_log_msg(c, 0, fmt, args); -+ va_end(args); -+ return ret; -+} -+ -+/* -+ * Use for logging messages during recovery to enable reserved space and avoid -+ * blocking. -+ */ -+int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) 
-+{ -+ va_list args; -+ int ret; -+ -+ va_start(args, fmt); -+ ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); -+ va_end(args); -+ return ret; -+} -diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h -new file mode 100644 -index 000000000..901c42b57 ---- /dev/null -+++ b/fs/bcachefs/btree_update.h -@@ -0,0 +1,353 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_UPDATE_H -+#define _BCACHEFS_BTREE_UPDATE_H -+ -+#include "btree_iter.h" -+#include "journal.h" -+#include "journal.h" -+ -+struct bch_fs; -+struct btree; -+ -+void bch2_btree_node_prep_for_write(struct btree_trans *, -+ struct btree_path *, struct btree *); -+bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, -+ struct btree *, struct btree_node_iter *, -+ struct bkey_i *); -+ -+int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64); -+int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64); -+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); -+ -+void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, -+ struct bkey_i *, u64); -+ -+enum btree_insert_flags { -+ /* First bits for bch_watermark: */ -+ __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS, -+ __BTREE_INSERT_NOCHECK_RW, -+ __BTREE_INSERT_LAZY_RW, -+ __BTREE_INSERT_JOURNAL_REPLAY, -+ __BTREE_INSERT_JOURNAL_RECLAIM, -+ __BTREE_INSERT_NOWAIT, -+ __BTREE_INSERT_GC_LOCK_HELD, -+ __BCH_HASH_SET_MUST_CREATE, -+ __BCH_HASH_SET_MUST_REPLACE, -+}; -+ -+/* Don't check for -ENOSPC: */ -+#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL) -+ -+#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW) -+#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW) -+ -+/* Insert is for journal replay - don't get journal reservations: */ -+#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY) -+ -+/* Insert is being called from journal reclaim path: */ -+#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM) -+ -+/* Don't block on allocation failure (for new btree nodes: */ -+#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT) -+#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD) -+ -+#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE) -+#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE) -+ -+int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, -+ unsigned, unsigned); -+int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -+int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); -+ -+int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, -+ struct bkey_i *, enum btree_update_flags); -+ -+int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *, -+ enum btree_update_flags); -+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, -+ struct disk_reservation *, u64 *, int flags); -+ -+int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, -+ struct bpos, struct bpos, unsigned, u64 *); -+int bch2_btree_delete_range(struct bch_fs *, enum btree_id, -+ struct bpos, struct bpos, unsigned, u64 *); -+ -+int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); -+ -+int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, -+ struct bpos, struct bpos); -+ -+/* -+ * For use when splitting extents in existing snapshots: -+ * -+ * If @old_pos is an interior snapshot node, iterate over 
descendant snapshot -+ * nodes: for every descendant snapshot in which @old_pos is overwritten and -+ * not visible, emit a whiteout at @new_pos. -+ */ -+static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, -+ enum btree_id btree, -+ struct bpos old_pos, -+ struct bpos new_pos) -+{ -+ if (!btree_type_has_snapshots(btree) || -+ bkey_eq(old_pos, new_pos)) -+ return 0; -+ -+ return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); -+} -+ -+int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, -+ enum btree_update_flags, -+ struct bkey_s_c, struct bkey_s_c); -+ -+int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, -+ enum btree_id, struct bpos); -+ -+int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, enum btree_update_flags); -+int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *, -+ struct bkey_i *, enum btree_update_flags); -+int __must_check bch2_trans_update_buffered(struct btree_trans *, -+ enum btree_id, struct bkey_i *); -+ -+void bch2_trans_commit_hook(struct btree_trans *, -+ struct btree_trans_commit_hook *); -+int __bch2_trans_commit(struct btree_trans *, unsigned); -+ -+int bch2_fs_log_msg(struct bch_fs *, const char *, ...); -+int bch2_journal_log_msg(struct bch_fs *, const char *, ...); -+ -+/** -+ * bch2_trans_commit - insert keys at given iterator positions -+ * -+ * This is the main entry point for btree updates. -+ * -+ * Return values: -+ * -EROFS: filesystem read only -+ * -EIO: journal or btree node IO error -+ */ -+static inline int bch2_trans_commit(struct btree_trans *trans, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, -+ unsigned flags) -+{ -+ trans->disk_res = disk_res; -+ trans->journal_seq = journal_seq; -+ -+ return __bch2_trans_commit(trans, flags); -+} -+ -+#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ -+ lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ -+ (_journal_seq), (_flags))) -+ -+#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ -+ nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ -+ (_journal_seq), (_flags))) -+ -+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ -+({ \ -+ struct btree_trans trans; \ -+ int _ret; \ -+ \ -+ bch2_trans_init(&trans, (_c), 0, 0); \ -+ _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \ -+ bch2_trans_exit(&trans); \ -+ \ -+ _ret; \ -+}) -+ -+#define bch2_trans_run(_c, _do) \ -+({ \ -+ struct btree_trans trans; \ -+ int _ret; \ -+ \ -+ bch2_trans_init(&trans, (_c), 0, 0); \ -+ _ret = (_do); \ -+ bch2_trans_exit(&trans); \ -+ \ -+ _ret; \ -+}) -+ -+#define trans_for_each_update(_trans, _i) \ -+ for ((_i) = (_trans)->updates; \ -+ (_i) < (_trans)->updates + (_trans)->nr_updates; \ -+ (_i)++) -+ -+#define trans_for_each_wb_update(_trans, _i) \ -+ for ((_i) = (_trans)->wb_updates; \ -+ (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \ -+ (_i)++) -+ -+static inline void bch2_trans_reset_updates(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) -+ bch2_path_put(trans, i->path, true); -+ -+ trans->extra_journal_res = 0; -+ trans->nr_updates = 0; -+ trans->nr_wb_updates = 0; -+ trans->wb_updates = NULL; -+ trans->hooks = NULL; -+ trans->extra_journal_entries.nr = 0; -+ -+ if (trans->fs_usage_deltas) { -+ trans->fs_usage_deltas->used = 0; -+ memset((void *) trans->fs_usage_deltas + -+ 
offsetof(struct replicas_delta_list, memset_start), 0, -+ (void *) &trans->fs_usage_deltas->memset_end - -+ (void *) &trans->fs_usage_deltas->memset_start); -+ } -+} -+ -+static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, -+ unsigned type, unsigned min_bytes) -+{ -+ unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); -+ struct bkey_i *mut; -+ -+ if (type && k.k->type != type) -+ return ERR_PTR(-ENOENT); -+ -+ mut = bch2_trans_kmalloc_nomemzero(trans, bytes); -+ if (!IS_ERR(mut)) { -+ bkey_reassemble(mut, k); -+ -+ if (unlikely(bytes > bkey_bytes(k.k))) { -+ memset((void *) mut + bkey_bytes(k.k), 0, -+ bytes - bkey_bytes(k.k)); -+ mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64)); -+ } -+ } -+ return mut; -+} -+ -+static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) -+{ -+ return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); -+} -+ -+#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \ -+ bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \ -+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) -+ -+static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c *k, unsigned flags, -+ unsigned type, unsigned min_bytes) -+{ -+ struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); -+ int ret; -+ -+ if (IS_ERR(mut)) -+ return mut; -+ -+ ret = bch2_trans_update(trans, iter, mut, flags); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ *k = bkey_i_to_s_c(mut); -+ return mut; -+} -+ -+static inline struct bkey_i *bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c *k, unsigned flags) -+{ -+ return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); -+} -+ -+#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \ -+ bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\ -+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) -+ -+static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags, unsigned type, unsigned min_bytes) -+{ -+ struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, -+ btree_id, pos, flags|BTREE_ITER_INTENT, type); -+ struct bkey_i *ret = IS_ERR(k.k) -+ ? 
ERR_CAST(k.k) -+ : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); -+ if (IS_ERR(ret)) -+ bch2_trans_iter_exit(trans, iter); -+ return ret; -+} -+ -+static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags) -+{ -+ return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); -+} -+ -+static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags, unsigned type, unsigned min_bytes) -+{ -+ struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, -+ btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes); -+ int ret; -+ -+ if (IS_ERR(mut)) -+ return mut; -+ -+ ret = bch2_trans_update(trans, iter, mut, flags); -+ if (ret) { -+ bch2_trans_iter_exit(trans, iter); -+ return ERR_PTR(ret); -+ } -+ -+ return mut; -+} -+ -+static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags, unsigned min_bytes) -+{ -+ return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); -+} -+ -+static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags) -+{ -+ return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); -+} -+ -+#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ -+ bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \ -+ _btree_id, _pos, _flags, \ -+ KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) -+ -+static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, -+ unsigned flags, unsigned type, unsigned val_size) -+{ -+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); -+ int ret; -+ -+ if (IS_ERR(k)) -+ return k; -+ -+ bkey_init(&k->k); -+ k->k.p = iter->pos; -+ k->k.type = type; -+ set_bkey_val_bytes(&k->k, val_size); -+ -+ ret = bch2_trans_update(trans, iter, k, flags); -+ if (unlikely(ret)) -+ return ERR_PTR(ret); -+ return k; -+} -+ -+#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \ -+ bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \ -+ KEY_TYPE_##_type, sizeof(struct bch_##_type))) -+ -+#endif /* _BCACHEFS_BTREE_UPDATE_H */ -diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c -new file mode 100644 -index 000000000..c741150e6 ---- /dev/null -+++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2488 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "btree_cache.h" -+#include "btree_gc.h" -+#include "btree_journal_iter.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "buckets.h" -+#include "clock.h" -+#include "error.h" -+#include "extents.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "keylist.h" -+#include "replicas.h" -+#include "super-io.h" -+#include "trace.h" -+ -+#include -+ -+static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, -+ struct btree_path *, struct btree *, -+ struct keylist *, unsigned); -+static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -+ -+static struct btree_path *get_unlocked_mut_path(struct 
btree_trans *trans, -+ enum btree_id btree_id, -+ unsigned level, -+ struct bpos pos) -+{ -+ struct btree_path *path; -+ -+ path = bch2_path_get(trans, btree_id, pos, level + 1, level, -+ BTREE_ITER_NOPRESERVE| -+ BTREE_ITER_INTENT, _RET_IP_); -+ path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_); -+ bch2_btree_path_downgrade(trans, path); -+ __bch2_btree_path_unlock(trans, path); -+ return path; -+} -+ -+/* Debug code: */ -+ -+/* -+ * Verify that child nodes correctly span parent node's range: -+ */ -+static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bpos next_node = b->data->min_key; -+ struct btree_node_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_btree_ptr_v2 bp; -+ struct bkey unpacked; -+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; -+ -+ BUG_ON(!b->c.level); -+ -+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) -+ return; -+ -+ bch2_btree_node_iter_init_from_start(&iter, b); -+ -+ while (1) { -+ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); -+ if (k.k->type != KEY_TYPE_btree_ptr_v2) -+ break; -+ bp = bkey_s_c_to_btree_ptr_v2(k); -+ -+ if (!bpos_eq(next_node, bp.v->min_key)) { -+ bch2_dump_btree_node(c, b); -+ bch2_bpos_to_text(&buf1, next_node); -+ bch2_bpos_to_text(&buf2, bp.v->min_key); -+ panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); -+ } -+ -+ bch2_btree_node_iter_advance(&iter, b); -+ -+ if (bch2_btree_node_iter_end(&iter)) { -+ if (!bpos_eq(k.k->p, b->key.k.p)) { -+ bch2_dump_btree_node(c, b); -+ bch2_bpos_to_text(&buf1, b->key.k.p); -+ bch2_bpos_to_text(&buf2, k.k->p); -+ panic("expected end %s got %s\n", buf1.buf, buf2.buf); -+ } -+ break; -+ } -+ -+ next_node = bpos_successor(k.k->p); -+ } -+#endif -+} -+ -+/* Calculate ideal packed bkey format for new btree nodes: */ -+ -+void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) -+{ -+ struct bkey_packed *k; -+ struct bset_tree *t; -+ struct bkey uk; -+ -+ for_each_bset(b, t) -+ bset_tree_for_each_key(b, t, k) -+ if (!bkey_deleted(k)) { -+ uk = bkey_unpack_key(b, k); -+ bch2_bkey_format_add_key(s, &uk); -+ } -+} -+ -+static struct bkey_format bch2_btree_calc_format(struct btree *b) -+{ -+ struct bkey_format_state s; -+ -+ bch2_bkey_format_init(&s); -+ bch2_bkey_format_add_pos(&s, b->data->min_key); -+ bch2_bkey_format_add_pos(&s, b->data->max_key); -+ __bch2_btree_calc_format(&s, b); -+ -+ return bch2_bkey_format_done(&s); -+} -+ -+static size_t btree_node_u64s_with_format(struct btree *b, -+ struct bkey_format *new_f) -+{ -+ struct bkey_format *old_f = &b->format; -+ -+ /* stupid integer promotion rules */ -+ ssize_t delta = -+ (((int) new_f->key_u64s - old_f->key_u64s) * -+ (int) b->nr.packed_keys) + -+ (((int) new_f->key_u64s - BKEY_U64s) * -+ (int) b->nr.unpacked_keys); -+ -+ BUG_ON(delta + b->nr.live_u64s < 0); -+ -+ return b->nr.live_u64s + delta; -+} -+ -+/** -+ * btree_node_format_fits - check if we could rewrite node with a new format -+ * -+ * This assumes all keys can pack with the new format -- it just checks if -+ * the re-packed keys would fit inside the node itself. 
-+ */ -+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, -+ struct bkey_format *new_f) -+{ -+ size_t u64s = btree_node_u64s_with_format(b, new_f); -+ -+ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); -+} -+ -+/* Btree node freeing/allocation: */ -+ -+static void __btree_node_free(struct bch_fs *c, struct btree *b) -+{ -+ trace_and_count(c, btree_node_free, c, b); -+ -+ BUG_ON(btree_node_write_blocked(b)); -+ BUG_ON(btree_node_dirty(b)); -+ BUG_ON(btree_node_need_write(b)); -+ BUG_ON(b == btree_node_root(c, b)); -+ BUG_ON(b->ob.nr); -+ BUG_ON(!list_empty(&b->write_blocked)); -+ BUG_ON(b->will_make_reachable); -+ -+ clear_btree_node_noevict(b); -+ -+ mutex_lock(&c->btree_cache.lock); -+ list_move(&b->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+} -+ -+static void bch2_btree_node_free_inmem(struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned level = b->c.level; -+ -+ bch2_btree_node_lock_write_nofail(trans, path, &b->c); -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED); -+ -+ trans_for_each_path(trans, path) -+ if (path->l[level].b == b) { -+ btree_node_unlock(trans, path, level); -+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); -+ } -+} -+ -+static void bch2_btree_node_free_never_used(struct btree_update *as, -+ struct btree_trans *trans, -+ struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; -+ struct btree_path *path; -+ unsigned level = b->c.level; -+ -+ BUG_ON(!list_empty(&b->write_blocked)); -+ BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); -+ -+ b->will_make_reachable = 0; -+ closure_put(&as->cl); -+ -+ clear_btree_node_will_make_reachable(b); -+ clear_btree_node_accessed(b); -+ clear_btree_node_dirty_acct(c, b); -+ clear_btree_node_need_write(b); -+ -+ mutex_lock(&c->btree_cache.lock); -+ list_del_init(&b->list); -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ mutex_unlock(&c->btree_cache.lock); -+ -+ BUG_ON(p->nr >= ARRAY_SIZE(p->b)); -+ p->b[p->nr++] = b; -+ -+ six_unlock_intent(&b->c.lock); -+ -+ trans_for_each_path(trans, path) -+ if (path->l[level].b == b) { -+ btree_node_unlock(trans, path, level); -+ path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); -+ } -+} -+ -+static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, -+ struct disk_reservation *res, -+ struct closure *cl, -+ bool interior_node, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct write_point *wp; -+ struct btree *b; -+ BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; -+ struct open_buckets ob = { .nr = 0 }; -+ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; -+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; -+ unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim -+ ? 
BTREE_NODE_RESERVE -+ : 0; -+ int ret; -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ if (c->btree_reserve_cache_nr > nr_reserve) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; -+ -+ ob = a->ob; -+ bkey_copy(&tmp.k, &a->k); -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ goto mem_alloc; -+ } -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ -+retry: -+ ret = bch2_alloc_sectors_start_trans(trans, -+ c->opts.metadata_target ?: -+ c->opts.foreground_target, -+ 0, -+ writepoint_ptr(&c->btree_write_point), -+ &devs_have, -+ res->nr_replicas, -+ c->opts.metadata_replicas_required, -+ watermark, 0, cl, &wp); -+ if (unlikely(ret)) -+ return ERR_PTR(ret); -+ -+ if (wp->sectors_free < btree_sectors(c)) { -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ if (ob->sectors_free < btree_sectors(c)) -+ ob->sectors_free = 0; -+ -+ bch2_alloc_sectors_done(c, wp); -+ goto retry; -+ } -+ -+ bkey_btree_ptr_v2_init(&tmp.k); -+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); -+ -+ bch2_open_bucket_get(c, wp, &ob); -+ bch2_alloc_sectors_done(c, wp); -+mem_alloc: -+ b = bch2_btree_node_mem_alloc(trans, interior_node); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ -+ /* we hold cannibalize_lock: */ -+ BUG_ON(IS_ERR(b)); -+ BUG_ON(b->ob.nr); -+ -+ bkey_copy(&b->key, &tmp.k); -+ b->ob = ob; -+ -+ return b; -+} -+ -+static struct btree *bch2_btree_node_alloc(struct btree_update *as, -+ struct btree_trans *trans, -+ unsigned level) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b; -+ struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; -+ int ret; -+ -+ BUG_ON(level >= BTREE_MAX_DEPTH); -+ BUG_ON(!p->nr); -+ -+ b = p->b[--p->nr]; -+ -+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); -+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); -+ -+ set_btree_node_accessed(b); -+ set_btree_node_dirty_acct(c, b); -+ set_btree_node_need_write(b); -+ -+ bch2_bset_init_first(b, &b->data->keys); -+ b->c.level = level; -+ b->c.btree_id = as->btree_id; -+ b->version_ondisk = c->sb.version; -+ -+ memset(&b->nr, 0, sizeof(b->nr)); -+ b->data->magic = cpu_to_le64(bset_magic(c)); -+ memset(&b->data->_ptr, 0, sizeof(b->data->_ptr)); -+ b->data->flags = 0; -+ SET_BTREE_NODE_ID(b->data, as->btree_id); -+ SET_BTREE_NODE_LEVEL(b->data, level); -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); -+ -+ bp->v.mem_ptr = 0; -+ bp->v.seq = b->data->keys.seq; -+ bp->v.sectors_written = 0; -+ } -+ -+ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); -+ BUG_ON(ret); -+ -+ trace_and_count(c, btree_node_alloc, c, b); -+ bch2_increment_clock(c, btree_sectors(c), WRITE); -+ return b; -+} -+ -+static void btree_set_min(struct btree *b, struct bpos pos) -+{ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) -+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; -+ b->data->min_key = pos; -+} -+ -+static void btree_set_max(struct btree *b, struct bpos pos) -+{ -+ b->key.k.p = pos; -+ b->data->max_key = pos; -+} -+ -+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, -+ struct btree_trans *trans, -+ struct btree *b) -+{ -+ struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level); -+ struct bkey_format format = bch2_btree_calc_format(b); -+ -+ /* -+ * The keys might expand with the 
new format - if they wouldn't fit in -+ * the btree node anymore, use the old format for now: -+ */ -+ if (!bch2_btree_node_format_fits(as->c, b, &format)) -+ format = b->format; -+ -+ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); -+ -+ btree_set_min(n, b->data->min_key); -+ btree_set_max(n, b->data->max_key); -+ -+ n->data->format = format; -+ btree_node_set_format(n, format); -+ -+ bch2_btree_sort_into(as->c, n, b); -+ -+ btree_node_reset_sib_u64s(n); -+ return n; -+} -+ -+static struct btree *__btree_root_alloc(struct btree_update *as, -+ struct btree_trans *trans, unsigned level) -+{ -+ struct btree *b = bch2_btree_node_alloc(as, trans, level); -+ -+ btree_set_min(b, POS_MIN); -+ btree_set_max(b, SPOS_MAX); -+ b->data->format = bch2_btree_calc_format(b); -+ -+ btree_node_set_format(b, b->data->format); -+ bch2_btree_build_aux_trees(b); -+ -+ return b; -+} -+ -+static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans) -+{ -+ struct bch_fs *c = as->c; -+ struct prealloc_nodes *p; -+ -+ for (p = as->prealloc_nodes; -+ p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); -+ p++) { -+ while (p->nr) { -+ struct btree *b = p->b[--p->nr]; -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ -+ if (c->btree_reserve_cache_nr < -+ ARRAY_SIZE(c->btree_reserve_cache)) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; -+ -+ a->ob = b->ob; -+ b->ob.nr = 0; -+ bkey_copy(&a->k, &b->key); -+ } else { -+ bch2_open_buckets_put(c, &b->ob); -+ } -+ -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ -+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); -+ btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ } -+ } -+} -+ -+static int bch2_btree_reserve_get(struct btree_trans *trans, -+ struct btree_update *as, -+ unsigned nr_nodes[2], -+ unsigned flags, -+ struct closure *cl) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b; -+ unsigned interior; -+ int ret = 0; -+ -+ BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); -+ -+ /* -+ * Protects reaping from the btree node cache and using the btree node -+ * open bucket reserve: -+ * -+ * BTREE_INSERT_NOWAIT only applies to btree node allocation, not -+ * blocking on this lock: -+ */ -+ ret = bch2_btree_cache_cannibalize_lock(c, cl); -+ if (ret) -+ return ret; -+ -+ for (interior = 0; interior < 2; interior++) { -+ struct prealloc_nodes *p = as->prealloc_nodes + interior; -+ -+ while (p->nr < nr_nodes[interior]) { -+ b = __bch2_btree_node_alloc(trans, &as->disk_res, -+ flags & BTREE_INSERT_NOWAIT ? 
NULL : cl, -+ interior, flags); -+ if (IS_ERR(b)) { -+ ret = PTR_ERR(b); -+ goto err; -+ } -+ -+ p->b[p->nr++] = b; -+ } -+ } -+err: -+ bch2_btree_cache_cannibalize_unlock(c); -+ return ret; -+} -+ -+/* Asynchronous interior node update machinery */ -+ -+static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans) -+{ -+ struct bch_fs *c = as->c; -+ -+ if (as->took_gc_lock) -+ up_read(&c->gc_lock); -+ as->took_gc_lock = false; -+ -+ bch2_journal_preres_put(&c->journal, &as->journal_preres); -+ -+ bch2_journal_pin_drop(&c->journal, &as->journal); -+ bch2_journal_pin_flush(&c->journal, &as->journal); -+ bch2_disk_reservation_put(c, &as->disk_res); -+ bch2_btree_reserve_put(as, trans); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], -+ as->start_time); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_del(&as->unwritten_list); -+ list_del(&as->list); -+ -+ closure_debug_destroy(&as->cl); -+ mempool_free(as, &c->btree_interior_update_pool); -+ -+ /* -+ * Have to do the wakeup with btree_interior_update_lock still held, -+ * since being on btree_interior_update_list is our ref on @c: -+ */ -+ closure_wake_up(&c->btree_interior_update_wait); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+static void btree_update_add_key(struct btree_update *as, -+ struct keylist *keys, struct btree *b) -+{ -+ struct bkey_i *k = &b->key; -+ -+ BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > -+ ARRAY_SIZE(as->_old_keys)); -+ -+ bkey_copy(keys->top, k); -+ bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; -+ -+ bch2_keylist_push(keys); -+} -+ -+/* -+ * The transactional part of an interior btree node update, where we journal the -+ * update we did to the interior node and update alloc info: -+ */ -+static int btree_update_nodes_written_trans(struct btree_trans *trans, -+ struct btree_update *as) -+{ -+ struct bkey_i *k; -+ int ret; -+ -+ ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); -+ if (ret) -+ return ret; -+ -+ memcpy(&darray_top(trans->extra_journal_entries), -+ as->journal_entries, -+ as->journal_u64s * sizeof(u64)); -+ trans->extra_journal_entries.nr += as->journal_u64s; -+ -+ trans->journal_pin = &as->journal; -+ -+ for_each_keylist_key(&as->old_keys, k) { -+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; -+ -+ ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); -+ if (ret) -+ return ret; -+ } -+ -+ for_each_keylist_key(&as->new_keys, k) { -+ unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; -+ -+ ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void btree_update_nodes_written(struct btree_update *as) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b; -+ struct btree_trans trans; -+ u64 journal_seq = 0; -+ unsigned i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 512); -+ /* -+ * If we're already in an error state, it might be because a btree node -+ * was never written, and we might be trying to free that same btree -+ * node here, but it won't have been marked as allocated and we'll see -+ * spurious disk usage inconsistencies in the transactional part below -+ * if we don't skip it: -+ */ -+ ret = bch2_journal_error(&c->journal); -+ if (ret) -+ goto err; -+ -+ /* -+ * Wait for any in flight writes to finish before we free the old nodes -+ * on disk: -+ */ -+ for (i = 0; i < as->nr_old_nodes; i++) { -+ __le64 seq; -+ -+ b = as->old_nodes[i]; -+ -+ 
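-+ /*
-+ * Peek at the node's sequence number under a read lock: if it still
-+ * matches the seq recorded when this node was marked for freeing, a
-+ * write may still be in flight, so wait for it below before the old
-+ * node is freed on disk.
-+ */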
btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); -+ seq = b->data ? b->data->keys.seq : 0; -+ six_unlock_read(&b->c.lock); -+ -+ if (seq == as->old_nodes_seq[i]) -+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, -+ TASK_UNINTERRUPTIBLE); -+ } -+ -+ /* -+ * We did an update to a parent node where the pointers we added pointed -+ * to child nodes that weren't written yet: now, the child nodes have -+ * been written so we can write out the update to the interior node. -+ */ -+ -+ /* -+ * We can't call into journal reclaim here: we'd block on the journal -+ * reclaim lock, but we may need to release the open buckets we have -+ * pinned in order for other btree updates to make forward progress, and -+ * journal reclaim does btree updates when flushing bkey_cached entries, -+ * which may require allocations as well. -+ */ -+ ret = commit_do(&trans, &as->disk_res, &journal_seq, -+ BCH_WATERMARK_reclaim| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_JOURNAL_RECLAIM, -+ btree_update_nodes_written_trans(&trans, as)); -+ bch2_trans_unlock(&trans); -+ -+ bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, -+ "%s(): error %s", __func__, bch2_err_str(ret)); -+err: -+ if (as->b) { -+ struct btree_path *path; -+ -+ b = as->b; -+ path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p); -+ /* -+ * @b is the node we did the final insert into: -+ * -+ * On failure to get a journal reservation, we still have to -+ * unblock the write and allow most of the write path to happen -+ * so that shutdown works, but the i->journal_seq mechanism -+ * won't work to prevent the btree write from being visible (we -+ * didn't get a journal sequence number) - instead -+ * __bch2_btree_node_write() doesn't do the actual write if -+ * we're in journal error state: -+ */ -+ -+ /* -+ * Ensure transaction is unlocked before using -+ * btree_node_lock_nopath() (the use of which is always suspect, -+ * we need to work on removing this in the future) -+ * -+ * It should be, but get_unlocked_mut_path() -> bch2_path_get() -+ * calls bch2_path_upgrade(), before we call path_make_mut(), so -+ * we may rarely end up with a locked path besides the one we -+ * have here: -+ */ -+ bch2_trans_unlock(&trans); -+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); -+ mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); -+ path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); -+ path->l[b->c.level].b = b; -+ -+ bch2_btree_node_lock_write_nofail(&trans, path, &b->c); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ -+ list_del(&as->write_blocked_list); -+ if (list_empty(&b->write_blocked)) -+ clear_btree_node_write_blocked(b); -+ -+ /* -+ * Node might have been freed, recheck under -+ * btree_interior_update_lock: -+ */ -+ if (as->b == b) { -+ struct bset *i = btree_bset_last(b); -+ -+ BUG_ON(!b->c.level); -+ BUG_ON(!btree_node_dirty(b)); -+ -+ if (!ret) { -+ i->journal_seq = cpu_to_le64( -+ max(journal_seq, -+ le64_to_cpu(i->journal_seq))); -+ -+ bch2_btree_add_journal_pin(c, b, journal_seq); -+ } else { -+ /* -+ * If we didn't get a journal sequence number we -+ * can't write this btree node, because recovery -+ * won't know to ignore this write: -+ */ -+ set_btree_node_never_write(b); -+ } -+ } -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED); -+ six_unlock_write(&b->c.lock); -+ -+ btree_node_write_if_need(c, b, SIX_LOCK_intent); -+ 
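-+ /* Done with @b: drop the lock held via @path and release the path itself. */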
btree_node_unlock(&trans, path, b->c.level); -+ bch2_path_put(&trans, path, true); -+ } -+ -+ bch2_journal_pin_drop(&c->journal, &as->journal); -+ -+ bch2_journal_preres_put(&c->journal, &as->journal_preres); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ for (i = 0; i < as->nr_new_nodes; i++) { -+ b = as->new_nodes[i]; -+ -+ BUG_ON(b->will_make_reachable != (unsigned long) as); -+ b->will_make_reachable = 0; -+ clear_btree_node_will_make_reachable(b); -+ } -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ for (i = 0; i < as->nr_new_nodes; i++) { -+ b = as->new_nodes[i]; -+ -+ btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); -+ btree_node_write_if_need(c, b, SIX_LOCK_read); -+ six_unlock_read(&b->c.lock); -+ } -+ -+ for (i = 0; i < as->nr_open_buckets; i++) -+ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); -+ -+ bch2_btree_update_free(as, &trans); -+ bch2_trans_exit(&trans); -+} -+ -+static void btree_interior_update_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, btree_interior_update_work); -+ struct btree_update *as; -+ -+ while (1) { -+ mutex_lock(&c->btree_interior_update_lock); -+ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, -+ struct btree_update, unwritten_list); -+ if (as && !as->nodes_written) -+ as = NULL; -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ if (!as) -+ break; -+ -+ btree_update_nodes_written(as); -+ } -+} -+ -+static void btree_update_set_nodes_written(struct closure *cl) -+{ -+ struct btree_update *as = container_of(cl, struct btree_update, cl); -+ struct bch_fs *c = as->c; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ as->nodes_written = true; -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); -+} -+ -+/* -+ * We're updating @b with pointers to nodes that haven't finished writing yet: -+ * block @b from being written until @as completes -+ */ -+static void btree_update_updated_node(struct btree_update *as, struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); -+ -+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); -+ BUG_ON(!btree_node_dirty(b)); -+ BUG_ON(!b->c.level); -+ -+ as->mode = BTREE_INTERIOR_UPDATING_NODE; -+ as->b = b; -+ -+ set_btree_node_write_blocked(b); -+ list_add(&as->write_blocked_list, &b->write_blocked); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+static void btree_update_reparent(struct btree_update *as, -+ struct btree_update *child) -+{ -+ struct bch_fs *c = as->c; -+ -+ lockdep_assert_held(&c->btree_interior_update_lock); -+ -+ child->b = NULL; -+ child->mode = BTREE_INTERIOR_UPDATING_AS; -+ -+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); -+} -+ -+static void btree_update_updated_root(struct btree_update *as, struct btree *b) -+{ -+ struct bkey_i *insert = &b->key; -+ struct bch_fs *c = as->c; -+ -+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); -+ -+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > -+ ARRAY_SIZE(as->journal_entries)); -+ -+ as->journal_u64s += -+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], -+ BCH_JSET_ENTRY_btree_root, -+ b->c.btree_id, b->c.level, -+ insert, insert->k.u64s); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); -+ -+ as->mode = 
BTREE_INTERIOR_UPDATING_ROOT; -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+/* -+ * bch2_btree_update_add_new_node: -+ * -+ * This causes @as to wait on @b to be written, before it gets to -+ * bch2_btree_update_nodes_written -+ * -+ * Additionally, it sets b->will_make_reachable to prevent any additional writes -+ * to @b from happening besides the first until @b is reachable on disk -+ * -+ * And it adds @b to the list of @as's new nodes, so that we can update sector -+ * counts in bch2_btree_update_nodes_written: -+ */ -+static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ -+ closure_get(&as->cl); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); -+ BUG_ON(b->will_make_reachable); -+ -+ as->new_nodes[as->nr_new_nodes++] = b; -+ b->will_make_reachable = 1UL|(unsigned long) as; -+ set_btree_node_will_make_reachable(b); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ btree_update_add_key(as, &as->new_keys, b); -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data; -+ unsigned sectors = round_up(bytes, block_bytes(c)) >> 9; -+ -+ bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = -+ cpu_to_le16(sectors); -+ } -+} -+ -+/* -+ * returns true if @b was a new node -+ */ -+static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_update *as; -+ unsigned long v; -+ unsigned i; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ /* -+ * When b->will_make_reachable != 0, it owns a ref on as->cl that's -+ * dropped when it gets written by bch2_btree_complete_write - the -+ * xchg() is for synchronization with bch2_btree_complete_write: -+ */ -+ v = xchg(&b->will_make_reachable, 0); -+ clear_btree_node_will_make_reachable(b); -+ as = (struct btree_update *) (v & ~1UL); -+ -+ if (!as) { -+ mutex_unlock(&c->btree_interior_update_lock); -+ return; -+ } -+ -+ for (i = 0; i < as->nr_new_nodes; i++) -+ if (as->new_nodes[i] == b) -+ goto found; -+ -+ BUG(); -+found: -+ array_remove_item(as->new_nodes, as->nr_new_nodes, i); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ if (v & 1) -+ closure_put(&as->cl); -+} -+ -+static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) -+{ -+ while (b->ob.nr) -+ as->open_buckets[as->nr_open_buckets++] = -+ b->ob.v[--b->ob.nr]; -+} -+ -+/* -+ * @b is being split/rewritten: it may have pointers to not-yet-written btree -+ * nodes and thus outstanding btree_updates - redirect @b's -+ * btree_updates to point to this btree_update: -+ */ -+static void bch2_btree_interior_update_will_free_node(struct btree_update *as, -+ struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ struct btree_update *p, *n; -+ struct btree_write *w; -+ -+ set_btree_node_dying(b); -+ -+ if (btree_node_fake(b)) -+ return; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ -+ /* -+ * Does this node have any btree_update operations preventing -+ * it from being written? 
-+ * -+ * If so, redirect them to point to this btree_update: we can -+ * write out our new nodes, but we won't make them visible until those -+ * operations complete -+ */ -+ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { -+ list_del_init(&p->write_blocked_list); -+ btree_update_reparent(as, p); -+ -+ /* -+ * for flush_held_btree_writes() waiting on updates to flush or -+ * nodes to be writeable: -+ */ -+ closure_wake_up(&c->btree_interior_update_wait); -+ } -+ -+ clear_btree_node_dirty_acct(c, b); -+ clear_btree_node_need_write(b); -+ clear_btree_node_write_blocked(b); -+ -+ /* -+ * Does this node have unwritten data that has a pin on the journal? -+ * -+ * If so, transfer that pin to the btree_update operation - -+ * note that if we're freeing multiple nodes, we only need to keep the -+ * oldest pin of any of the nodes we're freeing. We'll release the pin -+ * when the new nodes are persistent and reachable on disk: -+ */ -+ w = btree_current_write(b); -+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+ -+ w = btree_prev_write(b); -+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ /* -+ * Is this a node that isn't reachable on disk yet? -+ * -+ * Nodes that aren't reachable yet have writes blocked until they're -+ * reachable - now that we've cancelled any pending writes and moved -+ * things waiting on that write to wait on this update, we can drop this -+ * node from the list of nodes that the other update is making -+ * reachable, prior to freeing it: -+ */ -+ btree_update_drop_new_node(c, b); -+ -+ btree_update_add_key(as, &as->old_keys, b); -+ -+ as->old_nodes[as->nr_old_nodes] = b; -+ as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; -+ as->nr_old_nodes++; -+} -+ -+static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans) -+{ -+ struct bch_fs *c = as->c; -+ u64 start_time = as->start_time; -+ -+ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); -+ -+ if (as->took_gc_lock) -+ up_read(&as->c->gc_lock); -+ as->took_gc_lock = false; -+ -+ bch2_btree_reserve_put(as, trans); -+ -+ continue_at(&as->cl, btree_update_set_nodes_written, -+ as->c->btree_interior_update_worker); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], -+ start_time); -+} -+ -+static struct btree_update * -+bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, -+ unsigned level, bool split, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_update *as; -+ u64 start_time = local_clock(); -+ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) -+ ? 
BCH_DISK_RESERVATION_NOFAIL : 0; -+ unsigned nr_nodes[2] = { 0, 0 }; -+ unsigned update_level = level; -+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; -+ unsigned journal_flags = 0; -+ int ret = 0; -+ u32 restart_count = trans->restart_count; -+ -+ BUG_ON(!path->should_be_locked); -+ -+ if (watermark == BCH_WATERMARK_copygc) -+ watermark = BCH_WATERMARK_btree_copygc; -+ if (watermark < BCH_WATERMARK_btree) -+ watermark = BCH_WATERMARK_btree; -+ -+ flags &= ~BCH_WATERMARK_MASK; -+ flags |= watermark; -+ -+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) -+ journal_flags |= JOURNAL_RES_GET_NONBLOCK; -+ journal_flags |= watermark; -+ -+ while (1) { -+ nr_nodes[!!update_level] += 1 + split; -+ update_level++; -+ -+ ret = bch2_btree_path_upgrade(trans, path, update_level + 1); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ if (!btree_path_node(path, update_level)) { -+ /* Allocating new root? */ -+ nr_nodes[1] += split; -+ update_level = BTREE_MAX_DEPTH; -+ break; -+ } -+ -+ if (bch2_btree_node_insert_fits(c, path->l[update_level].b, -+ BKEY_BTREE_PTR_U64s_MAX * (1 + split))) -+ break; -+ -+ split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); -+ } -+ -+ if (flags & BTREE_INSERT_GC_LOCK_HELD) -+ lockdep_assert_held(&c->gc_lock); -+ else if (!down_read_trylock(&c->gc_lock)) { -+ ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); -+ if (ret) { -+ up_read(&c->gc_lock); -+ return ERR_PTR(ret); -+ } -+ } -+ -+ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); -+ memset(as, 0, sizeof(*as)); -+ closure_init(&as->cl, NULL); -+ as->c = c; -+ as->start_time = start_time; -+ as->mode = BTREE_INTERIOR_NO_UPDATE; -+ as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); -+ as->btree_id = path->btree_id; -+ as->update_level = update_level; -+ INIT_LIST_HEAD(&as->list); -+ INIT_LIST_HEAD(&as->unwritten_list); -+ INIT_LIST_HEAD(&as->write_blocked_list); -+ bch2_keylist_init(&as->old_keys, as->_old_keys); -+ bch2_keylist_init(&as->new_keys, as->_new_keys); -+ bch2_keylist_init(&as->parent_keys, as->inline_keys); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->list, &c->btree_interior_update_list); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ /* -+ * We don't want to allocate if we're in an error state, that can cause -+ * deadlock on emergency shutdown due to open buckets getting stuck in -+ * the btree_reserve_cache after allocator shutdown has cleared it out. -+ * This check needs to come after adding us to the btree_interior_update -+ * list but before calling bch2_btree_reserve_get, to synchronize with -+ * __bch2_fs_read_only(). 
-+ */ -+ ret = bch2_journal_error(&c->journal); -+ if (ret) -+ goto err; -+ -+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, -+ BTREE_UPDATE_JOURNAL_RES, -+ journal_flags|JOURNAL_RES_GET_NONBLOCK); -+ if (ret) { -+ if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { -+ ret = -BCH_ERR_journal_reclaim_would_deadlock; -+ goto err; -+ } -+ -+ ret = drop_locks_do(trans, -+ bch2_journal_preres_get(&c->journal, &as->journal_preres, -+ BTREE_UPDATE_JOURNAL_RES, -+ journal_flags)); -+ if (ret == -BCH_ERR_journal_preres_get_blocked) { -+ trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); -+ ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); -+ } -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_disk_reservation_get(c, &as->disk_res, -+ (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), -+ c->opts.metadata_replicas, -+ disk_res_flags); -+ if (ret) -+ goto err; -+ -+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); -+ if (bch2_err_matches(ret, ENOSPC) || -+ bch2_err_matches(ret, ENOMEM)) { -+ struct closure cl; -+ -+ /* -+ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK -+ * flag -+ */ -+ if (bch2_err_matches(ret, ENOSPC) && -+ (flags & BTREE_INSERT_JOURNAL_RECLAIM) && -+ watermark != BCH_WATERMARK_reclaim) { -+ ret = -BCH_ERR_journal_reclaim_would_deadlock; -+ goto err; -+ } -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); -+ -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); -+ } -+ -+ if (ret) { -+ trace_and_count(c, btree_reserve_get_fail, trans->fn, -+ _RET_IP_, nr_nodes[0] + nr_nodes[1], ret); -+ goto err; -+ } -+ -+ ret = bch2_trans_relock(trans); -+ if (ret) -+ goto err; -+ -+ bch2_trans_verify_not_restarted(trans, restart_count); -+ return as; -+err: -+ bch2_btree_update_free(as, trans); -+ return ERR_PTR(ret); -+} -+ -+/* Btree root updates: */ -+ -+static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) -+{ -+ /* Root nodes cannot be reaped */ -+ mutex_lock(&c->btree_cache.lock); -+ list_del_init(&b->list); -+ mutex_unlock(&c->btree_cache.lock); -+ -+ mutex_lock(&c->btree_root_lock); -+ BUG_ON(btree_node_root(c, b) && -+ (b->c.level < btree_node_root(c, b)->c.level || -+ !btree_node_dying(btree_node_root(c, b)))); -+ -+ bch2_btree_id_root(c, b->c.btree_id)->b = b; -+ mutex_unlock(&c->btree_root_lock); -+ -+ bch2_recalc_btree_reserve(c); -+} -+ -+/** -+ * bch_btree_set_root - update the root in memory and on disk -+ * -+ * To ensure forward progress, the current task must not be holding any -+ * btree node write locks. However, you must hold an intent lock on the -+ * old root. -+ * -+ * Note: This allocates a journal entry but doesn't add any keys to -+ * it. All the btree roots are part of every journal write, so there -+ * is nothing new to be done. This just guarantees that there is a -+ * journal write. 
-+ */ -+static void bch2_btree_set_root(struct btree_update *as, -+ struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *old; -+ -+ trace_and_count(c, btree_node_set_root, c, b); -+ -+ old = btree_node_root(c, b); -+ -+ /* -+ * Ensure no one is using the old root while we switch to the -+ * new root: -+ */ -+ bch2_btree_node_lock_write_nofail(trans, path, &old->c); -+ -+ bch2_btree_set_root_inmem(c, b); -+ -+ btree_update_updated_root(as, b); -+ -+ /* -+ * Unlock old root after new root is visible: -+ * -+ * The new root isn't persistent, but that's ok: we still have -+ * an intent lock on the new root, and any updates that would -+ * depend on the new root would have to update the new root. -+ */ -+ bch2_btree_node_unlock_write(trans, path, old); -+} -+ -+/* Interior node updates: */ -+ -+static void bch2_insert_fixup_btree_ptr(struct btree_update *as, -+ struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = as->c; -+ struct bkey_packed *k; -+ struct printbuf buf = PRINTBUF; -+ unsigned long old, new, v; -+ -+ BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && -+ !btree_ptr_sectors_written(insert)); -+ -+ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) -+ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); -+ -+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), -+ btree_node_type(b), WRITE, &buf) ?: -+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { -+ printbuf_reset(&buf); -+ prt_printf(&buf, "inserting invalid bkey\n "); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); -+ prt_printf(&buf, "\n "); -+ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), -+ btree_node_type(b), WRITE, &buf); -+ bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); -+ -+ bch2_fs_inconsistent(c, "%s", buf.buf); -+ dump_stack(); -+ } -+ -+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > -+ ARRAY_SIZE(as->journal_entries)); -+ -+ as->journal_u64s += -+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], -+ BCH_JSET_ENTRY_btree_keys, -+ b->c.btree_id, b->c.level, -+ insert, insert->k.u64s); -+ -+ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && -+ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) -+ bch2_btree_node_iter_advance(node_iter, b); -+ -+ bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); -+ set_btree_node_dirty_acct(c, b); -+ -+ v = READ_ONCE(b->flags); -+ do { -+ old = new = v; -+ -+ new &= ~BTREE_WRITE_TYPE_MASK; -+ new |= BTREE_WRITE_interior; -+ new |= 1 << BTREE_NODE_need_write; -+ } while ((v = cmpxchg(&b->flags, old, new)) != old); -+ -+ printbuf_exit(&buf); -+} -+ -+static void -+__bch2_btree_insert_keys_interior(struct btree_update *as, -+ struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b, -+ struct btree_node_iter node_iter, -+ struct keylist *keys) -+{ -+ struct bkey_i *insert = bch2_keylist_front(keys); -+ struct bkey_packed *k; -+ -+ BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); -+ -+ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && -+ (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) -+ ; -+ -+ while (!bch2_keylist_empty(keys)) { -+ struct bkey_i *k = bch2_keylist_front(keys); -+ -+ if (bpos_gt(k->k.p, b->key.k.p)) -+ break; -+ -+ bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k); -+ bch2_keylist_pop_front(keys); -+ } -+} -+ -+/* -+ * Move keys from n1 (original replacement 
node, now lower node) to n2 (higher -+ * node) -+ */ -+static void __btree_split_node(struct btree_update *as, -+ struct btree_trans *trans, -+ struct btree *b, -+ struct btree *n[2]) -+{ -+ struct bkey_packed *k; -+ struct bpos n1_pos = POS_MIN; -+ struct btree_node_iter iter; -+ struct bset *bsets[2]; -+ struct bkey_format_state format[2]; -+ struct bkey_packed *out[2]; -+ struct bkey uk; -+ unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5; -+ int i; -+ -+ for (i = 0; i < 2; i++) { -+ BUG_ON(n[i]->nsets != 1); -+ -+ bsets[i] = btree_bset_first(n[i]); -+ out[i] = bsets[i]->start; -+ -+ SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1); -+ bch2_bkey_format_init(&format[i]); -+ } -+ -+ u64s = 0; -+ for_each_btree_node_key(b, k, &iter) { -+ if (bkey_deleted(k)) -+ continue; -+ -+ i = u64s >= n1_u64s; -+ u64s += k->u64s; -+ uk = bkey_unpack_key(b, k); -+ if (!i) -+ n1_pos = uk.p; -+ bch2_bkey_format_add_key(&format[i], &uk); -+ } -+ -+ btree_set_min(n[0], b->data->min_key); -+ btree_set_max(n[0], n1_pos); -+ btree_set_min(n[1], bpos_successor(n1_pos)); -+ btree_set_max(n[1], b->data->max_key); -+ -+ for (i = 0; i < 2; i++) { -+ bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key); -+ bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key); -+ -+ n[i]->data->format = bch2_bkey_format_done(&format[i]); -+ btree_node_set_format(n[i], n[i]->data->format); -+ } -+ -+ u64s = 0; -+ for_each_btree_node_key(b, k, &iter) { -+ if (bkey_deleted(k)) -+ continue; -+ -+ i = u64s >= n1_u64s; -+ u64s += k->u64s; -+ -+ if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k) -+ ? &b->format: &bch2_bkey_format_current, k)) -+ out[i]->format = KEY_FORMAT_LOCAL_BTREE; -+ else -+ bch2_bkey_unpack(b, (void *) out[i], k); -+ -+ out[i]->needs_whiteout = false; -+ -+ btree_keys_account_key_add(&n[i]->nr, 0, out[i]); -+ out[i] = bkey_p_next(out[i]); -+ } -+ -+ for (i = 0; i < 2; i++) { -+ bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data); -+ -+ BUG_ON(!bsets[i]->u64s); -+ -+ set_btree_bset_end(n[i], n[i]->set); -+ -+ btree_node_reset_sib_u64s(n[i]); -+ -+ bch2_verify_btree_nr_keys(n[i]); -+ -+ if (b->c.level) -+ btree_node_interior_verify(as->c, n[i]); -+ } -+} -+ -+/* -+ * For updates to interior nodes, we've got to do the insert before we split -+ * because the stuff we're inserting has to be inserted atomically. Post split, -+ * the keys might have to go in different nodes and the split would no longer be -+ * atomic. 
-+ * -+ * Worse, if the insert is from btree node coalescing, if we do the insert after -+ * we do the split (and pick the pivot) - the pivot we pick might be between -+ * nodes that were coalesced, and thus in the middle of a child node post -+ * coalescing: -+ */ -+static void btree_split_insert_keys(struct btree_update *as, -+ struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b, -+ struct keylist *keys) -+{ -+ if (!bch2_keylist_empty(keys) && -+ bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { -+ struct btree_node_iter node_iter; -+ -+ bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); -+ -+ __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); -+ -+ btree_node_interior_verify(as->c, b); -+ } -+} -+ -+static int btree_split(struct btree_update *as, struct btree_trans *trans, -+ struct btree_path *path, struct btree *b, -+ struct keylist *keys, unsigned flags) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *parent = btree_node_parent(path, b); -+ struct btree *n1, *n2 = NULL, *n3 = NULL; -+ struct btree_path *path1 = NULL, *path2 = NULL; -+ u64 start_time = local_clock(); -+ int ret = 0; -+ -+ BUG_ON(!parent && (b != btree_node_root(c, b))); -+ BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1)); -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ -+ if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { -+ struct btree *n[2]; -+ -+ trace_and_count(c, btree_node_split, c, b); -+ -+ n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); -+ n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); -+ -+ __btree_split_node(as, trans, b, n); -+ -+ if (keys) { -+ btree_split_insert_keys(as, trans, path, n1, keys); -+ btree_split_insert_keys(as, trans, path, n2, keys); -+ BUG_ON(!bch2_keylist_empty(keys)); -+ } -+ -+ bch2_btree_build_aux_trees(n2); -+ bch2_btree_build_aux_trees(n1); -+ -+ bch2_btree_update_add_new_node(as, n1); -+ bch2_btree_update_add_new_node(as, n2); -+ six_unlock_write(&n2->c.lock); -+ six_unlock_write(&n1->c.lock); -+ -+ path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); -+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent); -+ mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); -+ bch2_btree_path_level_init(trans, path1, n1); -+ -+ path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); -+ six_lock_increment(&n2->c.lock, SIX_LOCK_intent); -+ mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent); -+ bch2_btree_path_level_init(trans, path2, n2); -+ -+ /* -+ * Note that on recursive parent_keys == keys, so we -+ * can't start adding new keys to parent_keys before emptying it -+ * out (which we did with btree_split_insert_keys() above) -+ */ -+ bch2_keylist_add(&as->parent_keys, &n1->key); -+ bch2_keylist_add(&as->parent_keys, &n2->key); -+ -+ if (!parent) { -+ /* Depth increases, make a new root */ -+ n3 = __btree_root_alloc(as, trans, b->c.level + 1); -+ -+ bch2_btree_update_add_new_node(as, n3); -+ six_unlock_write(&n3->c.lock); -+ -+ path2->locks_want++; -+ BUG_ON(btree_node_locked(path2, n3->c.level)); -+ six_lock_increment(&n3->c.lock, SIX_LOCK_intent); -+ mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent); -+ bch2_btree_path_level_init(trans, path2, n3); -+ -+ n3->sib_u64s[0] = U16_MAX; -+ n3->sib_u64s[1] = U16_MAX; -+ -+ btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); -+ } -+ } else { -+ trace_and_count(c, btree_node_compact, c, b); -+ -+ n1 = 
bch2_btree_node_alloc_replacement(as, trans, b); -+ -+ if (keys) { -+ btree_split_insert_keys(as, trans, path, n1, keys); -+ BUG_ON(!bch2_keylist_empty(keys)); -+ } -+ -+ bch2_btree_build_aux_trees(n1); -+ bch2_btree_update_add_new_node(as, n1); -+ six_unlock_write(&n1->c.lock); -+ -+ path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); -+ six_lock_increment(&n1->c.lock, SIX_LOCK_intent); -+ mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); -+ bch2_btree_path_level_init(trans, path1, n1); -+ -+ if (parent) -+ bch2_keylist_add(&as->parent_keys, &n1->key); -+ } -+ -+ /* New nodes all written, now make them visible: */ -+ -+ if (parent) { -+ /* Split a non root node */ -+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); -+ if (ret) -+ goto err; -+ } else if (n3) { -+ bch2_btree_set_root(as, trans, path, n3); -+ } else { -+ /* Root filled up but didn't need to be split */ -+ bch2_btree_set_root(as, trans, path, n1); -+ } -+ -+ if (n3) { -+ bch2_btree_update_get_open_buckets(as, n3); -+ bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); -+ } -+ if (n2) { -+ bch2_btree_update_get_open_buckets(as, n2); -+ bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); -+ } -+ bch2_btree_update_get_open_buckets(as, n1); -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); -+ -+ /* -+ * The old node must be freed (in memory) _before_ unlocking the new -+ * nodes - else another thread could re-acquire a read lock on the old -+ * node after another thread has locked and updated the new node, thus -+ * seeing stale data: -+ */ -+ bch2_btree_node_free_inmem(trans, path, b); -+ -+ if (n3) -+ bch2_trans_node_add(trans, n3); -+ if (n2) -+ bch2_trans_node_add(trans, n2); -+ bch2_trans_node_add(trans, n1); -+ -+ if (n3) -+ six_unlock_intent(&n3->c.lock); -+ if (n2) -+ six_unlock_intent(&n2->c.lock); -+ six_unlock_intent(&n1->c.lock); -+out: -+ if (path2) { -+ __bch2_btree_path_unlock(trans, path2); -+ bch2_path_put(trans, path2, true); -+ } -+ if (path1) { -+ __bch2_btree_path_unlock(trans, path1); -+ bch2_path_put(trans, path1, true); -+ } -+ -+ bch2_trans_verify_locks(trans); -+ -+ bch2_time_stats_update(&c->times[n2 -+ ? BCH_TIME_btree_node_split -+ : BCH_TIME_btree_node_compact], -+ start_time); -+ return ret; -+err: -+ if (n3) -+ bch2_btree_node_free_never_used(as, trans, n3); -+ if (n2) -+ bch2_btree_node_free_never_used(as, trans, n2); -+ bch2_btree_node_free_never_used(as, trans, n1); -+ goto out; -+} -+ -+static void -+bch2_btree_insert_keys_interior(struct btree_update *as, -+ struct btree_trans *trans, -+ struct btree_path *path, -+ struct btree *b, -+ struct keylist *keys) -+{ -+ struct btree_path *linked; -+ -+ __bch2_btree_insert_keys_interior(as, trans, path, b, -+ path->l[b->c.level].iter, keys); -+ -+ btree_update_updated_node(as, b); -+ -+ trans_for_each_path_with_node(trans, b, linked) -+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); -+ -+ bch2_trans_verify_paths(trans); -+} -+ -+/** -+ * bch_btree_insert_node - insert bkeys into a given btree node -+ * -+ * @iter: btree iterator -+ * @keys: list of keys to insert -+ * @hook: insert callback -+ * @persistent: if not null, @persistent will wait on journal write -+ * -+ * Inserts as many keys as it can into a given btree node, splitting it if full. -+ * If a split occurred, this function will return early. This can only happen -+ * for leaf nodes -- inserts into interior nodes have to be atomic. 
-+ */ -+static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, -+ struct btree_path *path, struct btree *b, -+ struct keylist *keys, unsigned flags) -+{ -+ struct bch_fs *c = as->c; -+ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); -+ int old_live_u64s = b->nr.live_u64s; -+ int live_u64s_added, u64s_added; -+ int ret; -+ -+ lockdep_assert_held(&c->gc_lock); -+ BUG_ON(!btree_node_intent_locked(path, b->c.level)); -+ BUG_ON(!b->c.level); -+ BUG_ON(!as || as->b); -+ bch2_verify_keylist_sorted(keys); -+ -+ ret = bch2_btree_node_lock_write(trans, path, &b->c); -+ if (ret) -+ return ret; -+ -+ bch2_btree_node_prep_for_write(trans, path, b); -+ -+ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { -+ bch2_btree_node_unlock_write(trans, path, b); -+ goto split; -+ } -+ -+ btree_node_interior_verify(c, b); -+ -+ bch2_btree_insert_keys_interior(as, trans, path, b, keys); -+ -+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; -+ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; -+ -+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); -+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); -+ -+ if (u64s_added > live_u64s_added && -+ bch2_maybe_compact_whiteouts(c, b)) -+ bch2_trans_node_reinit_iter(trans, b); -+ -+ bch2_btree_node_unlock_write(trans, path, b); -+ -+ btree_node_interior_verify(c, b); -+ return 0; -+split: -+ /* -+ * We could attempt to avoid the transaction restart, by calling -+ * bch2_btree_path_upgrade() and allocating more nodes: -+ */ -+ if (b->c.level >= as->update_level) { -+ trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); -+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); -+ } -+ -+ return btree_split(as, trans, path, b, keys, flags); -+} -+ -+int bch2_btree_split_leaf(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned flags) -+{ -+ struct btree *b = path_l(path)->b; -+ struct btree_update *as; -+ unsigned l; -+ int ret = 0; -+ -+ as = bch2_btree_update_start(trans, path, path->level, -+ true, flags); -+ if (IS_ERR(as)) -+ return PTR_ERR(as); -+ -+ ret = btree_split(as, trans, path, b, NULL, flags); -+ if (ret) { -+ bch2_btree_update_free(as, trans); -+ return ret; -+ } -+ -+ bch2_btree_update_done(as, trans); -+ -+ for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++) -+ ret = bch2_foreground_maybe_merge(trans, path, l, flags); -+ -+ return ret; -+} -+ -+int __bch2_foreground_maybe_merge(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned level, -+ unsigned flags, -+ enum btree_node_sibling sib) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_path *sib_path = NULL, *new_path = NULL; -+ struct btree_update *as; -+ struct bkey_format_state new_s; -+ struct bkey_format new_f; -+ struct bkey_i delete; -+ struct btree *b, *m, *n, *prev, *next, *parent; -+ struct bpos sib_pos; -+ size_t sib_u64s; -+ u64 start_time = local_clock(); -+ int ret = 0; -+ -+ BUG_ON(!path->should_be_locked); -+ BUG_ON(!btree_node_locked(path, level)); -+ -+ b = path->l[level].b; -+ -+ if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || -+ (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) { -+ b->sib_u64s[sib] = U16_MAX; -+ return 0; -+ } -+ -+ sib_pos = sib == btree_prev_sib -+ ? 
bpos_predecessor(b->data->min_key) -+ : bpos_successor(b->data->max_key); -+ -+ sib_path = bch2_path_get(trans, path->btree_id, sib_pos, -+ U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); -+ ret = bch2_btree_path_traverse(trans, sib_path, false); -+ if (ret) -+ goto err; -+ -+ btree_path_set_should_be_locked(sib_path); -+ -+ m = sib_path->l[level].b; -+ -+ if (btree_node_parent(path, b) != -+ btree_node_parent(sib_path, m)) { -+ b->sib_u64s[sib] = U16_MAX; -+ goto out; -+ } -+ -+ if (sib == btree_prev_sib) { -+ prev = m; -+ next = b; -+ } else { -+ prev = b; -+ next = m; -+ } -+ -+ if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { -+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; -+ -+ bch2_bpos_to_text(&buf1, prev->data->max_key); -+ bch2_bpos_to_text(&buf2, next->data->min_key); -+ bch_err(c, -+ "%s(): btree topology error:\n" -+ " prev ends at %s\n" -+ " next starts at %s", -+ __func__, buf1.buf, buf2.buf); -+ printbuf_exit(&buf1); -+ printbuf_exit(&buf2); -+ bch2_topology_error(c); -+ ret = -EIO; -+ goto err; -+ } -+ -+ bch2_bkey_format_init(&new_s); -+ bch2_bkey_format_add_pos(&new_s, prev->data->min_key); -+ __bch2_btree_calc_format(&new_s, prev); -+ __bch2_btree_calc_format(&new_s, next); -+ bch2_bkey_format_add_pos(&new_s, next->data->max_key); -+ new_f = bch2_bkey_format_done(&new_s); -+ -+ sib_u64s = btree_node_u64s_with_format(b, &new_f) + -+ btree_node_u64s_with_format(m, &new_f); -+ -+ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { -+ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); -+ sib_u64s /= 2; -+ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); -+ } -+ -+ sib_u64s = min(sib_u64s, btree_max_u64s(c)); -+ sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); -+ b->sib_u64s[sib] = sib_u64s; -+ -+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) -+ goto out; -+ -+ parent = btree_node_parent(path, b); -+ as = bch2_btree_update_start(trans, path, level, false, -+ BTREE_INSERT_NOFAIL|flags); -+ ret = PTR_ERR_OR_ZERO(as); -+ if (ret) -+ goto err; -+ -+ trace_and_count(c, btree_node_merge, c, b); -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ bch2_btree_interior_update_will_free_node(as, m); -+ -+ n = bch2_btree_node_alloc(as, trans, b->c.level); -+ -+ SET_BTREE_NODE_SEQ(n->data, -+ max(BTREE_NODE_SEQ(b->data), -+ BTREE_NODE_SEQ(m->data)) + 1); -+ -+ btree_set_min(n, prev->data->min_key); -+ btree_set_max(n, next->data->max_key); -+ -+ n->data->format = new_f; -+ btree_node_set_format(n, new_f); -+ -+ bch2_btree_sort_into(c, n, prev); -+ bch2_btree_sort_into(c, n, next); -+ -+ bch2_btree_build_aux_trees(n); -+ bch2_btree_update_add_new_node(as, n); -+ six_unlock_write(&n->c.lock); -+ -+ new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); -+ six_lock_increment(&n->c.lock, SIX_LOCK_intent); -+ mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); -+ bch2_btree_path_level_init(trans, new_path, n); -+ -+ bkey_init(&delete.k); -+ delete.k.p = prev->key.k.p; -+ bch2_keylist_add(&as->parent_keys, &delete); -+ bch2_keylist_add(&as->parent_keys, &n->key); -+ -+ bch2_trans_verify_paths(trans); -+ -+ ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); -+ if (ret) -+ goto err_free_update; -+ -+ bch2_trans_verify_paths(trans); -+ -+ bch2_btree_update_get_open_buckets(as, n); -+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -+ -+ bch2_btree_node_free_inmem(trans, path, b); -+ bch2_btree_node_free_inmem(trans, sib_path, m); -+ -+ bch2_trans_node_add(trans, n); -+ -+ 
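-+ /*
-+ * Both old nodes have been freed in memory and the new merged node
-+ * added to the transaction; unlock it and finish the interior update.
-+ */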
bch2_trans_verify_paths(trans); -+ -+ six_unlock_intent(&n->c.lock); -+ -+ bch2_btree_update_done(as, trans); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); -+out: -+err: -+ if (new_path) -+ bch2_path_put(trans, new_path, true); -+ bch2_path_put(trans, sib_path, true); -+ bch2_trans_verify_locks(trans); -+ return ret; -+err_free_update: -+ bch2_btree_node_free_never_used(as, trans, n); -+ bch2_btree_update_free(as, trans); -+ goto out; -+} -+ -+/** -+ * bch_btree_node_rewrite - Rewrite/move a btree node -+ */ -+int bch2_btree_node_rewrite(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct btree *b, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_path *new_path = NULL; -+ struct btree *n, *parent; -+ struct btree_update *as; -+ int ret; -+ -+ flags |= BTREE_INSERT_NOFAIL; -+ -+ parent = btree_node_parent(iter->path, b); -+ as = bch2_btree_update_start(trans, iter->path, b->c.level, -+ false, flags); -+ ret = PTR_ERR_OR_ZERO(as); -+ if (ret) -+ goto out; -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ -+ n = bch2_btree_node_alloc_replacement(as, trans, b); -+ -+ bch2_btree_build_aux_trees(n); -+ bch2_btree_update_add_new_node(as, n); -+ six_unlock_write(&n->c.lock); -+ -+ new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); -+ six_lock_increment(&n->c.lock, SIX_LOCK_intent); -+ mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); -+ bch2_btree_path_level_init(trans, new_path, n); -+ -+ trace_and_count(c, btree_node_rewrite, c, b); -+ -+ if (parent) { -+ bch2_keylist_add(&as->parent_keys, &n->key); -+ ret = bch2_btree_insert_node(as, trans, iter->path, parent, -+ &as->parent_keys, flags); -+ if (ret) -+ goto err; -+ } else { -+ bch2_btree_set_root(as, trans, iter->path, n); -+ } -+ -+ bch2_btree_update_get_open_buckets(as, n); -+ bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); -+ -+ bch2_btree_node_free_inmem(trans, iter->path, b); -+ -+ bch2_trans_node_add(trans, n); -+ six_unlock_intent(&n->c.lock); -+ -+ bch2_btree_update_done(as, trans); -+out: -+ if (new_path) -+ bch2_path_put(trans, new_path, true); -+ bch2_btree_path_downgrade(trans, iter->path); -+ return ret; -+err: -+ bch2_btree_node_free_never_used(as, trans, n); -+ bch2_btree_update_free(as, trans); -+ goto out; -+} -+ -+struct async_btree_rewrite { -+ struct bch_fs *c; -+ struct work_struct work; -+ struct list_head list; -+ enum btree_id btree_id; -+ unsigned level; -+ struct bpos pos; -+ __le64 seq; -+}; -+ -+static int async_btree_node_rewrite_trans(struct btree_trans *trans, -+ struct async_btree_rewrite *a) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct btree *b; -+ int ret; -+ -+ bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, -+ BTREE_MAX_DEPTH, a->level, 0); -+ b = bch2_btree_iter_peek_node(&iter); -+ ret = PTR_ERR_OR_ZERO(b); -+ if (ret) -+ goto out; -+ -+ if (!b || b->data->keys.seq != a->seq) { -+ struct printbuf buf = PRINTBUF; -+ -+ if (b) -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); -+ else -+ prt_str(&buf, "(null"); -+ bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s", -+ __func__, a->seq, buf.buf); -+ printbuf_exit(&buf); -+ goto out; -+ } -+ -+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0); -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+static void async_btree_node_rewrite_work(struct work_struct *work) -+{ -+ struct async_btree_rewrite *a = -+ container_of(work, struct 
async_btree_rewrite, work); -+ struct bch_fs *c = a->c; -+ int ret; -+ -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ async_btree_node_rewrite_trans(&trans, a)); -+ if (ret) -+ bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); -+ bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); -+ kfree(a); -+} -+ -+void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) -+{ -+ struct async_btree_rewrite *a; -+ int ret; -+ -+ a = kmalloc(sizeof(*a), GFP_NOFS); -+ if (!a) { -+ bch_err(c, "%s: error allocating memory", __func__); -+ return; -+ } -+ -+ a->c = c; -+ a->btree_id = b->c.btree_id; -+ a->level = b->c.level; -+ a->pos = b->key.k.p; -+ a->seq = b->data->keys.seq; -+ INIT_WORK(&a->work, async_btree_node_rewrite_work); -+ -+ if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { -+ mutex_lock(&c->pending_node_rewrites_lock); -+ list_add(&a->list, &c->pending_node_rewrites); -+ mutex_unlock(&c->pending_node_rewrites_lock); -+ return; -+ } -+ -+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { -+ if (test_bit(BCH_FS_STARTED, &c->flags)) { -+ bch_err(c, "%s: error getting c->writes ref", __func__); -+ kfree(a); -+ return; -+ } -+ -+ ret = bch2_fs_read_write_early(c); -+ if (ret) { -+ bch_err(c, "%s: error going read-write: %s", -+ __func__, bch2_err_str(ret)); -+ kfree(a); -+ return; -+ } -+ -+ bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); -+ } -+ -+ queue_work(c->btree_interior_update_worker, &a->work); -+} -+ -+void bch2_do_pending_node_rewrites(struct bch_fs *c) -+{ -+ struct async_btree_rewrite *a, *n; -+ -+ mutex_lock(&c->pending_node_rewrites_lock); -+ list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { -+ list_del(&a->list); -+ -+ bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); -+ queue_work(c->btree_interior_update_worker, &a->work); -+ } -+ mutex_unlock(&c->pending_node_rewrites_lock); -+} -+ -+void bch2_free_pending_node_rewrites(struct bch_fs *c) -+{ -+ struct async_btree_rewrite *a, *n; -+ -+ mutex_lock(&c->pending_node_rewrites_lock); -+ list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { -+ list_del(&a->list); -+ -+ kfree(a); -+ } -+ mutex_unlock(&c->pending_node_rewrites_lock); -+} -+ -+static int __bch2_btree_node_update_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct btree *b, struct btree *new_hash, -+ struct bkey_i *new_key, -+ unsigned commit_flags, -+ bool skip_triggers) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter2 = { NULL }; -+ struct btree *parent; -+ int ret; -+ -+ if (!skip_triggers) { -+ ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, -+ bkey_i_to_s_c(&b->key), 0); -+ if (ret) -+ return ret; -+ -+ ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1, -+ new_key, 0); -+ if (ret) -+ return ret; -+ } -+ -+ if (new_hash) { -+ bkey_copy(&new_hash->key, new_key); -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, -+ new_hash, b->c.level, b->c.btree_id); -+ BUG_ON(ret); -+ } -+ -+ parent = btree_node_parent(iter->path, b); -+ if (parent) { -+ bch2_trans_copy_iter(&iter2, iter); -+ -+ iter2.path = bch2_btree_path_make_mut(trans, iter2.path, -+ iter2.flags & BTREE_ITER_INTENT, -+ _THIS_IP_); -+ -+ BUG_ON(iter2.path->level != b->c.level); -+ BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p)); -+ -+ btree_path_set_level_up(trans, iter2.path); -+ -+ trans->paths_sorted = false; -+ -+ ret = bch2_btree_iter_traverse(&iter2) ?: -+ bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); -+ if (ret) -+ goto err; -+ } else { -+ BUG_ON(btree_node_root(c, b) 
!= b); -+ -+ ret = darray_make_room(&trans->extra_journal_entries, -+ jset_u64s(new_key->k.u64s)); -+ if (ret) -+ return ret; -+ -+ journal_entry_set((void *) &darray_top(trans->extra_journal_entries), -+ BCH_JSET_ENTRY_btree_root, -+ b->c.btree_id, b->c.level, -+ new_key, new_key->k.u64s); -+ trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); -+ } -+ -+ ret = bch2_trans_commit(trans, NULL, NULL, commit_flags); -+ if (ret) -+ goto err; -+ -+ bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c); -+ -+ if (new_hash) { -+ mutex_lock(&c->btree_cache.lock); -+ bch2_btree_node_hash_remove(&c->btree_cache, new_hash); -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ bkey_copy(&b->key, new_key); -+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -+ BUG_ON(ret); -+ mutex_unlock(&c->btree_cache.lock); -+ } else { -+ bkey_copy(&b->key, new_key); -+ } -+ -+ bch2_btree_node_unlock_write(trans, iter->path, b); -+out: -+ bch2_trans_iter_exit(trans, &iter2); -+ return ret; -+err: -+ if (new_hash) { -+ mutex_lock(&c->btree_cache.lock); -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ mutex_unlock(&c->btree_cache.lock); -+ } -+ goto out; -+} -+ -+int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, -+ struct btree *b, struct bkey_i *new_key, -+ unsigned commit_flags, bool skip_triggers) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *new_hash = NULL; -+ struct btree_path *path = iter->path; -+ struct closure cl; -+ int ret = 0; -+ -+ ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1); -+ if (ret) -+ return ret; -+ -+ closure_init_stack(&cl); -+ -+ /* -+ * check btree_ptr_hash_val() after @b is locked by -+ * btree_iter_traverse(): -+ */ -+ if (btree_ptr_hash_val(new_key) != b->hash_val) { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ if (ret) { -+ ret = drop_locks_do(trans, (closure_sync(&cl), 0)); -+ if (ret) -+ return ret; -+ } -+ -+ new_hash = bch2_btree_node_mem_alloc(trans, false); -+ } -+ -+ path->intent_ref++; -+ ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key, -+ commit_flags, skip_triggers); -+ --path->intent_ref; -+ -+ if (new_hash) { -+ mutex_lock(&c->btree_cache.lock); -+ list_move(&new_hash->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+ -+ six_unlock_write(&new_hash->c.lock); -+ six_unlock_intent(&new_hash->c.lock); -+ } -+ closure_sync(&cl); -+ bch2_btree_cache_cannibalize_unlock(c); -+ return ret; -+} -+ -+int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, -+ struct btree *b, struct bkey_i *new_key, -+ unsigned commit_flags, bool skip_triggers) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, -+ BTREE_MAX_DEPTH, b->c.level, -+ BTREE_ITER_INTENT); -+ ret = bch2_btree_iter_traverse(&iter); -+ if (ret) -+ goto out; -+ -+ /* has node been freed? 
*/ -+ if (iter.path->l[b->c.level].b != b) { -+ /* node has been freed: */ -+ BUG_ON(!btree_node_dying(b)); -+ goto out; -+ } -+ -+ BUG_ON(!btree_node_hashed(b)); -+ -+ ret = bch2_btree_node_update_key(trans, &iter, b, new_key, -+ commit_flags, skip_triggers); -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/* Init code: */ -+ -+/* -+ * Only for filesystem bringup, when first reading the btree roots or allocating -+ * btree roots when initializing a new filesystem: -+ */ -+void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) -+{ -+ BUG_ON(btree_node_root(c, b)); -+ -+ bch2_btree_set_root_inmem(c, b); -+} -+ -+static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) -+{ -+ struct bch_fs *c = trans->c; -+ struct closure cl; -+ struct btree *b; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ -+ b = bch2_btree_node_mem_alloc(trans, false); -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ set_btree_node_fake(b); -+ set_btree_node_need_rewrite(b); -+ b->c.level = 0; -+ b->c.btree_id = id; -+ -+ bkey_btree_ptr_init(&b->key); -+ b->key.k.p = SPOS_MAX; -+ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; -+ -+ bch2_bset_init_first(b, &b->data->keys); -+ bch2_btree_build_aux_trees(b); -+ -+ b->data->flags = 0; -+ btree_set_min(b, POS_MIN); -+ btree_set_max(b, SPOS_MAX); -+ b->data->format = bch2_btree_calc_format(b); -+ btree_node_set_format(b, b->data->format); -+ -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, -+ b->c.level, b->c.btree_id); -+ BUG_ON(ret); -+ -+ bch2_btree_set_root_inmem(c, b); -+ -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ return 0; -+} -+ -+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) -+{ -+ bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id)); -+} -+ -+void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct btree_update *as; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_for_each_entry(as, &c->btree_interior_update_list, list) -+ prt_printf(out, "%p m %u w %u r %u j %llu\n", -+ as, -+ as->mode, -+ as->nodes_written, -+ closure_nr_remaining(&as->cl), -+ as->journal.seq); -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+static bool bch2_btree_interior_updates_pending(struct bch_fs *c) -+{ -+ bool ret; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ ret = !list_empty(&c->btree_interior_update_list); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ return ret; -+} -+ -+bool bch2_btree_interior_updates_flush(struct bch_fs *c) -+{ -+ bool ret = bch2_btree_interior_updates_pending(c); -+ -+ if (ret) -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_pending(c)); -+ return ret; -+} -+ -+void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry) -+{ -+ struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); -+ -+ mutex_lock(&c->btree_root_lock); -+ -+ r->level = entry->level; -+ r->alive = true; -+ bkey_copy(&r->key, &entry->start[0]); -+ -+ mutex_unlock(&c->btree_root_lock); -+} -+ -+struct jset_entry * -+bch2_btree_roots_to_journal_entries(struct bch_fs *c, -+ struct jset_entry *start, -+ struct jset_entry *end) -+{ -+ struct jset_entry *entry; -+ unsigned long have = 0; -+ unsigned i; -+ -+ for (entry = start; entry < end; entry = vstruct_next(entry)) -+ if (entry->type == BCH_JSET_ENTRY_btree_root) -+ __set_bit(entry->btree_id, 
&have); -+ -+ mutex_lock(&c->btree_root_lock); -+ -+ for (i = 0; i < btree_id_nr_alive(c); i++) { -+ struct btree_root *r = bch2_btree_id_root(c, i); -+ -+ if (r->alive && !test_bit(i, &have)) { -+ journal_entry_set(end, BCH_JSET_ENTRY_btree_root, -+ i, r->level, &r->key, r->key.k.u64s); -+ end = vstruct_next(end); -+ } -+ } -+ -+ mutex_unlock(&c->btree_root_lock); -+ -+ return end; -+} -+ -+void bch2_fs_btree_interior_update_exit(struct bch_fs *c) -+{ -+ if (c->btree_interior_update_worker) -+ destroy_workqueue(c->btree_interior_update_worker); -+ mempool_exit(&c->btree_interior_update_pool); -+} -+ -+void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) -+{ -+ mutex_init(&c->btree_reserve_cache_lock); -+ INIT_LIST_HEAD(&c->btree_interior_update_list); -+ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); -+ mutex_init(&c->btree_interior_update_lock); -+ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); -+ -+ INIT_LIST_HEAD(&c->pending_node_rewrites); -+ mutex_init(&c->pending_node_rewrites_lock); -+} -+ -+int bch2_fs_btree_interior_update_init(struct bch_fs *c) -+{ -+ c->btree_interior_update_worker = -+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); -+ if (!c->btree_interior_update_worker) -+ return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; -+ -+ if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, -+ sizeof(struct btree_update))) -+ return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; -+ -+ return 0; -+} -diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h -new file mode 100644 -index 000000000..5e0a467fe ---- /dev/null -+++ b/fs/bcachefs/btree_update_interior.h -@@ -0,0 +1,337 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H -+#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H -+ -+#include "btree_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+ -+void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); -+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, -+ struct bkey_format *); -+ -+#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) -+ -+#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) -+ -+/* -+ * Tracks an in progress split/rewrite of a btree node and the update to the -+ * parent node: -+ * -+ * When we split/rewrite a node, we do all the updates in memory without -+ * waiting for any writes to complete - we allocate the new node(s) and update -+ * the parent node, possibly recursively up to the root. -+ * -+ * The end result is that we have one or more new nodes being written - -+ * possibly several, if there were multiple splits - and then a write (updating -+ * an interior node) which will make all these new nodes visible. -+ * -+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old -+ * nodes can't be freed (their space on disk can't be reclaimed) until the -+ * update to the interior node that makes the new node visible completes - -+ * until then, the old nodes are still reachable on disk. -+ * -+ */ -+struct btree_update { -+ struct closure cl; -+ struct bch_fs *c; -+ u64 start_time; -+ -+ struct list_head list; -+ struct list_head unwritten_list; -+ -+ /* What kind of update are we doing? 
*/ -+ enum { -+ BTREE_INTERIOR_NO_UPDATE, -+ BTREE_INTERIOR_UPDATING_NODE, -+ BTREE_INTERIOR_UPDATING_ROOT, -+ BTREE_INTERIOR_UPDATING_AS, -+ } mode; -+ -+ unsigned nodes_written:1; -+ unsigned took_gc_lock:1; -+ -+ enum btree_id btree_id; -+ unsigned update_level; -+ -+ struct disk_reservation disk_res; -+ struct journal_preres journal_preres; -+ -+ /* -+ * BTREE_INTERIOR_UPDATING_NODE: -+ * The update that made the new nodes visible was a regular update to an -+ * existing interior node - @b. We can't write out the update to @b -+ * until the new nodes we created are finished writing, so we block @b -+ * from writing by putting this btree_interior update on the -+ * @b->write_blocked list with @write_blocked_list: -+ */ -+ struct btree *b; -+ struct list_head write_blocked_list; -+ -+ /* -+ * We may be freeing nodes that were dirty, and thus had journal entries -+ * pinned: we need to transfer the oldest of those pins to the -+ * btree_update operation, and release it when the new node(s) -+ * are all persistent and reachable: -+ */ -+ struct journal_entry_pin journal; -+ -+ /* Preallocated nodes we reserve when we start the update: */ -+ struct prealloc_nodes { -+ struct btree *b[BTREE_UPDATE_NODES_MAX]; -+ unsigned nr; -+ } prealloc_nodes[2]; -+ -+ /* Nodes being freed: */ -+ struct keylist old_keys; -+ u64 _old_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_U64s_MAX]; -+ -+ /* Nodes being added: */ -+ struct keylist new_keys; -+ u64 _new_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_U64s_MAX]; -+ -+ /* New nodes, that will be made reachable by this update: */ -+ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; -+ unsigned nr_new_nodes; -+ -+ struct btree *old_nodes[BTREE_UPDATE_NODES_MAX]; -+ __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX]; -+ unsigned nr_old_nodes; -+ -+ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * -+ BCH_REPLICAS_MAX]; -+ open_bucket_idx_t nr_open_buckets; -+ -+ unsigned journal_u64s; -+ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; -+ -+ /* Only here to reduce stack usage on recursive splits: */ -+ struct keylist parent_keys; -+ /* -+ * Enough room for btree_split's keys without realloc - btree node -+ * pointers never have crc/compression info, so we only need to acount -+ * for the pointers for three keys -+ */ -+ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; -+}; -+ -+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, -+ struct btree_trans *, -+ struct btree *, -+ struct bkey_format); -+ -+int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); -+ -+int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, -+ unsigned, unsigned, enum btree_node_sibling); -+ -+static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned level, unsigned flags, -+ enum btree_node_sibling sib) -+{ -+ struct btree *b; -+ -+ EBUG_ON(!btree_node_locked(path, level)); -+ -+ b = path->l[level].b; -+ if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) -+ return 0; -+ -+ return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); -+} -+ -+static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, -+ struct btree_path *path, -+ unsigned level, -+ unsigned flags) -+{ -+ return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, -+ btree_prev_sib) ?: -+ bch2_foreground_maybe_merge_sibling(trans, path, level, flags, -+ btree_next_sib); -+} -+ -+int bch2_btree_node_rewrite(struct btree_trans *, struct 
btree_iter *, -+ struct btree *, unsigned); -+void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); -+int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, -+ struct btree *, struct bkey_i *, -+ unsigned, bool); -+int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, -+ struct bkey_i *, unsigned, bool); -+ -+void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -+void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); -+ -+static inline unsigned btree_update_reserve_required(struct bch_fs *c, -+ struct btree *b) -+{ -+ unsigned depth = btree_node_root(c, b)->c.level + 1; -+ -+ /* -+ * Number of nodes we might have to allocate in a worst case btree -+ * split operation - we split all the way up to the root, then allocate -+ * a new root, unless we're already at max depth: -+ */ -+ if (depth < BTREE_MAX_DEPTH) -+ return (depth - b->c.level) * 2 + 1; -+ else -+ return (depth - b->c.level) * 2 - 1; -+} -+ -+static inline void btree_node_reset_sib_u64s(struct btree *b) -+{ -+ b->sib_u64s[0] = b->nr.live_u64s; -+ b->sib_u64s[1] = b->nr.live_u64s; -+} -+ -+static inline void *btree_data_end(struct bch_fs *c, struct btree *b) -+{ -+ return (void *) b->data + btree_bytes(c); -+} -+ -+static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, -+ struct btree *b) -+{ -+ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); -+} -+ -+static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, -+ struct btree *b) -+{ -+ return btree_data_end(c, b); -+} -+ -+static inline void *write_block(struct btree *b) -+{ -+ return (void *) b->data + (b->written << 9); -+} -+ -+static inline bool __btree_addr_written(struct btree *b, void *p) -+{ -+ return p < write_block(b); -+} -+ -+static inline bool bset_written(struct btree *b, struct bset *i) -+{ -+ return __btree_addr_written(b, i); -+} -+ -+static inline bool bkey_written(struct btree *b, struct bkey_packed *k) -+{ -+ return __btree_addr_written(b, k); -+} -+ -+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, -+ struct btree *b, -+ void *end) -+{ -+ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + -+ b->whiteout_u64s; -+ ssize_t total = c->opts.btree_node_size >> 3; -+ -+ /* Always leave one extra u64 for bch2_varint_decode: */ -+ used++; -+ -+ return total - used; -+} -+ -+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, -+ struct btree *b) -+{ -+ ssize_t remaining = __bch_btree_u64s_remaining(c, b, -+ btree_bkey_last(b, bset_tree_last(b))); -+ -+ BUG_ON(remaining < 0); -+ -+ if (bset_written(b, btree_bset_last(b))) -+ return 0; -+ -+ return remaining; -+} -+ -+#define BTREE_WRITE_SET_U64s_BITS 9 -+ -+static inline unsigned btree_write_set_buffer(struct btree *b) -+{ -+ /* -+ * Could buffer up larger amounts of keys for btrees with larger keys, -+ * pending benchmarking: -+ */ -+ return 8 << BTREE_WRITE_SET_U64s_BITS; -+} -+ -+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, -+ struct btree *b) -+{ -+ struct bset_tree *t = bset_tree_last(b); -+ struct btree_node_entry *bne = max(write_block(b), -+ (void *) btree_bkey_last(b, bset_tree_last(b))); -+ ssize_t remaining_space = -+ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); -+ -+ if (unlikely(bset_written(b, bset(b, t)))) { -+ if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) -+ return bne; -+ } else { -+ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && -+ remaining_space > (ssize_t) 
(btree_write_set_buffer(b) >> 3)) -+ return bne; -+ } -+ -+ return NULL; -+} -+ -+static inline void push_whiteout(struct bch_fs *c, struct btree *b, -+ struct bpos pos) -+{ -+ struct bkey_packed k; -+ -+ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); -+ EBUG_ON(btree_node_just_written(b)); -+ -+ if (!bkey_pack_pos(&k, pos, b)) { -+ struct bkey *u = (void *) &k; -+ -+ bkey_init(u); -+ u->p = pos; -+ } -+ -+ k.needs_whiteout = true; -+ -+ b->whiteout_u64s += k.u64s; -+ bkey_copy(unwritten_whiteouts_start(c, b), &k); -+} -+ -+/* -+ * write lock must be held on @b (else the dirty bset that we were going to -+ * insert into could be written out from under us) -+ */ -+static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, -+ struct btree *b, unsigned u64s) -+{ -+ if (unlikely(btree_node_need_rewrite(b))) -+ return false; -+ -+ return u64s <= bch_btree_keys_u64s_remaining(c, b); -+} -+ -+void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); -+ -+bool bch2_btree_interior_updates_flush(struct bch_fs *); -+ -+void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); -+struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, -+ struct jset_entry *, struct jset_entry *); -+ -+void bch2_do_pending_node_rewrites(struct bch_fs *); -+void bch2_free_pending_node_rewrites(struct bch_fs *); -+ -+void bch2_fs_btree_interior_update_exit(struct bch_fs *); -+void bch2_fs_btree_interior_update_init_early(struct bch_fs *); -+int bch2_fs_btree_interior_update_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ -diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c -new file mode 100644 -index 000000000..6d2d43b6f ---- /dev/null -+++ b/fs/bcachefs/btree_write_buffer.c -@@ -0,0 +1,375 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_write_buffer.h" -+#include "error.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+ -+#include -+ -+static int btree_write_buffered_key_cmp(const void *_l, const void *_r) -+{ -+ const struct btree_write_buffered_key *l = _l; -+ const struct btree_write_buffered_key *r = _r; -+ -+ return cmp_int(l->btree, r->btree) ?: -+ bpos_cmp(l->k.k.p, r->k.k.p) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->journal_offset, r->journal_offset); -+} -+ -+static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) -+{ -+ const struct btree_write_buffered_key *l = _l; -+ const struct btree_write_buffered_key *r = _r; -+ -+ return cmp_int(l->journal_seq, r->journal_seq); -+} -+ -+static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct btree_write_buffered_key *wb, -+ unsigned commit_flags, -+ bool *write_locked, -+ size_t *fast) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_path *path; -+ int ret; -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return ret; -+ -+ path = iter->path; -+ -+ if (!*write_locked) { -+ ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); -+ if (ret) -+ return ret; -+ -+ bch2_btree_node_prep_for_write(trans, path, path->l[0].b); -+ *write_locked = true; -+ } -+ -+ if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) { -+ bch2_btree_node_unlock_write(trans, path, path->l[0].b); -+ *write_locked = false; -+ goto trans_commit; -+ } -+ -+ bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); 
-+ (*fast)++; -+ -+ if (path->ref > 1) { -+ /* -+ * We can't clone a path that has write locks: if the path is -+ * shared, unlock before set_pos(), traverse(): -+ */ -+ bch2_btree_node_unlock_write(trans, path, path->l[0].b); -+ *write_locked = false; -+ } -+ return 0; -+trans_commit: -+ return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ commit_flags| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_JOURNAL_RECLAIM); -+} -+ -+static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) -+{ -+ union btree_write_buffer_state old, new; -+ u64 v = READ_ONCE(wb->state.v); -+ -+ do { -+ old.v = new.v = v; -+ -+ new.nr = 0; -+ new.idx++; -+ } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); -+ -+ while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1) -+ cpu_relax(); -+ -+ smp_mb(); -+ -+ return old; -+} -+ -+/* -+ * Update a btree with a write buffered key using the journal seq of the -+ * original write buffer insert. -+ * -+ * It is not safe to rejournal the key once it has been inserted into the write -+ * buffer because that may break recovery ordering. For example, the key may -+ * have already been modified in the active write buffer in a seq that comes -+ * before the current transaction. If we were to journal this key again and -+ * crash, recovery would process updates in the wrong order. -+ */ -+static int -+btree_write_buffered_insert(struct btree_trans *trans, -+ struct btree_write_buffered_key *wb) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k), -+ BTREE_ITER_CACHED|BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags, -+ bool locked) -+{ -+ struct bch_fs *c = trans->c; -+ struct journal *j = &c->journal; -+ struct btree_write_buffer *wb = &c->btree_write_buffer; -+ struct journal_entry_pin pin; -+ struct btree_write_buffered_key *i, *keys; -+ struct btree_iter iter = { NULL }; -+ size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; -+ bool write_locked = false; -+ union btree_write_buffer_state s; -+ int ret = 0; -+ -+ memset(&pin, 0, sizeof(pin)); -+ -+ if (!locked && !mutex_trylock(&wb->flush_lock)) -+ return 0; -+ -+ bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL); -+ bch2_journal_pin_drop(j, &wb->journal_pin); -+ -+ s = btree_write_buffer_switch(wb); -+ keys = wb->keys[s.idx]; -+ nr = s.nr; -+ -+ if (race_fault()) -+ goto slowpath; -+ -+ /* -+ * We first sort so that we can detect and skip redundant updates, and -+ * then we attempt to flush in sorted btree order, as this is most -+ * efficient. -+ * -+ * However, since we're not flushing in the order they appear in the -+ * journal we won't be able to drop our journal pin until everything is -+ * flushed - which means this could deadlock the journal if we weren't -+ * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail -+ * if it would block taking a journal reservation. -+ * -+ * If that happens, simply skip the key so we can optimistically insert -+ * as many keys as possible in the fast path. 
-+ */ -+ sort(keys, nr, sizeof(keys[0]), -+ btree_write_buffered_key_cmp, NULL); -+ -+ for (i = keys; i < keys + nr; i++) { -+ if (i + 1 < keys + nr && -+ i[0].btree == i[1].btree && -+ bpos_eq(i[0].k.k.p, i[1].k.k.p)) { -+ skipped++; -+ i->journal_seq = 0; -+ continue; -+ } -+ -+ if (write_locked && -+ (iter.path->btree_id != i->btree || -+ bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { -+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); -+ write_locked = false; -+ } -+ -+ if (!iter.path || iter.path->btree_id != i->btree) { -+ bch2_trans_iter_exit(trans, &iter); -+ bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, -+ BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS); -+ } -+ -+ bch2_btree_iter_set_pos(&iter, i->k.k.p); -+ iter.path->preserve = false; -+ -+ do { -+ ret = bch2_btree_write_buffer_flush_one(trans, &iter, i, -+ commit_flags, &write_locked, &fast); -+ if (!write_locked) -+ bch2_trans_begin(trans); -+ } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); -+ -+ if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { -+ slowpath++; -+ continue; -+ } -+ if (ret) -+ break; -+ -+ i->journal_seq = 0; -+ } -+ -+ if (write_locked) -+ bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); -+ bch2_trans_iter_exit(trans, &iter); -+ -+ trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); -+ -+ if (slowpath) -+ goto slowpath; -+ -+ bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); -+out: -+ bch2_journal_pin_drop(j, &pin); -+ mutex_unlock(&wb->flush_lock); -+ return ret; -+slowpath: -+ trace_write_buffer_flush_slowpath(trans, i - keys, nr); -+ -+ /* -+ * Now sort the rest by journal seq and bump the journal pin as we go. -+ * The slowpath zapped the seq of keys that were successfully flushed so -+ * we can skip those here. 
-+ */ -+ sort(keys, nr, sizeof(keys[0]), -+ btree_write_buffered_journal_cmp, -+ NULL); -+ -+ commit_flags &= ~BCH_WATERMARK_MASK; -+ commit_flags |= BCH_WATERMARK_reclaim; -+ -+ for (i = keys; i < keys + nr; i++) { -+ if (!i->journal_seq) -+ continue; -+ -+ if (i->journal_seq > pin.seq) { -+ struct journal_entry_pin pin2; -+ -+ memset(&pin2, 0, sizeof(pin2)); -+ -+ bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL); -+ bch2_journal_pin_drop(j, &pin); -+ bch2_journal_pin_copy(j, &pin, &pin2, NULL); -+ bch2_journal_pin_drop(j, &pin2); -+ } -+ -+ ret = commit_do(trans, NULL, NULL, -+ commit_flags| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_JOURNAL_RECLAIM, -+ btree_write_buffered_insert(trans, i)); -+ if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) -+ break; -+ } -+ -+ goto out; -+} -+ -+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) -+{ -+ bch2_trans_unlock(trans); -+ mutex_lock(&trans->c->btree_write_buffer.flush_lock); -+ return __bch2_btree_write_buffer_flush(trans, 0, true); -+} -+ -+int bch2_btree_write_buffer_flush(struct btree_trans *trans) -+{ -+ return __bch2_btree_write_buffer_flush(trans, 0, false); -+} -+ -+static int bch2_btree_write_buffer_journal_flush(struct journal *j, -+ struct journal_entry_pin *_pin, u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct btree_write_buffer *wb = &c->btree_write_buffer; -+ -+ mutex_lock(&wb->flush_lock); -+ -+ return bch2_trans_run(c, -+ __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true)); -+} -+ -+static inline u64 btree_write_buffer_ref(int idx) -+{ -+ return ((union btree_write_buffer_state) { -+ .ref0 = idx == 0, -+ .ref1 = idx == 1, -+ }).v; -+} -+ -+int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_write_buffer *wb = &c->btree_write_buffer; -+ struct btree_write_buffered_key *i; -+ union btree_write_buffer_state old, new; -+ int ret = 0; -+ u64 v; -+ -+ trans_for_each_wb_update(trans, i) { -+ EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); -+ -+ i->journal_seq = trans->journal_res.seq; -+ i->journal_offset = trans->journal_res.offset; -+ } -+ -+ preempt_disable(); -+ v = READ_ONCE(wb->state.v); -+ do { -+ old.v = new.v = v; -+ -+ new.v += btree_write_buffer_ref(new.idx); -+ new.nr += trans->nr_wb_updates; -+ if (new.nr > wb->size) { -+ ret = -BCH_ERR_btree_insert_need_flush_buffer; -+ goto out; -+ } -+ } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); -+ -+ memcpy(wb->keys[new.idx] + old.nr, -+ trans->wb_updates, -+ sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); -+ -+ bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, -+ bch2_btree_write_buffer_journal_flush); -+ -+ atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); -+out: -+ preempt_enable(); -+ return ret; -+} -+ -+void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) -+{ -+ struct btree_write_buffer *wb = &c->btree_write_buffer; -+ -+ BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); -+ -+ kvfree(wb->keys[1]); -+ kvfree(wb->keys[0]); -+} -+ -+int bch2_fs_btree_write_buffer_init(struct bch_fs *c) -+{ -+ struct btree_write_buffer *wb = &c->btree_write_buffer; -+ -+ mutex_init(&wb->flush_lock); -+ wb->size = c->opts.btree_write_buffer_size; -+ -+ wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); -+ wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); -+ if 
(!wb->keys[0] || !wb->keys[1]) -+ return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; -+ -+ return 0; -+} -diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h -new file mode 100644 -index 000000000..322df1c83 ---- /dev/null -+++ b/fs/bcachefs/btree_write_buffer.h -@@ -0,0 +1,14 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H -+#define _BCACHEFS_BTREE_WRITE_BUFFER_H -+ -+int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool); -+int bch2_btree_write_buffer_flush_sync(struct btree_trans *); -+int bch2_btree_write_buffer_flush(struct btree_trans *); -+ -+int bch2_btree_insert_keys_write_buffer(struct btree_trans *); -+ -+void bch2_fs_btree_write_buffer_exit(struct bch_fs *); -+int bch2_fs_btree_write_buffer_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ -diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h -new file mode 100644 -index 000000000..99993ba77 ---- /dev/null -+++ b/fs/bcachefs/btree_write_buffer_types.h -@@ -0,0 +1,44 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H -+#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H -+ -+#include "journal_types.h" -+ -+#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 -+#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) -+ -+struct btree_write_buffered_key { -+ u64 journal_seq; -+ unsigned journal_offset; -+ enum btree_id btree; -+ __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); -+}; -+ -+union btree_write_buffer_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ u64 nr:23; -+ u64 idx:1; -+ u64 ref0:20; -+ u64 ref1:20; -+ }; -+}; -+ -+struct btree_write_buffer { -+ struct mutex flush_lock; -+ struct journal_entry_pin journal_pin; -+ -+ union btree_write_buffer_state state; -+ size_t size; -+ -+ struct btree_write_buffered_key *keys[2]; -+}; -+ -+#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ -diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c -new file mode 100644 -index 000000000..c02c8c917 ---- /dev/null -+++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2107 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for manipulating bucket marks for garbage collection. -+ * -+ * Copyright 2014 Datera, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "backpointers.h" -+#include "bset.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "buckets_waiting_for_journal.h" -+#include "ec.h" -+#include "error.h" -+#include "inode.h" -+#include "movinggc.h" -+#include "recovery.h" -+#include "reflink.h" -+#include "replicas.h" -+#include "subvolume.h" -+#include "trace.h" -+ -+#include -+ -+static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, -+ enum bch_data_type data_type, -+ s64 sectors) -+{ -+ switch (data_type) { -+ case BCH_DATA_btree: -+ fs_usage->btree += sectors; -+ break; -+ case BCH_DATA_user: -+ case BCH_DATA_parity: -+ fs_usage->data += sectors; -+ break; -+ case BCH_DATA_cached: -+ fs_usage->cached += sectors; -+ break; -+ default: -+ break; -+ } -+} -+ -+void bch2_fs_usage_initialize(struct bch_fs *c) -+{ -+ struct bch_fs_usage *usage; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ percpu_down_write(&c->mark_lock); -+ usage = c->usage_base; -+ -+ for (i = 0; i < ARRAY_SIZE(c->usage); i++) -+ bch2_fs_usage_acc_to_base(c, i); -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ usage->reserved += usage->persistent_reserved[i]; -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); -+ } -+ -+ for_each_member_device(ca, c, i) { -+ struct bch_dev_usage dev = bch2_dev_usage_read(ca); -+ -+ usage->hidden += (dev.d[BCH_DATA_sb].buckets + -+ dev.d[BCH_DATA_journal].buckets) * -+ ca->mi.bucket_size; -+ } -+ -+ percpu_up_write(&c->mark_lock); -+} -+ -+static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, -+ unsigned journal_seq, -+ bool gc) -+{ -+ BUG_ON(!gc && !journal_seq); -+ -+ return this_cpu_ptr(gc -+ ? 
ca->usage_gc -+ : ca->usage[journal_seq & JOURNAL_BUF_MASK]); -+} -+ -+void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) -+{ -+ struct bch_fs *c = ca->fs; -+ unsigned seq, i, u64s = dev_usage_u64s(); -+ -+ do { -+ seq = read_seqcount_begin(&c->usage_lock); -+ memcpy(usage, ca->usage_base, u64s * sizeof(u64)); -+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) -+ acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); -+ } while (read_seqcount_retry(&c->usage_lock, seq)); -+} -+ -+u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) -+{ -+ ssize_t offset = v - (u64 *) c->usage_base; -+ unsigned i, seq; -+ u64 ret; -+ -+ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ do { -+ seq = read_seqcount_begin(&c->usage_lock); -+ ret = *v; -+ -+ for (i = 0; i < ARRAY_SIZE(c->usage); i++) -+ ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); -+ } while (read_seqcount_retry(&c->usage_lock, seq)); -+ -+ return ret; -+} -+ -+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) -+{ -+ struct bch_fs_usage_online *ret; -+ unsigned nr_replicas = READ_ONCE(c->replicas.nr); -+ unsigned seq, i; -+retry: -+ ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL); -+ if (unlikely(!ret)) -+ return NULL; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ if (nr_replicas != c->replicas.nr) { -+ nr_replicas = c->replicas.nr; -+ percpu_up_read(&c->mark_lock); -+ kfree(ret); -+ goto retry; -+ } -+ -+ ret->online_reserved = percpu_u64_get(c->online_reserved); -+ -+ do { -+ seq = read_seqcount_begin(&c->usage_lock); -+ unsafe_memcpy(&ret->u, c->usage_base, -+ __fs_usage_u64s(nr_replicas) * sizeof(u64), -+ "embedded variable length struct"); -+ for (i = 0; i < ARRAY_SIZE(c->usage); i++) -+ acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], -+ __fs_usage_u64s(nr_replicas)); -+ } while (read_seqcount_retry(&c->usage_lock, seq)); -+ -+ return ret; -+} -+ -+void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) -+{ -+ struct bch_dev *ca; -+ unsigned i, u64s = fs_usage_u64s(c); -+ -+ BUG_ON(idx >= ARRAY_SIZE(c->usage)); -+ -+ preempt_disable(); -+ write_seqcount_begin(&c->usage_lock); -+ -+ acc_u64s_percpu((u64 *) c->usage_base, -+ (u64 __percpu *) c->usage[idx], u64s); -+ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, NULL) { -+ u64s = dev_usage_u64s(); -+ -+ acc_u64s_percpu((u64 *) ca->usage_base, -+ (u64 __percpu *) ca->usage[idx], u64s); -+ percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); -+ } -+ rcu_read_unlock(); -+ -+ write_seqcount_end(&c->usage_lock); -+ preempt_enable(); -+} -+ -+void bch2_fs_usage_to_text(struct printbuf *out, -+ struct bch_fs *c, -+ struct bch_fs_usage_online *fs_usage) -+{ -+ unsigned i; -+ -+ prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); -+ -+ prt_printf(out, "hidden:\t\t\t\t%llu\n", -+ fs_usage->u.hidden); -+ prt_printf(out, "data:\t\t\t\t%llu\n", -+ fs_usage->u.data); -+ prt_printf(out, "cached:\t\t\t\t%llu\n", -+ fs_usage->u.cached); -+ prt_printf(out, "reserved:\t\t\t%llu\n", -+ fs_usage->u.reserved); -+ prt_printf(out, "nr_inodes:\t\t\t%llu\n", -+ fs_usage->u.nr_inodes); -+ prt_printf(out, "online reserved:\t\t%llu\n", -+ fs_usage->online_reserved); -+ -+ for (i = 0; -+ i < ARRAY_SIZE(fs_usage->u.persistent_reserved); -+ i++) { -+ prt_printf(out, "%u replicas:\n", i + 1); -+ prt_printf(out, "\treserved:\t\t%llu\n", -+ fs_usage->u.persistent_reserved[i]); -+ } -+ -+ 
for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ prt_printf(out, "\t"); -+ bch2_replicas_entry_to_text(out, e); -+ prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); -+ } -+} -+ -+static u64 reserve_factor(u64 r) -+{ -+ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); -+} -+ -+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) -+{ -+ return min(fs_usage->u.hidden + -+ fs_usage->u.btree + -+ fs_usage->u.data + -+ reserve_factor(fs_usage->u.reserved + -+ fs_usage->online_reserved), -+ c->capacity); -+} -+ -+static struct bch_fs_usage_short -+__bch2_fs_usage_read_short(struct bch_fs *c) -+{ -+ struct bch_fs_usage_short ret; -+ u64 data, reserved; -+ -+ ret.capacity = c->capacity - -+ bch2_fs_usage_read_one(c, &c->usage_base->hidden); -+ -+ data = bch2_fs_usage_read_one(c, &c->usage_base->data) + -+ bch2_fs_usage_read_one(c, &c->usage_base->btree); -+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + -+ percpu_u64_get(c->online_reserved); -+ -+ ret.used = min(ret.capacity, data + reserve_factor(reserved)); -+ ret.free = ret.capacity - ret.used; -+ -+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); -+ -+ return ret; -+} -+ -+struct bch_fs_usage_short -+bch2_fs_usage_read_short(struct bch_fs *c) -+{ -+ struct bch_fs_usage_short ret; -+ -+ percpu_down_read(&c->mark_lock); -+ ret = __bch2_fs_usage_read_short(c); -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+void bch2_dev_usage_init(struct bch_dev *ca) -+{ -+ ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; -+} -+ -+static inline int bucket_sectors_fragmented(struct bch_dev *ca, -+ struct bch_alloc_v4 a) -+{ -+ return a.dirty_sectors -+ ? 
max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) -+ : 0; -+} -+ -+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, -+ struct bch_alloc_v4 old, -+ struct bch_alloc_v4 new, -+ u64 journal_seq, bool gc) -+{ -+ struct bch_fs_usage *fs_usage; -+ struct bch_dev_usage *u; -+ -+ preempt_disable(); -+ fs_usage = fs_usage_ptr(c, journal_seq, gc); -+ -+ if (data_type_is_hidden(old.data_type)) -+ fs_usage->hidden -= ca->mi.bucket_size; -+ if (data_type_is_hidden(new.data_type)) -+ fs_usage->hidden += ca->mi.bucket_size; -+ -+ u = dev_usage_ptr(ca, journal_seq, gc); -+ -+ u->d[old.data_type].buckets--; -+ u->d[new.data_type].buckets++; -+ -+ u->buckets_ec -= (int) !!old.stripe; -+ u->buckets_ec += (int) !!new.stripe; -+ -+ u->d[old.data_type].sectors -= old.dirty_sectors; -+ u->d[new.data_type].sectors += new.dirty_sectors; -+ -+ u->d[BCH_DATA_cached].sectors += new.cached_sectors; -+ u->d[BCH_DATA_cached].sectors -= old.cached_sectors; -+ -+ u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); -+ u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); -+ -+ preempt_enable(); -+} -+ -+static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, -+ struct bucket old, struct bucket new, -+ u64 journal_seq, bool gc) -+{ -+ struct bch_alloc_v4 old_a = { -+ .gen = old.gen, -+ .data_type = old.data_type, -+ .dirty_sectors = old.dirty_sectors, -+ .cached_sectors = old.cached_sectors, -+ .stripe = old.stripe, -+ }; -+ struct bch_alloc_v4 new_a = { -+ .gen = new.gen, -+ .data_type = new.data_type, -+ .dirty_sectors = new.dirty_sectors, -+ .cached_sectors = new.cached_sectors, -+ .stripe = new.stripe, -+ }; -+ -+ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); -+} -+ -+static inline int __update_replicas(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ struct bch_replicas_entry *r, -+ s64 sectors) -+{ -+ int idx = bch2_replicas_entry_idx(c, r); -+ -+ if (idx < 0) -+ return -1; -+ -+ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); -+ fs_usage->replicas[idx] += sectors; -+ return 0; -+} -+ -+static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, -+ struct bch_replicas_entry *r, s64 sectors, -+ unsigned journal_seq, bool gc) -+{ -+ struct bch_fs_usage *fs_usage; -+ int idx, ret = 0; -+ struct printbuf buf = PRINTBUF; -+ -+ percpu_down_read(&c->mark_lock); -+ buf.atomic++; -+ -+ idx = bch2_replicas_entry_idx(c, r); -+ if (idx < 0 && -+ fsck_err(c, "no replicas entry\n" -+ " while marking %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ percpu_up_read(&c->mark_lock); -+ ret = bch2_mark_replicas(c, r); -+ percpu_down_read(&c->mark_lock); -+ -+ if (ret) -+ goto err; -+ idx = bch2_replicas_entry_idx(c, r); -+ } -+ if (idx < 0) { -+ ret = -1; -+ goto err; -+ } -+ -+ preempt_disable(); -+ fs_usage = fs_usage_ptr(c, journal_seq, gc); -+ fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); -+ fs_usage->replicas[idx] += sectors; -+ preempt_enable(); -+err: -+fsck_err: -+ percpu_up_read(&c->mark_lock); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static inline int update_cached_sectors(struct bch_fs *c, -+ struct bkey_s_c k, -+ unsigned dev, s64 sectors, -+ unsigned journal_seq, bool gc) -+{ -+ struct bch_replicas_padded r; -+ -+ bch2_replicas_entry_cached(&r.e, dev); -+ -+ return update_replicas(c, k, &r.e, sectors, journal_seq, gc); -+} -+ -+static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, -+ gfp_t gfp) -+{ -+ struct replicas_delta_list *d = 
trans->fs_usage_deltas; -+ unsigned new_size = d ? (d->size + more) * 2 : 128; -+ unsigned alloc_size = sizeof(*d) + new_size; -+ -+ WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); -+ -+ if (!d || d->used + more > d->size) { -+ d = krealloc(d, alloc_size, gfp|__GFP_ZERO); -+ -+ if (unlikely(!d)) { -+ if (alloc_size > REPLICAS_DELTA_LIST_MAX) -+ return -ENOMEM; -+ -+ d = mempool_alloc(&trans->c->replicas_delta_pool, gfp); -+ if (!d) -+ return -ENOMEM; -+ -+ memset(d, 0, REPLICAS_DELTA_LIST_MAX); -+ -+ if (trans->fs_usage_deltas) -+ memcpy(d, trans->fs_usage_deltas, -+ trans->fs_usage_deltas->size + sizeof(*d)); -+ -+ new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); -+ kfree(trans->fs_usage_deltas); -+ } -+ -+ d->size = new_size; -+ trans->fs_usage_deltas = d; -+ } -+ -+ return 0; -+} -+ -+int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more) -+{ -+ return allocate_dropping_locks_errcode(trans, -+ __replicas_deltas_realloc(trans, more, _gfp)); -+} -+ -+static inline int update_replicas_list(struct btree_trans *trans, -+ struct bch_replicas_entry *r, -+ s64 sectors) -+{ -+ struct replicas_delta_list *d; -+ struct replicas_delta *n; -+ unsigned b; -+ int ret; -+ -+ if (!sectors) -+ return 0; -+ -+ b = replicas_entry_bytes(r) + 8; -+ ret = bch2_replicas_deltas_realloc(trans, b); -+ if (ret) -+ return ret; -+ -+ d = trans->fs_usage_deltas; -+ n = (void *) d->d + d->used; -+ n->delta = sectors; -+ memcpy((void *) n + offsetof(struct replicas_delta, r), -+ r, replicas_entry_bytes(r)); -+ bch2_replicas_entry_sort(&n->r); -+ d->used += b; -+ return 0; -+} -+ -+static inline int update_cached_sectors_list(struct btree_trans *trans, -+ unsigned dev, s64 sectors) -+{ -+ struct bch_replicas_padded r; -+ -+ bch2_replicas_entry_cached(&r.e, dev); -+ -+ return update_replicas_list(trans, &r.e, sectors); -+} -+ -+int bch2_mark_alloc(struct btree_trans *trans, -+ enum btree_id btree, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ u64 journal_seq = trans->journal_res.seq; -+ u64 bucket_journal_seq; -+ struct bch_fs *c = trans->c; -+ struct bch_alloc_v4 old_a_convert, new_a_convert; -+ const struct bch_alloc_v4 *old_a, *new_a; -+ struct bch_dev *ca; -+ int ret = 0; -+ -+ /* -+ * alloc btree is read in by bch2_alloc_read, not gc: -+ */ -+ if ((flags & BTREE_TRIGGER_GC) && -+ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) -+ return 0; -+ -+ if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, -+ "alloc key for invalid device or bucket")) -+ return -EIO; -+ -+ ca = bch_dev_bkey_exists(c, new.k->p.inode); -+ -+ old_a = bch2_alloc_to_v4(old, &old_a_convert); -+ new_a = bch2_alloc_to_v4(new, &new_a_convert); -+ -+ bucket_journal_seq = new_a->journal_seq; -+ -+ if ((flags & BTREE_TRIGGER_INSERT) && -+ data_type_is_empty(old_a->data_type) != -+ data_type_is_empty(new_a->data_type) && -+ new.k->type == KEY_TYPE_alloc_v4) { -+ struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; -+ -+ EBUG_ON(!journal_seq); -+ -+ /* -+ * If the btree updates referring to a bucket weren't flushed -+ * before the bucket became empty again, then the we don't have -+ * to wait on a journal flush before we can reuse the bucket: -+ */ -+ v->journal_seq = bucket_journal_seq = -+ data_type_is_empty(new_a->data_type) && -+ (journal_seq == v->journal_seq || -+ bch2_journal_noflush_seq(&c->journal, v->journal_seq)) -+ ? 
0 : journal_seq; -+ } -+ -+ if (!data_type_is_empty(old_a->data_type) && -+ data_type_is_empty(new_a->data_type) && -+ bucket_journal_seq) { -+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, -+ c->journal.flushed_seq_ondisk, -+ new.k->p.inode, new.k->p.offset, -+ bucket_journal_seq); -+ if (ret) { -+ bch2_fs_fatal_error(c, -+ "error setting bucket_needs_journal_commit: %i", ret); -+ return ret; -+ } -+ } -+ -+ percpu_down_read(&c->mark_lock); -+ if (!gc && new_a->gen != old_a->gen) -+ *bucket_gen(ca, new.k->p.offset) = new_a->gen; -+ -+ bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc); -+ -+ if (gc) { -+ struct bucket *g = gc_bucket(ca, new.k->p.offset); -+ -+ bucket_lock(g); -+ -+ g->gen_valid = 1; -+ g->gen = new_a->gen; -+ g->data_type = new_a->data_type; -+ g->stripe = new_a->stripe; -+ g->stripe_redundancy = new_a->stripe_redundancy; -+ g->dirty_sectors = new_a->dirty_sectors; -+ g->cached_sectors = new_a->cached_sectors; -+ -+ bucket_unlock(g); -+ } -+ percpu_up_read(&c->mark_lock); -+ -+ /* -+ * need to know if we're getting called from the invalidate path or -+ * not: -+ */ -+ -+ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && -+ old_a->cached_sectors) { -+ ret = update_cached_sectors(c, new, ca->dev_idx, -+ -((s64) old_a->cached_sectors), -+ journal_seq, gc); -+ if (ret) { -+ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", -+ __func__); -+ return ret; -+ } -+ } -+ -+ if (new_a->data_type == BCH_DATA_free && -+ (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) -+ closure_wake_up(&c->freelist_wait); -+ -+ if (new_a->data_type == BCH_DATA_need_discard && -+ (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) -+ bch2_do_discards(c); -+ -+ if (old_a->data_type != BCH_DATA_cached && -+ new_a->data_type == BCH_DATA_cached && -+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) -+ bch2_do_invalidates(c); -+ -+ if (new_a->data_type == BCH_DATA_need_gc_gens) -+ bch2_do_gc_gens(c); -+ -+ return 0; -+} -+ -+int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, enum bch_data_type data_type, -+ unsigned sectors, struct gc_pos pos, -+ unsigned flags) -+{ -+ struct bucket old, new, *g; -+ int ret = 0; -+ -+ BUG_ON(!(flags & BTREE_TRIGGER_GC)); -+ BUG_ON(data_type != BCH_DATA_sb && -+ data_type != BCH_DATA_journal); -+ -+ /* -+ * Backup superblock might be past the end of our normal usable space: -+ */ -+ if (b >= ca->mi.nbuckets) -+ return 0; -+ -+ percpu_down_read(&c->mark_lock); -+ g = gc_bucket(ca, b); -+ -+ bucket_lock(g); -+ old = *g; -+ -+ if (bch2_fs_inconsistent_on(g->data_type && -+ g->data_type != data_type, c, -+ "different types of data in same bucket: %s, %s", -+ bch2_data_types[g->data_type], -+ bch2_data_types[data_type])) { -+ ret = -EIO; -+ goto err; -+ } -+ -+ if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, -+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", -+ ca->dev_idx, b, g->gen, -+ bch2_data_types[g->data_type ?: data_type], -+ g->dirty_sectors, sectors)) { -+ ret = -EIO; -+ goto err; -+ } -+ -+ -+ g->data_type = data_type; -+ g->dirty_sectors += sectors; -+ new = *g; -+err: -+ bucket_unlock(g); -+ if (!ret) -+ bch2_dev_usage_update_m(c, ca, old, new, 0, true); -+ percpu_up_read(&c->mark_lock); -+ return ret; -+} -+ -+static int check_bucket_ref(struct btree_trans *trans, -+ struct bkey_s_c k, -+ const struct bch_extent_ptr *ptr, -+ s64 sectors, 
enum bch_data_type ptr_data_type, -+ u8 b_gen, u8 bucket_data_type, -+ u32 dirty_sectors, u32 cached_sectors) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); -+ u16 bucket_sectors = !ptr->cached -+ ? dirty_sectors -+ : cached_sectors; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ if (bucket_data_type == BCH_DATA_cached) -+ bucket_data_type = BCH_DATA_user; -+ -+ if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || -+ (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) -+ bucket_data_type = ptr_data_type = BCH_DATA_stripe; -+ -+ if (gen_after(ptr->gen, b_gen)) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" -+ "while marking %s", -+ ptr->dev, bucket_nr, b_gen, -+ bch2_data_types[bucket_data_type ?: ptr_data_type], -+ ptr->gen, -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ ret = -EIO; -+ goto err; -+ } -+ -+ if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" -+ "while marking %s", -+ ptr->dev, bucket_nr, b_gen, -+ bch2_data_types[bucket_data_type ?: ptr_data_type], -+ ptr->gen, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ ret = -EIO; -+ goto err; -+ } -+ -+ if (b_gen != ptr->gen && !ptr->cached) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" -+ "while marking %s", -+ ptr->dev, bucket_nr, b_gen, -+ *bucket_gen(ca, bucket_nr), -+ bch2_data_types[bucket_data_type ?: ptr_data_type], -+ ptr->gen, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ ret = -EIO; -+ goto err; -+ } -+ -+ if (b_gen != ptr->gen) { -+ ret = 1; -+ goto out; -+ } -+ -+ if (!data_type_is_empty(bucket_data_type) && -+ ptr_data_type && -+ bucket_data_type != ptr_data_type) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" -+ "while marking %s", -+ ptr->dev, bucket_nr, b_gen, -+ bch2_data_types[bucket_data_type], -+ bch2_data_types[ptr_data_type], -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ ret = -EIO; -+ goto err; -+ } -+ -+ if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" -+ "while marking %s", -+ ptr->dev, bucket_nr, b_gen, -+ bch2_data_types[bucket_data_type ?: ptr_data_type], -+ bucket_sectors, sectors, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ ret = -EIO; -+ goto err; -+ } -+out: -+ printbuf_exit(&buf); -+ return ret; -+err: -+ bch2_dump_trans_updates(trans); -+ goto out; -+} -+ -+static int mark_stripe_bucket(struct btree_trans *trans, -+ struct bkey_s_c k, -+ unsigned ptr_idx, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ u64 journal_seq = trans->journal_res.seq; -+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; -+ unsigned nr_data = s->nr_blocks - s->nr_redundant; -+ bool parity = ptr_idx >= nr_data; -+ enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; -+ s64 sectors = parity ? 
le16_to_cpu(s->sectors) : 0; -+ const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket old, new, *g; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ BUG_ON(!(flags & BTREE_TRIGGER_GC)); -+ -+ /* * XXX doesn't handle deletion */ -+ -+ percpu_down_read(&c->mark_lock); -+ buf.atomic++; -+ g = PTR_GC_BUCKET(ca, ptr); -+ -+ if (g->dirty_sectors || -+ (g->stripe && g->stripe != k.k->p.offset)) { -+ bch2_fs_inconsistent(c, -+ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", -+ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ bucket_lock(g); -+ old = *g; -+ -+ ret = check_bucket_ref(trans, k, ptr, sectors, data_type, -+ g->gen, g->data_type, -+ g->dirty_sectors, g->cached_sectors); -+ if (ret) -+ goto err; -+ -+ g->data_type = data_type; -+ g->dirty_sectors += sectors; -+ -+ g->stripe = k.k->p.offset; -+ g->stripe_redundancy = s->nr_redundant; -+ new = *g; -+err: -+ bucket_unlock(g); -+ if (!ret) -+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); -+ percpu_up_read(&c->mark_lock); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int __mark_pointer(struct btree_trans *trans, -+ struct bkey_s_c k, -+ const struct bch_extent_ptr *ptr, -+ s64 sectors, enum bch_data_type ptr_data_type, -+ u8 bucket_gen, u8 *bucket_data_type, -+ u32 *dirty_sectors, u32 *cached_sectors) -+{ -+ u32 *dst_sectors = !ptr->cached -+ ? dirty_sectors -+ : cached_sectors; -+ int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, -+ bucket_gen, *bucket_data_type, -+ *dirty_sectors, *cached_sectors); -+ -+ if (ret) -+ return ret; -+ -+ *dst_sectors += sectors; -+ *bucket_data_type = *dirty_sectors || *cached_sectors -+ ? 
ptr_data_type : 0; -+ return 0; -+} -+ -+static int bch2_mark_pointer(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, -+ struct extent_ptr_decoded p, -+ s64 sectors, -+ unsigned flags) -+{ -+ u64 journal_seq = trans->journal_res.seq; -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket old, new, *g; -+ enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); -+ u8 bucket_data_type; -+ int ret = 0; -+ -+ BUG_ON(!(flags & BTREE_TRIGGER_GC)); -+ -+ percpu_down_read(&c->mark_lock); -+ g = PTR_GC_BUCKET(ca, &p.ptr); -+ bucket_lock(g); -+ old = *g; -+ -+ bucket_data_type = g->data_type; -+ ret = __mark_pointer(trans, k, &p.ptr, sectors, -+ data_type, g->gen, -+ &bucket_data_type, -+ &g->dirty_sectors, -+ &g->cached_sectors); -+ if (!ret) -+ g->data_type = bucket_data_type; -+ -+ new = *g; -+ bucket_unlock(g); -+ if (!ret) -+ bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+static int bch2_mark_stripe_ptr(struct btree_trans *trans, -+ struct bkey_s_c k, -+ struct bch_extent_stripe_ptr p, -+ enum bch_data_type data_type, -+ s64 sectors, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_replicas_padded r; -+ struct gc_stripe *m; -+ -+ BUG_ON(!(flags & BTREE_TRIGGER_GC)); -+ -+ m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); -+ if (!m) { -+ bch_err(c, "error allocating memory for gc_stripes, idx %llu", -+ (u64) p.idx); -+ return -BCH_ERR_ENOMEM_mark_stripe_ptr; -+ } -+ -+ mutex_lock(&c->ec_stripes_heap_lock); -+ -+ if (!m || !m->alive) { -+ mutex_unlock(&c->ec_stripes_heap_lock); -+ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", -+ (u64) p.idx); -+ bch2_inconsistent_error(c); -+ return -EIO; -+ } -+ -+ m->block_sectors[p.block] += sectors; -+ -+ r = m->r; -+ mutex_unlock(&c->ec_stripes_heap_lock); -+ -+ r.e.data_type = data_type; -+ update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); -+ -+ return 0; -+} -+ -+int bch2_mark_extent(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ u64 journal_seq = trans->journal_res.seq; -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_replicas_padded r; -+ enum bch_data_type data_type = bkey_is_btree_ptr(k.k) -+ ? BCH_DATA_btree -+ : BCH_DATA_user; -+ s64 sectors = bkey_is_btree_ptr(k.k) -+ ? 
btree_sectors(c) -+ : k.k->size; -+ s64 dirty_sectors = 0; -+ bool stale; -+ int ret; -+ -+ BUG_ON(!(flags & BTREE_TRIGGER_GC)); -+ -+ r.e.data_type = data_type; -+ r.e.nr_devs = 0; -+ r.e.nr_required = 1; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ s64 disk_sectors = ptr_disk_sectors(sectors, p); -+ -+ if (flags & BTREE_TRIGGER_OVERWRITE) -+ disk_sectors = -disk_sectors; -+ -+ ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags); -+ if (ret < 0) -+ return ret; -+ -+ stale = ret > 0; -+ -+ if (p.ptr.cached) { -+ if (!stale) { -+ ret = update_cached_sectors(c, k, p.ptr.dev, -+ disk_sectors, journal_seq, true); -+ if (ret) { -+ bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", -+ __func__); -+ return ret; -+ } -+ } -+ } else if (!p.has_ec) { -+ dirty_sectors += disk_sectors; -+ r.e.devs[r.e.nr_devs++] = p.ptr.dev; -+ } else { -+ ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, -+ disk_sectors, flags); -+ if (ret) -+ return ret; -+ -+ /* -+ * There may be other dirty pointers in this extent, but -+ * if so they're not required for mounting if we have an -+ * erasure coded pointer in this extent: -+ */ -+ r.e.nr_required = 0; -+ } -+ } -+ -+ if (r.e.nr_devs) { -+ ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); -+ if (ret) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf, c, k); -+ bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); -+ printbuf_exit(&buf); -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+ -+int bch2_mark_stripe(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ u64 journal_seq = trans->journal_res.seq; -+ struct bch_fs *c = trans->c; -+ u64 idx = new.k->p.offset; -+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe -+ ? bkey_s_c_to_stripe(old).v : NULL; -+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe -+ ? 
bkey_s_c_to_stripe(new).v : NULL; -+ unsigned i; -+ int ret; -+ -+ BUG_ON(gc && old_s); -+ -+ if (!gc) { -+ struct stripe *m = genradix_ptr(&c->stripes, idx); -+ -+ if (!m) { -+ struct printbuf buf1 = PRINTBUF; -+ struct printbuf buf2 = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf1, c, old); -+ bch2_bkey_val_to_text(&buf2, c, new); -+ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" -+ "old %s\n" -+ "new %s", idx, buf1.buf, buf2.buf); -+ printbuf_exit(&buf2); -+ printbuf_exit(&buf1); -+ bch2_inconsistent_error(c); -+ return -1; -+ } -+ -+ if (!new_s) { -+ bch2_stripes_heap_del(c, m, idx); -+ -+ memset(m, 0, sizeof(*m)); -+ } else { -+ m->sectors = le16_to_cpu(new_s->sectors); -+ m->algorithm = new_s->algorithm; -+ m->nr_blocks = new_s->nr_blocks; -+ m->nr_redundant = new_s->nr_redundant; -+ m->blocks_nonempty = 0; -+ -+ for (i = 0; i < new_s->nr_blocks; i++) -+ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); -+ -+ if (!old_s) -+ bch2_stripes_heap_insert(c, m, idx); -+ else -+ bch2_stripes_heap_update(c, m, idx); -+ } -+ } else { -+ struct gc_stripe *m = -+ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); -+ -+ if (!m) { -+ bch_err(c, "error allocating memory for gc_stripes, idx %llu", -+ idx); -+ return -BCH_ERR_ENOMEM_mark_stripe; -+ } -+ /* -+ * This will be wrong when we bring back runtime gc: we should -+ * be unmarking the old key and then marking the new key -+ */ -+ m->alive = true; -+ m->sectors = le16_to_cpu(new_s->sectors); -+ m->nr_blocks = new_s->nr_blocks; -+ m->nr_redundant = new_s->nr_redundant; -+ -+ for (i = 0; i < new_s->nr_blocks; i++) -+ m->ptrs[i] = new_s->ptrs[i]; -+ -+ bch2_bkey_to_replicas(&m->r.e, new); -+ -+ /* -+ * gc recalculates this field from stripe ptr -+ * references: -+ */ -+ memset(m->block_sectors, 0, sizeof(m->block_sectors)); -+ -+ for (i = 0; i < new_s->nr_blocks; i++) { -+ ret = mark_stripe_bucket(trans, new, i, flags); -+ if (ret) -+ return ret; -+ } -+ -+ ret = update_replicas(c, new, &m->r.e, -+ ((s64) m->sectors * m->nr_redundant), -+ journal_seq, gc); -+ if (ret) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf, c, new); -+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); -+ printbuf_exit(&buf); -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+ -+int bch2_mark_reservation(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; -+ struct bch_fs_usage *fs_usage; -+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; -+ s64 sectors = (s64) k.k->size; -+ -+ BUG_ON(!(flags & BTREE_TRIGGER_GC)); -+ -+ if (flags & BTREE_TRIGGER_OVERWRITE) -+ sectors = -sectors; -+ sectors *= replicas; -+ -+ percpu_down_read(&c->mark_lock); -+ preempt_disable(); -+ -+ fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); -+ replicas = clamp_t(unsigned, replicas, 1, -+ ARRAY_SIZE(fs_usage->persistent_reserved)); -+ -+ fs_usage->reserved += sectors; -+ fs_usage->persistent_reserved[replicas - 1] += sectors; -+ -+ preempt_enable(); -+ percpu_up_read(&c->mark_lock); -+ -+ return 0; -+} -+ -+static s64 __bch2_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c_reflink_p p, -+ u64 start, u64 end, -+ u64 *idx, unsigned flags, size_t r_idx) -+{ -+ struct bch_fs *c = trans->c; -+ struct reflink_gc *r; -+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; -+ u64 next_idx = end; -+ s64 ret = 0; -+ struct printbuf buf = PRINTBUF; -+ -+ if (r_idx >= c->reflink_gc_nr) -+ goto not_found; -+ -+ r = genradix_ptr(&c->reflink_gc_table, r_idx); -+ next_idx = min(next_idx, r->offset - r->size); -+ if (*idx < next_idx) -+ goto not_found; -+ -+ BUG_ON((s64) r->refcount + add < 0); -+ -+ r->refcount += add; -+ *idx = r->offset; -+ return 0; -+not_found: -+ if (fsck_err(c, "pointer to missing indirect extent\n" -+ " %s\n" -+ " missing range %llu-%llu", -+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), -+ *idx, next_idx)) { -+ struct bkey_i_error *new; -+ -+ new = bch2_trans_kmalloc(trans, sizeof(*new)); -+ ret = PTR_ERR_OR_ZERO(new); -+ if (ret) -+ goto err; -+ -+ bkey_init(&new->k); -+ new->k.type = KEY_TYPE_error; -+ new->k.p = bkey_start_pos(p.k); -+ new->k.p.offset += *idx - start; -+ bch2_key_resize(&new->k, next_idx - *idx); -+ ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i, -+ BTREE_TRIGGER_NORUN); -+ } -+ -+ *idx = next_idx; -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+int bch2_mark_reflink_p(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ struct reflink_gc *ref; -+ size_t l, r, m; -+ u64 idx = le64_to_cpu(p.v->idx), start = idx; -+ u64 end = le64_to_cpu(p.v->idx) + p.k->size; -+ int ret = 0; -+ -+ BUG_ON(!(flags & BTREE_TRIGGER_GC)); -+ -+ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) { -+ idx -= le32_to_cpu(p.v->front_pad); -+ end += le32_to_cpu(p.v->back_pad); -+ } -+ -+ l = 0; -+ r = c->reflink_gc_nr; -+ while (l < r) { -+ m = l + (r - l) / 2; -+ -+ ref = genradix_ptr(&c->reflink_gc_table, m); -+ if (ref->offset <= idx) -+ l = m + 1; -+ else -+ r = m; -+ } -+ -+ while (idx < end && !ret) -+ ret = __bch2_mark_reflink_p(trans, p, start, end, -+ &idx, flags, l++); -+ -+ return ret; -+} -+ -+void bch2_trans_fs_usage_revert(struct btree_trans *trans, -+ struct replicas_delta_list *deltas) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_fs_usage *dst; -+ struct replicas_delta *d, *top = (void *) deltas->d + deltas->used; -+ s64 added = 0; -+ unsigned i; -+ -+ percpu_down_read(&c->mark_lock); -+ preempt_disable(); -+ dst = fs_usage_ptr(c, trans->journal_res.seq, false); -+ -+ /* revert changes: */ -+ for (d = deltas->d; d != top; d = replicas_delta_next(d)) { -+ switch (d->r.data_type) { -+ case BCH_DATA_btree: -+ case BCH_DATA_user: -+ case BCH_DATA_parity: -+ added += d->delta; -+ } -+ BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); -+ } -+ -+ dst->nr_inodes -= deltas->nr_inodes; -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) { -+ added -= deltas->persistent_reserved[i]; -+ dst->reserved -= deltas->persistent_reserved[i]; -+ dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; -+ } -+ -+ if (added > 0) { -+ trans->disk_res->sectors += added; -+ this_cpu_add(*c->online_reserved, added); -+ } -+ -+ preempt_enable(); -+ percpu_up_read(&c->mark_lock); -+} -+ -+int bch2_trans_fs_usage_apply(struct btree_trans *trans, -+ struct replicas_delta_list *deltas) -+{ -+ struct bch_fs *c = trans->c; -+ static int warned_disk_usage = 0; -+ bool warn = false; -+ unsigned disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; -+ struct replicas_delta *d = deltas->d, *d2; -+ struct replicas_delta *top = (void *) deltas->d + deltas->used; -+ struct bch_fs_usage *dst; -+ s64 added = 0, should_not_have_added; -+ unsigned i; -+ -+ percpu_down_read(&c->mark_lock); -+ preempt_disable(); -+ dst = fs_usage_ptr(c, trans->journal_res.seq, false); -+ -+ for (d = deltas->d; d != top; d = replicas_delta_next(d)) { -+ switch (d->r.data_type) { -+ case BCH_DATA_btree: -+ case BCH_DATA_user: -+ case BCH_DATA_parity: -+ added += d->delta; -+ } -+ -+ if (__update_replicas(c, dst, &d->r, d->delta)) -+ goto need_mark; -+ } -+ -+ dst->nr_inodes += deltas->nr_inodes; -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) { -+ added += deltas->persistent_reserved[i]; -+ dst->reserved += deltas->persistent_reserved[i]; -+ dst->persistent_reserved[i] += deltas->persistent_reserved[i]; -+ } -+ -+ /* -+ * Not allowed to reduce sectors_available except by getting a -+ * reservation: -+ */ -+ should_not_have_added = added - (s64) disk_res_sectors; -+ if (unlikely(should_not_have_added > 0)) { -+ u64 old, new, v = atomic64_read(&c->sectors_available); -+ -+ do { -+ old = v; -+ new = max_t(s64, 0, old - should_not_have_added); -+ } while ((v = atomic64_cmpxchg(&c->sectors_available, -+ old, new)) != old); -+ -+ added -= should_not_have_added; -+ warn = true; -+ } -+ -+ if (added > 0) { -+ trans->disk_res->sectors -= added; -+ this_cpu_sub(*c->online_reserved, added); -+ } -+ -+ preempt_enable(); -+ percpu_up_read(&c->mark_lock); -+ -+ if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) -+ bch2_trans_inconsistent(trans, -+ "disk usage increased %lli more than %u sectors reserved)", -+ should_not_have_added, disk_res_sectors); -+ return 0; -+need_mark: -+ /* revert changes: */ -+ for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) -+ BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); -+ -+ preempt_enable(); -+ percpu_up_read(&c->mark_lock); -+ return -1; -+} -+ -+/* trans_mark: */ -+ -+static inline int bch2_trans_mark_pointer(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, struct extent_ptr_decoded p, -+ unsigned flags) -+{ -+ bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); -+ struct btree_iter iter; -+ struct bkey_i_alloc_v4 *a; -+ struct bpos bucket; -+ struct bch_backpointer bp; -+ s64 sectors; -+ int ret; -+ -+ bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); -+ sectors = bp.bucket_len; -+ if (!insert) -+ sectors = -sectors; -+ -+ a = bch2_trans_start_alloc_update(trans, &iter, bucket); -+ if (IS_ERR(a)) -+ return PTR_ERR(a); -+ -+ ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, -+ a->v.gen, &a->v.data_type, -+ &a->v.dirty_sectors, &a->v.cached_sectors) ?: -+ bch2_trans_update(trans, &iter, &a->k_i, 0); -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (ret) -+ return ret; -+ -+ if (!p.ptr.cached) { -+ ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, -+ struct extent_ptr_decoded p, -+ s64 sectors, enum bch_data_type data_type) -+{ -+ struct btree_iter iter; -+ struct bkey_i_stripe *s; -+ struct bch_replicas_padded r; -+ int ret = 0; -+ -+ s = bch2_bkey_get_mut_typed(trans, &iter, -+ BTREE_ID_stripes, POS(0, p.ec.idx), -+ BTREE_ITER_WITH_UPDATES, stripe); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (unlikely(ret)) { -+ bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, -+ "pointer to 
nonexistent stripe %llu", -+ (u64) p.ec.idx); -+ goto err; -+ } -+ -+ if (!bch2_ptr_matches_stripe(&s->v, p)) { -+ bch2_trans_inconsistent(trans, -+ "stripe pointer doesn't match stripe %llu", -+ (u64) p.ec.idx); -+ ret = -EIO; -+ goto err; -+ } -+ -+ stripe_blockcount_set(&s->v, p.ec.block, -+ stripe_blockcount_get(&s->v, p.ec.block) + -+ sectors); -+ -+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); -+ r.e.data_type = data_type; -+ ret = update_replicas_list(trans, &r.e, sectors); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_trans_mark_extent(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_i *new, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE -+ ? old -+ : bkey_i_to_s_c(new); -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_replicas_padded r; -+ enum bch_data_type data_type = bkey_is_btree_ptr(k.k) -+ ? BCH_DATA_btree -+ : BCH_DATA_user; -+ s64 sectors = bkey_is_btree_ptr(k.k) -+ ? btree_sectors(c) -+ : k.k->size; -+ s64 dirty_sectors = 0; -+ bool stale; -+ int ret = 0; -+ -+ r.e.data_type = data_type; -+ r.e.nr_devs = 0; -+ r.e.nr_required = 1; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ s64 disk_sectors = ptr_disk_sectors(sectors, p); -+ -+ if (flags & BTREE_TRIGGER_OVERWRITE) -+ disk_sectors = -disk_sectors; -+ -+ ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); -+ if (ret < 0) -+ return ret; -+ -+ stale = ret > 0; -+ -+ if (p.ptr.cached) { -+ if (!stale) { -+ ret = update_cached_sectors_list(trans, p.ptr.dev, -+ disk_sectors); -+ if (ret) -+ return ret; -+ } -+ } else if (!p.has_ec) { -+ dirty_sectors += disk_sectors; -+ r.e.devs[r.e.nr_devs++] = p.ptr.dev; -+ } else { -+ ret = bch2_trans_mark_stripe_ptr(trans, p, -+ disk_sectors, data_type); -+ if (ret) -+ return ret; -+ -+ r.e.nr_required = 0; -+ } -+ } -+ -+ if (r.e.nr_devs) -+ ret = update_replicas_list(trans, &r.e, dirty_sectors); -+ -+ return ret; -+} -+ -+static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, -+ struct bkey_s_c_stripe s, -+ unsigned idx, bool deleting) -+{ -+ struct bch_fs *c = trans->c; -+ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; -+ struct btree_iter iter; -+ struct bkey_i_alloc_v4 *a; -+ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant -+ ? BCH_DATA_parity : 0; -+ s64 sectors = data_type ? 
le16_to_cpu(s.v->sectors) : 0; -+ int ret = 0; -+ -+ if (deleting) -+ sectors = -sectors; -+ -+ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); -+ if (IS_ERR(a)) -+ return PTR_ERR(a); -+ -+ ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, -+ a->v.gen, a->v.data_type, -+ a->v.dirty_sectors, a->v.cached_sectors); -+ if (ret) -+ goto err; -+ -+ if (!deleting) { -+ if (bch2_trans_inconsistent_on(a->v.stripe || -+ a->v.stripe_redundancy, trans, -+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", -+ iter.pos.inode, iter.pos.offset, a->v.gen, -+ bch2_data_types[a->v.data_type], -+ a->v.dirty_sectors, -+ a->v.stripe, s.k->p.offset)) { -+ ret = -EIO; -+ goto err; -+ } -+ -+ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, -+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", -+ iter.pos.inode, iter.pos.offset, a->v.gen, -+ bch2_data_types[a->v.data_type], -+ a->v.dirty_sectors, -+ s.k->p.offset)) { -+ ret = -EIO; -+ goto err; -+ } -+ -+ a->v.stripe = s.k->p.offset; -+ a->v.stripe_redundancy = s.v->nr_redundant; -+ a->v.data_type = BCH_DATA_stripe; -+ } else { -+ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || -+ a->v.stripe_redundancy != s.v->nr_redundant, trans, -+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", -+ iter.pos.inode, iter.pos.offset, a->v.gen, -+ s.k->p.offset, a->v.stripe)) { -+ ret = -EIO; -+ goto err; -+ } -+ -+ a->v.stripe = 0; -+ a->v.stripe_redundancy = 0; -+ a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); -+ } -+ -+ a->v.dirty_sectors += sectors; -+ if (data_type) -+ a->v.data_type = !deleting ? data_type : 0; -+ -+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); -+ if (ret) -+ goto err; -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_trans_mark_stripe(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_i *new, -+ unsigned flags) -+{ -+ const struct bch_stripe *old_s = NULL; -+ struct bch_stripe *new_s = NULL; -+ struct bch_replicas_padded r; -+ unsigned i, nr_blocks; -+ int ret = 0; -+ -+ if (old.k->type == KEY_TYPE_stripe) -+ old_s = bkey_s_c_to_stripe(old).v; -+ if (new->k.type == KEY_TYPE_stripe) -+ new_s = &bkey_i_to_stripe(new)->v; -+ -+ /* -+ * If the pointers aren't changing, we don't need to do anything: -+ */ -+ if (new_s && old_s && -+ new_s->nr_blocks == old_s->nr_blocks && -+ new_s->nr_redundant == old_s->nr_redundant && -+ !memcmp(old_s->ptrs, new_s->ptrs, -+ new_s->nr_blocks * sizeof(struct bch_extent_ptr))) -+ return 0; -+ -+ BUG_ON(new_s && old_s && -+ (new_s->nr_blocks != old_s->nr_blocks || -+ new_s->nr_redundant != old_s->nr_redundant)); -+ -+ nr_blocks = new_s ? 
new_s->nr_blocks : old_s->nr_blocks; -+ -+ if (new_s) { -+ s64 sectors = le16_to_cpu(new_s->sectors); -+ -+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); -+ ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); -+ if (ret) -+ return ret; -+ } -+ -+ if (old_s) { -+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); -+ -+ bch2_bkey_to_replicas(&r.e, old); -+ ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); -+ if (ret) -+ return ret; -+ } -+ -+ for (i = 0; i < nr_blocks; i++) { -+ if (new_s && old_s && -+ !memcmp(&new_s->ptrs[i], -+ &old_s->ptrs[i], -+ sizeof(new_s->ptrs[i]))) -+ continue; -+ -+ if (new_s) { -+ ret = bch2_trans_mark_stripe_bucket(trans, -+ bkey_i_to_s_c_stripe(new), i, false); -+ if (ret) -+ break; -+ } -+ -+ if (old_s) { -+ ret = bch2_trans_mark_stripe_bucket(trans, -+ bkey_s_c_to_stripe(old), i, true); -+ if (ret) -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+int bch2_trans_mark_reservation(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, -+ struct bkey_i *new, -+ unsigned flags) -+{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE -+ ? old -+ : bkey_i_to_s_c(new); -+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; -+ s64 sectors = (s64) k.k->size; -+ struct replicas_delta_list *d; -+ int ret; -+ -+ if (flags & BTREE_TRIGGER_OVERWRITE) -+ sectors = -sectors; -+ sectors *= replicas; -+ -+ ret = bch2_replicas_deltas_realloc(trans, 0); -+ if (ret) -+ return ret; -+ -+ d = trans->fs_usage_deltas; -+ replicas = clamp_t(unsigned, replicas, 1, -+ ARRAY_SIZE(d->persistent_reserved)); -+ -+ d->persistent_reserved[replicas - 1] += sectors; -+ return 0; -+} -+ -+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c_reflink_p p, -+ u64 *idx, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_i *k; -+ __le64 *refcount; -+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; -+ struct printbuf buf = PRINTBUF; -+ int ret; -+ -+ k = bch2_bkey_get_mut_noupdate(trans, &iter, -+ BTREE_ID_reflink, POS(0, *idx), -+ BTREE_ITER_WITH_UPDATES); -+ ret = PTR_ERR_OR_ZERO(k); -+ if (ret) -+ goto err; -+ -+ refcount = bkey_refcount(k); -+ if (!refcount) { -+ bch2_bkey_val_to_text(&buf, c, p.s_c); -+ bch2_trans_inconsistent(trans, -+ "nonexistent indirect extent at %llu while marking\n %s", -+ *idx, buf.buf); -+ ret = -EIO; -+ goto err; -+ } -+ -+ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { -+ bch2_bkey_val_to_text(&buf, c, p.s_c); -+ bch2_trans_inconsistent(trans, -+ "indirect extent refcount underflow at %llu while marking\n %s", -+ *idx, buf.buf); -+ ret = -EIO; -+ goto err; -+ } -+ -+ if (flags & BTREE_TRIGGER_INSERT) { -+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; -+ u64 pad; -+ -+ pad = max_t(s64, le32_to_cpu(v->front_pad), -+ le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); -+ BUG_ON(pad > U32_MAX); -+ v->front_pad = cpu_to_le32(pad); -+ -+ pad = max_t(s64, le32_to_cpu(v->back_pad), -+ k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); -+ BUG_ON(pad > U32_MAX); -+ v->back_pad = cpu_to_le32(pad); -+ } -+ -+ le64_add_cpu(refcount, add); -+ -+ bch2_btree_iter_set_pos_to_extent_start(&iter); -+ ret = bch2_trans_update(trans, &iter, k, 0); -+ if (ret) -+ goto err; -+ -+ *idx = k->k.p.offset; -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+int bch2_trans_mark_reflink_p(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, -+ struct bkey_i *new, -+ unsigned flags) -+{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE -+ ? old -+ : bkey_i_to_s_c(new); -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ u64 idx, end_idx; -+ int ret = 0; -+ -+ if (flags & BTREE_TRIGGER_INSERT) { -+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; -+ -+ v->front_pad = v->back_pad = 0; -+ } -+ -+ idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); -+ end_idx = le64_to_cpu(p.v->idx) + p.k->size + -+ le32_to_cpu(p.v->back_pad); -+ -+ while (idx < end_idx && !ret) -+ ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); -+ -+ return ret; -+} -+ -+static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, -+ struct bch_dev *ca, size_t b, -+ enum bch_data_type type, -+ unsigned sectors) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_i_alloc_v4 *a; -+ int ret = 0; -+ -+ /* -+ * Backup superblock might be past the end of our normal usable space: -+ */ -+ if (b >= ca->mi.nbuckets) -+ return 0; -+ -+ a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); -+ if (IS_ERR(a)) -+ return PTR_ERR(a); -+ -+ if (a->v.data_type && type && a->v.data_type != type) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" -+ "while marking %s", -+ iter.pos.inode, iter.pos.offset, a->v.gen, -+ bch2_data_types[a->v.data_type], -+ bch2_data_types[type], -+ bch2_data_types[type]); -+ ret = -EIO; -+ goto out; -+ } -+ -+ a->v.data_type = type; -+ a->v.dirty_sectors = sectors; -+ -+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0); -+ if (ret) -+ goto out; -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, -+ struct bch_dev *ca, size_t b, -+ enum bch_data_type type, -+ unsigned sectors) -+{ -+ return commit_do(trans, NULL, NULL, 0, -+ 
__bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); -+} -+ -+static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, -+ struct bch_dev *ca, -+ u64 start, u64 end, -+ enum bch_data_type type, -+ u64 *bucket, unsigned *bucket_sectors) -+{ -+ do { -+ u64 b = sector_to_bucket(ca, start); -+ unsigned sectors = -+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; -+ -+ if (b != *bucket && *bucket_sectors) { -+ int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, -+ type, *bucket_sectors); -+ if (ret) -+ return ret; -+ -+ *bucket_sectors = 0; -+ } -+ -+ *bucket = b; -+ *bucket_sectors += sectors; -+ start += sectors; -+ } while (start < end); -+ -+ return 0; -+} -+ -+static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, -+ struct bch_dev *ca) -+{ -+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; -+ u64 bucket = 0; -+ unsigned i, bucket_sectors = 0; -+ int ret; -+ -+ for (i = 0; i < layout->nr_superblocks; i++) { -+ u64 offset = le64_to_cpu(layout->sb_offset[i]); -+ -+ if (offset == BCH_SB_SECTOR) { -+ ret = bch2_trans_mark_metadata_sectors(trans, ca, -+ 0, BCH_SB_SECTOR, -+ BCH_DATA_sb, &bucket, &bucket_sectors); -+ if (ret) -+ return ret; -+ } -+ -+ ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, -+ offset + (1 << layout->sb_max_size_bits), -+ BCH_DATA_sb, &bucket, &bucket_sectors); -+ if (ret) -+ return ret; -+ } -+ -+ if (bucket_sectors) { -+ ret = bch2_trans_mark_metadata_bucket(trans, ca, -+ bucket, BCH_DATA_sb, bucket_sectors); -+ if (ret) -+ return ret; -+ } -+ -+ for (i = 0; i < ca->journal.nr; i++) { -+ ret = bch2_trans_mark_metadata_bucket(trans, ca, -+ ca->journal.buckets[i], -+ BCH_DATA_journal, ca->mi.bucket_size); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) -+{ -+ int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* Disk reservations: */ -+ -+#define SECTORS_CACHE 1024 -+ -+int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, -+ u64 sectors, int flags) -+{ -+ struct bch_fs_pcpu *pcpu; -+ u64 old, v, get; -+ s64 sectors_available; -+ int ret; -+ -+ percpu_down_read(&c->mark_lock); -+ preempt_disable(); -+ pcpu = this_cpu_ptr(c->pcpu); -+ -+ if (sectors <= pcpu->sectors_available) -+ goto out; -+ -+ v = atomic64_read(&c->sectors_available); -+ do { -+ old = v; -+ get = min((u64) sectors + SECTORS_CACHE, old); -+ -+ if (get < sectors) { -+ preempt_enable(); -+ goto recalculate; -+ } -+ } while ((v = atomic64_cmpxchg(&c->sectors_available, -+ old, old - get)) != old); -+ -+ pcpu->sectors_available += get; -+ -+out: -+ pcpu->sectors_available -= sectors; -+ this_cpu_add(*c->online_reserved, sectors); -+ res->sectors += sectors; -+ -+ preempt_enable(); -+ percpu_up_read(&c->mark_lock); -+ return 0; -+ -+recalculate: -+ mutex_lock(&c->sectors_available_lock); -+ -+ percpu_u64_set(&c->pcpu->sectors_available, 0); -+ sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); -+ -+ if (sectors <= sectors_available || -+ (flags & BCH_DISK_RESERVATION_NOFAIL)) { -+ atomic64_set(&c->sectors_available, -+ max_t(s64, 0, sectors_available - sectors)); -+ this_cpu_add(*c->online_reserved, sectors); -+ res->sectors += sectors; -+ ret = 0; -+ } else { -+ atomic64_set(&c->sectors_available, sectors_available); -+ ret = -BCH_ERR_ENOSPC_disk_reservation; -+ } -+ -+ mutex_unlock(&c->sectors_available_lock); -+ percpu_up_read(&c->mark_lock); -+ -+ 
return ret; -+} -+ -+/* Startup/shutdown: */ -+ -+static void bucket_gens_free_rcu(struct rcu_head *rcu) -+{ -+ struct bucket_gens *buckets = -+ container_of(rcu, struct bucket_gens, rcu); -+ -+ kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); -+} -+ -+int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -+{ -+ struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; -+ unsigned long *buckets_nouse = NULL; -+ bool resize = ca->bucket_gens != NULL; -+ int ret; -+ -+ if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, -+ GFP_KERNEL|__GFP_ZERO))) { -+ ret = -BCH_ERR_ENOMEM_bucket_gens; -+ goto err; -+ } -+ -+ if ((c->opts.buckets_nouse && -+ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * -+ sizeof(unsigned long), -+ GFP_KERNEL|__GFP_ZERO)))) { -+ ret = -BCH_ERR_ENOMEM_buckets_nouse; -+ goto err; -+ } -+ -+ bucket_gens->first_bucket = ca->mi.first_bucket; -+ bucket_gens->nbuckets = nbuckets; -+ -+ bch2_copygc_stop(c); -+ -+ if (resize) { -+ down_write(&c->gc_lock); -+ down_write(&ca->bucket_lock); -+ percpu_down_write(&c->mark_lock); -+ } -+ -+ old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); -+ -+ if (resize) { -+ size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); -+ -+ memcpy(bucket_gens->b, -+ old_bucket_gens->b, -+ n); -+ if (buckets_nouse) -+ memcpy(buckets_nouse, -+ ca->buckets_nouse, -+ BITS_TO_LONGS(n) * sizeof(unsigned long)); -+ } -+ -+ rcu_assign_pointer(ca->bucket_gens, bucket_gens); -+ bucket_gens = old_bucket_gens; -+ -+ swap(ca->buckets_nouse, buckets_nouse); -+ -+ nbuckets = ca->mi.nbuckets; -+ -+ if (resize) { -+ percpu_up_write(&c->mark_lock); -+ up_write(&ca->bucket_lock); -+ up_write(&c->gc_lock); -+ } -+ -+ ret = 0; -+err: -+ kvpfree(buckets_nouse, -+ BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); -+ if (bucket_gens) -+ call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); -+ -+ return ret; -+} -+ -+void bch2_dev_buckets_free(struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ kvpfree(ca->buckets_nouse, -+ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); -+ kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), -+ sizeof(struct bucket_gens) + ca->mi.nbuckets); -+ -+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) -+ free_percpu(ca->usage[i]); -+ kfree(ca->usage_base); -+} -+ -+int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); -+ if (!ca->usage_base) -+ return -BCH_ERR_ENOMEM_usage_init; -+ -+ for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { -+ ca->usage[i] = alloc_percpu(struct bch_dev_usage); -+ if (!ca->usage[i]) -+ return -BCH_ERR_ENOMEM_usage_init; -+ } -+ -+ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); -+} -diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h -new file mode 100644 -index 000000000..f192809f5 ---- /dev/null -+++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,413 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Code for manipulating bucket marks for garbage collection. -+ * -+ * Copyright 2014 Datera, Inc. 
-+ */ -+ -+#ifndef _BUCKETS_H -+#define _BUCKETS_H -+ -+#include "buckets_types.h" -+#include "extents.h" -+#include "sb-members.h" -+ -+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) -+{ -+ return div_u64(s, ca->mi.bucket_size); -+} -+ -+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -+{ -+ return ((sector_t) b) * ca->mi.bucket_size; -+} -+ -+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -+{ -+ u32 remainder; -+ -+ div_u64_rem(s, ca->mi.bucket_size, &remainder); -+ return remainder; -+} -+ -+static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, -+ u32 *offset) -+{ -+ return div_u64_rem(s, ca->mi.bucket_size, offset); -+} -+ -+#define for_each_bucket(_b, _buckets) \ -+ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ -+ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -+ -+static inline void bucket_unlock(struct bucket *b) -+{ -+ smp_store_release(&b->lock, 0); -+} -+ -+static inline void bucket_lock(struct bucket *b) -+{ -+ while (xchg(&b->lock, 1)) -+ cpu_relax(); -+} -+ -+static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) -+{ -+ return rcu_dereference_check(ca->buckets_gc, -+ !ca->fs || -+ percpu_rwsem_is_held(&ca->fs->mark_lock) || -+ lockdep_is_held(&ca->fs->gc_lock) || -+ lockdep_is_held(&ca->bucket_lock)); -+} -+ -+static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) -+{ -+ struct bucket_array *buckets = gc_bucket_array(ca); -+ -+ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); -+ return buckets->b + b; -+} -+ -+static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) -+{ -+ return rcu_dereference_check(ca->bucket_gens, -+ !ca->fs || -+ percpu_rwsem_is_held(&ca->fs->mark_lock) || -+ lockdep_is_held(&ca->fs->gc_lock) || -+ lockdep_is_held(&ca->bucket_lock)); -+} -+ -+static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) -+{ -+ struct bucket_gens *gens = bucket_gens(ca); -+ -+ BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); -+ return gens->b + b; -+} -+ -+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ return sector_to_bucket(ca, ptr->offset); -+} -+ -+static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, -+ const struct bch_extent_ptr *ptr) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); -+} -+ -+static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, -+ const struct bch_extent_ptr *ptr, -+ u32 *bucket_offset) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); -+} -+ -+static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); -+} -+ -+static inline enum bch_data_type ptr_data_type(const struct bkey *k, -+ const struct bch_extent_ptr *ptr) -+{ -+ if (bkey_is_btree_ptr(k)) -+ return BCH_DATA_btree; -+ -+ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; -+} -+ -+static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) -+{ -+ EBUG_ON(sectors < 0); -+ -+ return crc_is_compressed(p.crc) -+ ? 
DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, -+ p.crc.uncompressed_size) -+ : sectors; -+} -+ -+static inline int gen_cmp(u8 a, u8 b) -+{ -+ return (s8) (a - b); -+} -+ -+static inline int gen_after(u8 a, u8 b) -+{ -+ int r = gen_cmp(a, b); -+ -+ return r > 0 ? r : 0; -+} -+ -+/** -+ * ptr_stale() - check if a pointer points into a bucket that has been -+ * invalidated. -+ */ -+static inline u8 ptr_stale(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ u8 ret; -+ -+ rcu_read_lock(); -+ ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+/* Device usage: */ -+ -+void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *); -+static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) -+{ -+ struct bch_dev_usage ret; -+ -+ bch2_dev_usage_read_fast(ca, &ret); -+ return ret; -+} -+ -+void bch2_dev_usage_init(struct bch_dev *); -+ -+static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) -+{ -+ s64 reserved = 0; -+ -+ switch (watermark) { -+ case BCH_WATERMARK_NR: -+ unreachable(); -+ case BCH_WATERMARK_stripe: -+ reserved += ca->mi.nbuckets >> 6; -+ fallthrough; -+ case BCH_WATERMARK_normal: -+ reserved += ca->mi.nbuckets >> 6; -+ fallthrough; -+ case BCH_WATERMARK_copygc: -+ reserved += ca->nr_btree_reserve; -+ fallthrough; -+ case BCH_WATERMARK_btree: -+ reserved += ca->nr_btree_reserve; -+ fallthrough; -+ case BCH_WATERMARK_btree_copygc: -+ case BCH_WATERMARK_reclaim: -+ break; -+ } -+ -+ return reserved; -+} -+ -+static inline u64 dev_buckets_free(struct bch_dev *ca, -+ struct bch_dev_usage usage, -+ enum bch_watermark watermark) -+{ -+ return max_t(s64, 0, -+ usage.d[BCH_DATA_free].buckets - -+ ca->nr_open_buckets - -+ bch2_dev_buckets_reserved(ca, watermark)); -+} -+ -+static inline u64 __dev_buckets_available(struct bch_dev *ca, -+ struct bch_dev_usage usage, -+ enum bch_watermark watermark) -+{ -+ return max_t(s64, 0, -+ usage.d[BCH_DATA_free].buckets -+ + usage.d[BCH_DATA_cached].buckets -+ + usage.d[BCH_DATA_need_gc_gens].buckets -+ + usage.d[BCH_DATA_need_discard].buckets -+ - ca->nr_open_buckets -+ - bch2_dev_buckets_reserved(ca, watermark)); -+} -+ -+static inline u64 dev_buckets_available(struct bch_dev *ca, -+ enum bch_watermark watermark) -+{ -+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark); -+} -+ -+/* Filesystem usage: */ -+ -+static inline unsigned __fs_usage_u64s(unsigned nr_replicas) -+{ -+ return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas; -+} -+ -+static inline unsigned fs_usage_u64s(struct bch_fs *c) -+{ -+ return __fs_usage_u64s(READ_ONCE(c->replicas.nr)); -+} -+ -+static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas) -+{ -+ return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas; -+} -+ -+static inline unsigned fs_usage_online_u64s(struct bch_fs *c) -+{ -+ return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr)); -+} -+ -+static inline unsigned dev_usage_u64s(void) -+{ -+ return sizeof(struct bch_dev_usage) / sizeof(u64); -+} -+ -+u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); -+ -+struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); -+ -+void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); -+ -+void bch2_fs_usage_to_text(struct printbuf *, -+ struct bch_fs *, struct bch_fs_usage_online *); -+ -+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); -+ -+struct bch_fs_usage_short -+bch2_fs_usage_read_short(struct 
bch_fs *); -+ -+/* key/bucket marking: */ -+ -+static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, -+ unsigned journal_seq, -+ bool gc) -+{ -+ percpu_rwsem_assert_held(&c->mark_lock); -+ BUG_ON(!gc && !journal_seq); -+ -+ return this_cpu_ptr(gc -+ ? c->usage_gc -+ : c->usage[journal_seq & JOURNAL_BUF_MASK]); -+} -+ -+int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned); -+ -+void bch2_fs_usage_initialize(struct bch_fs *); -+ -+int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, -+ size_t, enum bch_data_type, unsigned, -+ struct gc_pos, unsigned); -+ -+int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_s_c, unsigned); -+int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_s_c, unsigned); -+int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_s_c, unsigned); -+int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_s_c, unsigned); -+int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_s_c, unsigned); -+ -+int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -+int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -+int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -+int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); -+ -+void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); -+int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); -+ -+int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, -+ size_t, enum bch_data_type, unsigned); -+int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); -+ -+static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) -+{ -+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; -+ u64 b_offset = bucket_to_sector(ca, b); -+ u64 b_end = bucket_to_sector(ca, b + 1); -+ unsigned i; -+ -+ if (!b) -+ return true; -+ -+ for (i = 0; i < layout->nr_superblocks; i++) { -+ u64 offset = le64_to_cpu(layout->sb_offset[i]); -+ u64 end = offset + (1 << layout->sb_max_size_bits); -+ -+ if (!(offset >= b_end || end <= b_offset)) -+ return true; -+ } -+ -+ return false; -+} -+ -+/* disk reservations: */ -+ -+static inline void bch2_disk_reservation_put(struct bch_fs *c, -+ struct disk_reservation *res) -+{ -+ if (res->sectors) { -+ this_cpu_sub(*c->online_reserved, res->sectors); -+ res->sectors = 0; -+ } -+} -+ -+#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) -+ -+int __bch2_disk_reservation_add(struct bch_fs *, -+ struct disk_reservation *, -+ u64, int); -+ -+static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, -+ u64 sectors, int flags) -+{ -+#ifdef __KERNEL__ -+ u64 old, new; -+ -+ do { -+ old = this_cpu_read(c->pcpu->sectors_available); -+ if (sectors > old) -+ return __bch2_disk_reservation_add(c, res, sectors, flags); -+ -+ new = old - sectors; -+ } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old); -+ -+ this_cpu_add(*c->online_reserved, sectors); -+ res->sectors += sectors; -+ return 0; -+#else -+ return __bch2_disk_reservation_add(c, res, sectors, flags); -+#endif -+} -+ 
-+static inline struct disk_reservation -+bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) -+{ -+ return (struct disk_reservation) { -+ .sectors = 0, -+#if 0 -+ /* not used yet: */ -+ .gen = c->capacity_gen, -+#endif -+ .nr_replicas = nr_replicas, -+ }; -+} -+ -+static inline int bch2_disk_reservation_get(struct bch_fs *c, -+ struct disk_reservation *res, -+ u64 sectors, unsigned nr_replicas, -+ int flags) -+{ -+ *res = bch2_disk_reservation_init(c, nr_replicas); -+ -+ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); -+} -+ -+#define RESERVE_FACTOR 6 -+ -+static inline u64 avail_factor(u64 r) -+{ -+ return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); -+} -+ -+int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); -+void bch2_dev_buckets_free(struct bch_dev *); -+int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); -+ -+#endif /* _BUCKETS_H */ -diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h -new file mode 100644 -index 000000000..2a9dab900 ---- /dev/null -+++ b/fs/bcachefs/buckets_types.h -@@ -0,0 +1,92 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BUCKETS_TYPES_H -+#define _BUCKETS_TYPES_H -+ -+#include "bcachefs_format.h" -+#include "util.h" -+ -+#define BUCKET_JOURNAL_SEQ_BITS 16 -+ -+struct bucket { -+ u8 lock; -+ u8 gen_valid:1; -+ u8 data_type:7; -+ u8 gen; -+ u8 stripe_redundancy; -+ u32 stripe; -+ u32 dirty_sectors; -+ u32 cached_sectors; -+}; -+ -+struct bucket_array { -+ struct rcu_head rcu; -+ u16 first_bucket; -+ size_t nbuckets; -+ struct bucket b[]; -+}; -+ -+struct bucket_gens { -+ struct rcu_head rcu; -+ u16 first_bucket; -+ size_t nbuckets; -+ u8 b[]; -+}; -+ -+struct bch_dev_usage { -+ u64 buckets_ec; -+ -+ struct { -+ u64 buckets; -+ u64 sectors; /* _compressed_ sectors: */ -+ /* -+ * XXX -+ * Why do we have this? Isn't it just buckets * bucket_size - -+ * sectors? 
-+ */ -+ u64 fragmented; -+ } d[BCH_DATA_NR]; -+}; -+ -+struct bch_fs_usage { -+ /* all fields are in units of 512 byte sectors: */ -+ u64 hidden; -+ u64 btree; -+ u64 data; -+ u64 cached; -+ u64 reserved; -+ u64 nr_inodes; -+ -+ /* XXX: add stats for compression ratio */ -+#if 0 -+ u64 uncompressed; -+ u64 compressed; -+#endif -+ -+ /* broken out: */ -+ -+ u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ u64 replicas[]; -+}; -+ -+struct bch_fs_usage_online { -+ u64 online_reserved; -+ struct bch_fs_usage u; -+}; -+ -+struct bch_fs_usage_short { -+ u64 capacity; -+ u64 used; -+ u64 free; -+ u64 nr_inodes; -+}; -+ -+/* -+ * A reservation for space on disk: -+ */ -+struct disk_reservation { -+ u64 sectors; -+ u32 gen; -+ unsigned nr_replicas; -+}; -+ -+#endif /* _BUCKETS_TYPES_H */ -diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c -new file mode 100644 -index 000000000..81ab685cd ---- /dev/null -+++ b/fs/bcachefs/buckets_waiting_for_journal.c -@@ -0,0 +1,166 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "buckets_waiting_for_journal.h" -+#include -+#include -+ -+static inline struct bucket_hashed * -+bucket_hash(struct buckets_waiting_for_journal_table *t, -+ unsigned hash_seed_idx, u64 dev_bucket) -+{ -+ return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits); -+} -+ -+static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits) -+{ -+ unsigned i; -+ -+ t->bits = bits; -+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) -+ get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); -+ memset(t->d, 0, sizeof(t->d[0]) << t->bits); -+} -+ -+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, -+ u64 flushed_seq, -+ unsigned dev, u64 bucket) -+{ -+ struct buckets_waiting_for_journal_table *t; -+ u64 dev_bucket = (u64) dev << 56 | bucket; -+ bool ret = false; -+ unsigned i; -+ -+ mutex_lock(&b->lock); -+ t = b->t; -+ -+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { -+ struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); -+ -+ if (h->dev_bucket == dev_bucket) { -+ ret = h->journal_seq > flushed_seq; -+ break; -+ } -+ } -+ -+ mutex_unlock(&b->lock); -+ -+ return ret; -+} -+ -+static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, -+ struct bucket_hashed *new, -+ u64 flushed_seq) -+{ -+ struct bucket_hashed *last_evicted = NULL; -+ unsigned tries, i; -+ -+ for (tries = 0; tries < 10; tries++) { -+ struct bucket_hashed *old, *victim = NULL; -+ -+ for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { -+ old = bucket_hash(t, i, new->dev_bucket); -+ -+ if (old->dev_bucket == new->dev_bucket || -+ old->journal_seq <= flushed_seq) { -+ *old = *new; -+ return true; -+ } -+ -+ if (last_evicted != old) -+ victim = old; -+ } -+ -+ /* hashed to same slot 3 times: */ -+ if (!victim) -+ break; -+ -+ /* Failed to find an empty slot: */ -+ swap(*new, *victim); -+ last_evicted = victim; -+ } -+ -+ return false; -+} -+ -+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, -+ u64 flushed_seq, -+ unsigned dev, u64 bucket, -+ u64 journal_seq) -+{ -+ struct buckets_waiting_for_journal_table *t, *n; -+ struct bucket_hashed tmp, new = { -+ .dev_bucket = (u64) dev << 56 | bucket, -+ .journal_seq = journal_seq, -+ }; -+ size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0; -+ int ret = 0; -+ -+ mutex_lock(&b->lock); -+ -+ if (likely(bucket_table_insert(b->t, &new, flushed_seq))) -+ goto out; -+ -+ t = b->t; -+ size = 
1UL << t->bits; -+ for (i = 0; i < size; i++) -+ nr_elements += t->d[i].journal_seq > flushed_seq; -+ -+ new_bits = t->bits + (nr_elements * 3 > size); -+ -+ n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); -+ if (!n) { -+ ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; -+ goto out; -+ } -+ -+retry_rehash: -+ nr_rehashes++; -+ bucket_table_init(n, new_bits); -+ -+ tmp = new; -+ BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); -+ -+ for (i = 0; i < 1UL << t->bits; i++) { -+ if (t->d[i].journal_seq <= flushed_seq) -+ continue; -+ -+ tmp = t->d[i]; -+ if (!bucket_table_insert(n, &tmp, flushed_seq)) -+ goto retry_rehash; -+ } -+ -+ b->t = n; -+ kvfree(t); -+ -+ pr_debug("took %zu rehashes, table at %zu/%zu elements", -+ nr_rehashes, nr_elements, 1UL << b->t->bits); -+out: -+ mutex_unlock(&b->lock); -+ -+ return ret; -+} -+ -+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) -+{ -+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; -+ -+ kvfree(b->t); -+} -+ -+#define INITIAL_TABLE_BITS 3 -+ -+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) -+{ -+ struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; -+ -+ mutex_init(&b->lock); -+ -+ b->t = kvmalloc(sizeof(*b->t) + -+ (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL); -+ if (!b->t) -+ return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init; -+ -+ bucket_table_init(b->t, INITIAL_TABLE_BITS); -+ return 0; -+} -diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h -new file mode 100644 -index 000000000..d2ae19cbe ---- /dev/null -+++ b/fs/bcachefs/buckets_waiting_for_journal.h -@@ -0,0 +1,15 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H -+#define _BUCKETS_WAITING_FOR_JOURNAL_H -+ -+#include "buckets_waiting_for_journal_types.h" -+ -+bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, -+ u64, unsigned, u64); -+int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, -+ u64, unsigned, u64, u64); -+ -+void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); -+int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); -+ -+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ -diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h -new file mode 100644 -index 000000000..e593db061 ---- /dev/null -+++ b/fs/bcachefs/buckets_waiting_for_journal_types.h -@@ -0,0 +1,23 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H -+#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H -+ -+#include -+ -+struct bucket_hashed { -+ u64 dev_bucket; -+ u64 journal_seq; -+}; -+ -+struct buckets_waiting_for_journal_table { -+ unsigned bits; -+ u64 hash_seeds[3]; -+ struct bucket_hashed d[]; -+}; -+ -+struct buckets_waiting_for_journal { -+ struct mutex lock; -+ struct buckets_waiting_for_journal_table *t; -+}; -+ -+#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ -diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c -new file mode 100644 -index 000000000..fb603df09 ---- /dev/null -+++ b/fs/bcachefs/chardev.c -@@ -0,0 +1,769 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_CHARDEV -+ -+#include "bcachefs.h" -+#include "bcachefs_ioctl.h" -+#include "buckets.h" -+#include "chardev.h" -+#include "journal.h" -+#include "move.h" -+#include "replicas.h" -+#include "super.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include 
-+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* returns with ref on ca->ref */ -+static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, -+ unsigned flags) -+{ -+ struct bch_dev *ca; -+ -+ if (flags & BCH_BY_INDEX) { -+ if (dev >= c->sb.nr_devices) -+ return ERR_PTR(-EINVAL); -+ -+ rcu_read_lock(); -+ ca = rcu_dereference(c->devs[dev]); -+ if (ca) -+ percpu_ref_get(&ca->ref); -+ rcu_read_unlock(); -+ -+ if (!ca) -+ return ERR_PTR(-EINVAL); -+ } else { -+ char *path; -+ -+ path = strndup_user((const char __user *) -+ (unsigned long) dev, PATH_MAX); -+ if (IS_ERR(path)) -+ return ERR_CAST(path); -+ -+ ca = bch2_dev_lookup(c, path); -+ kfree(path); -+ } -+ -+ return ca; -+} -+ -+#if 0 -+static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) -+{ -+ struct bch_ioctl_assemble arg; -+ struct bch_fs *c; -+ u64 *user_devs = NULL; -+ char **devs = NULL; -+ unsigned i; -+ int ret = -EFAULT; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); -+ if (!user_devs) -+ return -ENOMEM; -+ -+ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); -+ -+ if (copy_from_user(user_devs, user_arg->devs, -+ sizeof(u64) * arg.nr_devs)) -+ goto err; -+ -+ for (i = 0; i < arg.nr_devs; i++) { -+ devs[i] = strndup_user((const char __user *)(unsigned long) -+ user_devs[i], -+ PATH_MAX); -+ if (!devs[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ } -+ -+ c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); -+ ret = PTR_ERR_OR_ZERO(c); -+ if (!ret) -+ closure_put(&c->cl); -+err: -+ if (devs) -+ for (i = 0; i < arg.nr_devs; i++) -+ kfree(devs[i]); -+ kfree(devs); -+ return ret; -+} -+ -+static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) -+{ -+ struct bch_ioctl_incremental arg; -+ const char *err; -+ char *path; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ if (!path) -+ return -ENOMEM; -+ -+ err = bch2_fs_open_incremental(path); -+ kfree(path); -+ -+ if (err) { -+ pr_err("Could not register bcachefs devices: %s", err); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+#endif -+ -+static long bch2_global_ioctl(unsigned cmd, void __user *arg) -+{ -+ switch (cmd) { -+#if 0 -+ case BCH_IOCTL_ASSEMBLE: -+ return bch2_ioctl_assemble(arg); -+ case BCH_IOCTL_INCREMENTAL: -+ return bch2_ioctl_incremental(arg); -+#endif -+ default: -+ return -ENOTTY; -+ } -+} -+ -+static long bch2_ioctl_query_uuid(struct bch_fs *c, -+ struct bch_ioctl_query_uuid __user *user_arg) -+{ -+ return copy_to_user(&user_arg->uuid, -+ &c->sb.user_uuid, -+ sizeof(c->sb.user_uuid)); -+} -+ -+#if 0 -+static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) -+{ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ return bch2_fs_start(c); -+} -+ -+static long bch2_ioctl_stop(struct bch_fs *c) -+{ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ bch2_fs_stop(c); -+ return 0; -+} -+#endif -+ -+static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ char *path; -+ int ret; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ 
if (!path) -+ return -ENOMEM; -+ -+ ret = bch2_dev_add(c, path); -+ kfree(path); -+ -+ return ret; -+} -+ -+static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ struct bch_dev *ca; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ return bch2_dev_remove(c, ca, arg.flags); -+} -+ -+static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ char *path; -+ int ret; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ if (!path) -+ return -ENOMEM; -+ -+ ret = bch2_dev_online(c, path); -+ kfree(path); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_offline(c, ca, arg.flags); -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_set_state(struct bch_fs *c, -+ struct bch_ioctl_disk_set_state arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad[0] || arg.pad[1] || arg.pad[2] || -+ arg.new_state >= BCH_MEMBER_STATE_NR) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); -+ if (ret) -+ bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); -+ -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+struct bch_data_ctx { -+ struct bch_fs *c; -+ struct bch_ioctl_data arg; -+ struct bch_move_stats stats; -+ -+ int ret; -+ -+ struct task_struct *thread; -+}; -+ -+static int bch2_data_thread(void *arg) -+{ -+ struct bch_data_ctx *ctx = arg; -+ -+ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); -+ -+ ctx->stats.data_type = U8_MAX; -+ return 0; -+} -+ -+static int bch2_data_job_release(struct inode *inode, struct file *file) -+{ -+ struct bch_data_ctx *ctx = file->private_data; -+ -+ kthread_stop(ctx->thread); -+ put_task_struct(ctx->thread); -+ kfree(ctx); -+ return 0; -+} -+ -+static ssize_t bch2_data_job_read(struct file *file, char __user *buf, -+ size_t len, loff_t *ppos) -+{ -+ struct bch_data_ctx *ctx = file->private_data; -+ struct bch_fs *c = ctx->c; -+ struct bch_ioctl_data_event e = { -+ .type = BCH_DATA_EVENT_PROGRESS, -+ .p.data_type = ctx->stats.data_type, -+ .p.btree_id = ctx->stats.btree_id, -+ .p.pos = ctx->stats.pos, -+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), -+ .p.sectors_total = bch2_fs_usage_read_short(c).used, -+ }; -+ -+ if (len < sizeof(e)) -+ return -EINVAL; -+ -+ return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); -+} -+ -+static const struct file_operations bcachefs_data_ops = { -+ .release = bch2_data_job_release, -+ .read = bch2_data_job_read, 
-+ .llseek = no_llseek, -+}; -+ -+static long bch2_ioctl_data(struct bch_fs *c, -+ struct bch_ioctl_data arg) -+{ -+ struct bch_data_ctx *ctx = NULL; -+ struct file *file = NULL; -+ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; -+ int ret, fd = -1; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if (arg.op >= BCH_DATA_OP_NR || arg.flags) -+ return -EINVAL; -+ -+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); -+ if (!ctx) -+ return -ENOMEM; -+ -+ ctx->c = c; -+ ctx->arg = arg; -+ -+ ctx->thread = kthread_create(bch2_data_thread, ctx, -+ "bch-data/%s", c->name); -+ if (IS_ERR(ctx->thread)) { -+ ret = PTR_ERR(ctx->thread); -+ goto err; -+ } -+ -+ ret = get_unused_fd_flags(flags); -+ if (ret < 0) -+ goto err; -+ fd = ret; -+ -+ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); -+ if (IS_ERR(file)) { -+ ret = PTR_ERR(file); -+ goto err; -+ } -+ -+ fd_install(fd, file); -+ -+ get_task_struct(ctx->thread); -+ wake_up_process(ctx->thread); -+ -+ return fd; -+err: -+ if (fd >= 0) -+ put_unused_fd(fd); -+ if (!IS_ERR_OR_NULL(ctx->thread)) -+ kthread_stop(ctx->thread); -+ kfree(ctx); -+ return ret; -+} -+ -+static long bch2_ioctl_fs_usage(struct bch_fs *c, -+ struct bch_ioctl_fs_usage __user *user_arg) -+{ -+ struct bch_ioctl_fs_usage *arg = NULL; -+ struct bch_replicas_usage *dst_e, *dst_end; -+ struct bch_fs_usage_online *src; -+ u32 replica_entries_bytes; -+ unsigned i; -+ int ret = 0; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) -+ return -EFAULT; -+ -+ arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); -+ if (!arg) -+ return -ENOMEM; -+ -+ src = bch2_fs_usage_read(c); -+ if (!src) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ arg->capacity = c->capacity; -+ arg->used = bch2_fs_sectors_used(c, src); -+ arg->online_reserved = src->online_reserved; -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ arg->persistent_reserved[i] = src->u.persistent_reserved[i]; -+ -+ dst_e = arg->replicas; -+ dst_end = (void *) arg->replicas + replica_entries_bytes; -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *src_e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ /* check that we have enough space for one replicas entry */ -+ if (dst_e + 1 > dst_end) { -+ ret = -ERANGE; -+ break; -+ } -+ -+ dst_e->sectors = src->u.replicas[i]; -+ dst_e->r = *src_e; -+ -+ /* recheck after setting nr_devs: */ -+ if (replicas_usage_next(dst_e) > dst_end) { -+ ret = -ERANGE; -+ break; -+ } -+ -+ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); -+ -+ dst_e = replicas_usage_next(dst_e); -+ } -+ -+ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; -+ -+ percpu_up_read(&c->mark_lock); -+ kfree(src); -+ -+ if (!ret) -+ ret = copy_to_user(user_arg, arg, -+ sizeof(*arg) + arg->replica_entries_bytes); -+err: -+ kfree(arg); -+ return ret; -+} -+ -+static long bch2_ioctl_dev_usage(struct bch_fs *c, -+ struct bch_ioctl_dev_usage __user *user_arg) -+{ -+ struct bch_ioctl_dev_usage arg; -+ struct bch_dev_usage src; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if ((arg.flags & ~BCH_BY_INDEX) || -+ arg.pad[0] || -+ arg.pad[1] || -+ arg.pad[2]) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ src = bch2_dev_usage_read(ca); -+ -+ arg.state = ca->mi.state; -+ 
arg.bucket_size = ca->mi.bucket_size; -+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; -+ arg.buckets_ec = src.buckets_ec; -+ -+ for (i = 0; i < BCH_DATA_NR; i++) { -+ arg.d[i].buckets = src.d[i].buckets; -+ arg.d[i].sectors = src.d[i].sectors; -+ arg.d[i].fragmented = src.d[i].fragmented; -+ } -+ -+ percpu_ref_put(&ca->ref); -+ -+ return copy_to_user(user_arg, &arg, sizeof(arg)); -+} -+ -+static long bch2_ioctl_read_super(struct bch_fs *c, -+ struct bch_ioctl_read_super arg) -+{ -+ struct bch_dev *ca = NULL; -+ struct bch_sb *sb; -+ int ret = 0; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || -+ arg.pad) -+ return -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (arg.flags & BCH_READ_DEV) { -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ -+ if (IS_ERR(ca)) { -+ ret = PTR_ERR(ca); -+ goto err; -+ } -+ -+ sb = ca->disk_sb.sb; -+ } else { -+ sb = c->disk_sb.sb; -+ } -+ -+ if (vstruct_bytes(sb) > arg.size) { -+ ret = -ERANGE; -+ goto err; -+ } -+ -+ ret = copy_to_user((void __user *)(unsigned long)arg.sb, -+ sb, vstruct_bytes(sb)); -+err: -+ if (!IS_ERR_OR_NULL(ca)) -+ percpu_ref_put(&ca->ref); -+ mutex_unlock(&c->sb_lock); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_get_idx(struct bch_fs *c, -+ struct bch_ioctl_disk_get_idx arg) -+{ -+ dev_t dev = huge_decode_dev(arg.dev); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if (!dev) -+ return -EINVAL; -+ -+ for_each_online_member(ca, c, i) -+ if (ca->dev == dev) { -+ percpu_ref_put(&ca->io_ref); -+ return i; -+ } -+ -+ return -BCH_ERR_ENOENT_dev_idx_not_found; -+} -+ -+static long bch2_ioctl_disk_resize(struct bch_fs *c, -+ struct bch_ioctl_disk_resize arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if ((arg.flags & ~BCH_BY_INDEX) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_resize(c, ca, arg.nbuckets); -+ -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, -+ struct bch_ioctl_disk_resize_journal arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if ((arg.flags & ~BCH_BY_INDEX) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); -+ -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+#define BCH_IOCTL(_name, _argtype) \ -+do { \ -+ _argtype i; \ -+ \ -+ if (copy_from_user(&i, arg, sizeof(i))) \ -+ return -EFAULT; \ -+ ret = bch2_ioctl_##_name(c, i); \ -+ goto out; \ -+} while (0) -+ -+long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) -+{ -+ long ret; -+ -+ switch (cmd) { -+ case BCH_IOCTL_QUERY_UUID: -+ return bch2_ioctl_query_uuid(c, arg); -+ case BCH_IOCTL_FS_USAGE: -+ return bch2_ioctl_fs_usage(c, arg); -+ case BCH_IOCTL_DEV_USAGE: -+ return bch2_ioctl_dev_usage(c, arg); -+#if 0 -+ case BCH_IOCTL_START: -+ BCH_IOCTL(start, struct bch_ioctl_start); -+ case BCH_IOCTL_STOP: -+ return bch2_ioctl_stop(c); -+#endif -+ case BCH_IOCTL_READ_SUPER: -+ BCH_IOCTL(read_super, struct bch_ioctl_read_super); -+ case BCH_IOCTL_DISK_GET_IDX: -+ BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); -+ } -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ switch (cmd) { -+ case 
BCH_IOCTL_DISK_ADD: -+ BCH_IOCTL(disk_add, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_REMOVE: -+ BCH_IOCTL(disk_remove, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_ONLINE: -+ BCH_IOCTL(disk_online, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_OFFLINE: -+ BCH_IOCTL(disk_offline, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_SET_STATE: -+ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); -+ case BCH_IOCTL_DATA: -+ BCH_IOCTL(data, struct bch_ioctl_data); -+ case BCH_IOCTL_DISK_RESIZE: -+ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); -+ case BCH_IOCTL_DISK_RESIZE_JOURNAL: -+ BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); -+ -+ default: -+ return -ENOTTY; -+ } -+out: -+ if (ret < 0) -+ ret = bch2_err_class(ret); -+ return ret; -+} -+ -+static DEFINE_IDR(bch_chardev_minor); -+ -+static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) -+{ -+ unsigned minor = iminor(file_inode(filp)); -+ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; -+ void __user *arg = (void __user *) v; -+ -+ return c -+ ? bch2_fs_ioctl(c, cmd, arg) -+ : bch2_global_ioctl(cmd, arg); -+} -+ -+static const struct file_operations bch_chardev_fops = { -+ .owner = THIS_MODULE, -+ .unlocked_ioctl = bch2_chardev_ioctl, -+ .open = nonseekable_open, -+}; -+ -+static int bch_chardev_major; -+static struct class *bch_chardev_class; -+static struct device *bch_chardev; -+ -+void bch2_fs_chardev_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->chardev)) -+ device_unregister(c->chardev); -+ if (c->minor >= 0) -+ idr_remove(&bch_chardev_minor, c->minor); -+} -+ -+int bch2_fs_chardev_init(struct bch_fs *c) -+{ -+ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); -+ if (c->minor < 0) -+ return c->minor; -+ -+ c->chardev = device_create(bch_chardev_class, NULL, -+ MKDEV(bch_chardev_major, c->minor), c, -+ "bcachefs%u-ctl", c->minor); -+ if (IS_ERR(c->chardev)) -+ return PTR_ERR(c->chardev); -+ -+ return 0; -+} -+ -+void bch2_chardev_exit(void) -+{ -+ if (!IS_ERR_OR_NULL(bch_chardev_class)) -+ device_destroy(bch_chardev_class, -+ MKDEV(bch_chardev_major, U8_MAX)); -+ if (!IS_ERR_OR_NULL(bch_chardev_class)) -+ class_destroy(bch_chardev_class); -+ if (bch_chardev_major > 0) -+ unregister_chrdev(bch_chardev_major, "bcachefs"); -+} -+ -+int __init bch2_chardev_init(void) -+{ -+ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); -+ if (bch_chardev_major < 0) -+ return bch_chardev_major; -+ -+ bch_chardev_class = class_create("bcachefs"); -+ if (IS_ERR(bch_chardev_class)) -+ return PTR_ERR(bch_chardev_class); -+ -+ bch_chardev = device_create(bch_chardev_class, NULL, -+ MKDEV(bch_chardev_major, U8_MAX), -+ NULL, "bcachefs-ctl"); -+ if (IS_ERR(bch_chardev)) -+ return PTR_ERR(bch_chardev); -+ -+ return 0; -+} -+ -+#endif /* NO_BCACHEFS_CHARDEV */ -diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h -new file mode 100644 -index 000000000..0f563ca53 ---- /dev/null -+++ b/fs/bcachefs/chardev.h -@@ -0,0 +1,31 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CHARDEV_H -+#define _BCACHEFS_CHARDEV_H -+ -+#ifndef NO_BCACHEFS_FS -+ -+long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); -+ -+void bch2_fs_chardev_exit(struct bch_fs *); -+int bch2_fs_chardev_init(struct bch_fs *); -+ -+void bch2_chardev_exit(void); -+int __init bch2_chardev_init(void); -+ -+#else -+ -+static inline long bch2_fs_ioctl(struct bch_fs *c, -+ unsigned cmd, void __user * arg) -+{ -+ return -ENOTTY; -+} -+ 
-+static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} -+static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } -+ -+static inline void bch2_chardev_exit(void) {} -+static inline int __init bch2_chardev_init(void) { return 0; } -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+#endif /* _BCACHEFS_CHARDEV_H */ -diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c -new file mode 100644 -index 000000000..36939020f ---- /dev/null -+++ b/fs/bcachefs/checksum.c -@@ -0,0 +1,753 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "checksum.h" -+#include "errcode.h" -+#include "super.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * bch2_checksum state is an abstraction of the checksum state calculated over different pages. -+ * it features page merging without having the checksum algorithm lose its state. -+ * for native checksum aglorithms (like crc), a default seed value will do. -+ * for hash-like algorithms, a state needs to be stored -+ */ -+ -+struct bch2_checksum_state { -+ union { -+ u64 seed; -+ struct xxh64_state h64state; -+ }; -+ unsigned int type; -+}; -+ -+static void bch2_checksum_init(struct bch2_checksum_state *state) -+{ -+ switch (state->type) { -+ case BCH_CSUM_none: -+ case BCH_CSUM_crc32c: -+ case BCH_CSUM_crc64: -+ state->seed = 0; -+ break; -+ case BCH_CSUM_crc32c_nonzero: -+ state->seed = U32_MAX; -+ break; -+ case BCH_CSUM_crc64_nonzero: -+ state->seed = U64_MAX; -+ break; -+ case BCH_CSUM_xxhash: -+ xxh64_reset(&state->h64state, 0); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static u64 bch2_checksum_final(const struct bch2_checksum_state *state) -+{ -+ switch (state->type) { -+ case BCH_CSUM_none: -+ case BCH_CSUM_crc32c: -+ case BCH_CSUM_crc64: -+ return state->seed; -+ case BCH_CSUM_crc32c_nonzero: -+ return state->seed ^ U32_MAX; -+ case BCH_CSUM_crc64_nonzero: -+ return state->seed ^ U64_MAX; -+ case BCH_CSUM_xxhash: -+ return xxh64_digest(&state->h64state); -+ default: -+ BUG(); -+ } -+} -+ -+static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) -+{ -+ switch (state->type) { -+ case BCH_CSUM_none: -+ return; -+ case BCH_CSUM_crc32c_nonzero: -+ case BCH_CSUM_crc32c: -+ state->seed = crc32c(state->seed, data, len); -+ break; -+ case BCH_CSUM_crc64_nonzero: -+ case BCH_CSUM_crc64: -+ state->seed = crc64_be(state->seed, data, len); -+ break; -+ case BCH_CSUM_xxhash: -+ xxh64_update(&state->h64state, data, len); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, -+ struct nonce nonce, -+ struct scatterlist *sg, size_t len) -+{ -+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); -+ int ret; -+ -+ skcipher_request_set_sync_tfm(req, tfm); -+ skcipher_request_set_crypt(req, sg, sg, len, nonce.d); -+ -+ ret = crypto_skcipher_encrypt(req); -+ if (ret) -+ pr_err("got error %i from crypto_skcipher_encrypt()", ret); -+ -+ return ret; -+} -+ -+static inline int do_encrypt(struct crypto_sync_skcipher *tfm, -+ struct nonce nonce, -+ void *buf, size_t len) -+{ -+ if (!is_vmalloc_addr(buf)) { -+ struct scatterlist sg; -+ -+ sg_init_table(&sg, 1); -+ sg_set_page(&sg, -+ is_vmalloc_addr(buf) -+ ? 
vmalloc_to_page(buf) -+ : virt_to_page(buf), -+ len, offset_in_page(buf)); -+ return do_encrypt_sg(tfm, nonce, &sg, len); -+ } else { -+ unsigned pages = buf_pages(buf, len); -+ struct scatterlist *sg; -+ size_t orig_len = len; -+ int ret, i; -+ -+ sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); -+ if (!sg) -+ return -BCH_ERR_ENOMEM_do_encrypt; -+ -+ sg_init_table(sg, pages); -+ -+ for (i = 0; i < pages; i++) { -+ unsigned offset = offset_in_page(buf); -+ unsigned pg_len = min(len, PAGE_SIZE - offset); -+ -+ sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); -+ buf += pg_len; -+ len -= pg_len; -+ } -+ -+ ret = do_encrypt_sg(tfm, nonce, sg, orig_len); -+ kfree(sg); -+ return ret; -+ } -+} -+ -+int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, -+ void *buf, size_t len) -+{ -+ struct crypto_sync_skcipher *chacha20 = -+ crypto_alloc_sync_skcipher("chacha20", 0, 0); -+ int ret; -+ -+ if (!chacha20) { -+ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); -+ return PTR_ERR(chacha20); -+ } -+ -+ ret = crypto_skcipher_setkey(&chacha20->base, -+ (void *) key, sizeof(*key)); -+ if (ret) { -+ pr_err("crypto_skcipher_setkey() error: %i", ret); -+ goto err; -+ } -+ -+ ret = do_encrypt(chacha20, nonce, buf, len); -+err: -+ crypto_free_sync_skcipher(chacha20); -+ return ret; -+} -+ -+static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, -+ struct nonce nonce) -+{ -+ u8 key[POLY1305_KEY_SIZE]; -+ int ret; -+ -+ nonce.d[3] ^= BCH_NONCE_POLY; -+ -+ memset(key, 0, sizeof(key)); -+ ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); -+ if (ret) -+ return ret; -+ -+ desc->tfm = c->poly1305; -+ crypto_shash_init(desc); -+ crypto_shash_update(desc, key, sizeof(key)); -+ return 0; -+} -+ -+struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, -+ struct nonce nonce, const void *data, size_t len) -+{ -+ switch (type) { -+ case BCH_CSUM_none: -+ case BCH_CSUM_crc32c_nonzero: -+ case BCH_CSUM_crc64_nonzero: -+ case BCH_CSUM_crc32c: -+ case BCH_CSUM_xxhash: -+ case BCH_CSUM_crc64: { -+ struct bch2_checksum_state state; -+ -+ state.type = type; -+ -+ bch2_checksum_init(&state); -+ bch2_checksum_update(&state, data, len); -+ -+ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; -+ } -+ -+ case BCH_CSUM_chacha20_poly1305_80: -+ case BCH_CSUM_chacha20_poly1305_128: { -+ SHASH_DESC_ON_STACK(desc, c->poly1305); -+ u8 digest[POLY1305_DIGEST_SIZE]; -+ struct bch_csum ret = { 0 }; -+ -+ gen_poly_key(c, desc, nonce); -+ -+ crypto_shash_update(desc, data, len); -+ crypto_shash_final(desc, digest); -+ -+ memcpy(&ret, digest, bch_crc_bytes[type]); -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+int bch2_encrypt(struct bch_fs *c, unsigned type, -+ struct nonce nonce, void *data, size_t len) -+{ -+ if (!bch2_csum_type_is_encryption(type)) -+ return 0; -+ -+ return do_encrypt(c->chacha20, nonce, data, len); -+} -+ -+static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio, -+ struct bvec_iter *iter) -+{ -+ struct bio_vec bv; -+ -+ switch (type) { -+ case BCH_CSUM_none: -+ return (struct bch_csum) { 0 }; -+ case BCH_CSUM_crc32c_nonzero: -+ case BCH_CSUM_crc64_nonzero: -+ case BCH_CSUM_crc32c: -+ case BCH_CSUM_xxhash: -+ case BCH_CSUM_crc64: { -+ struct bch2_checksum_state state; -+ -+ state.type = type; -+ bch2_checksum_init(&state); -+ -+#ifdef CONFIG_HIGHMEM -+ __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; -+ -+ 
bch2_checksum_update(&state, p, bv.bv_len); -+ kunmap_local(p); -+ } -+#else -+ __bio_for_each_bvec(bv, bio, *iter, *iter) -+ bch2_checksum_update(&state, page_address(bv.bv_page) + bv.bv_offset, -+ bv.bv_len); -+#endif -+ return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; -+ } -+ -+ case BCH_CSUM_chacha20_poly1305_80: -+ case BCH_CSUM_chacha20_poly1305_128: { -+ SHASH_DESC_ON_STACK(desc, c->poly1305); -+ u8 digest[POLY1305_DIGEST_SIZE]; -+ struct bch_csum ret = { 0 }; -+ -+ gen_poly_key(c, desc, nonce); -+ -+#ifdef CONFIG_HIGHMEM -+ __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; -+ -+ crypto_shash_update(desc, p, bv.bv_len); -+ kunmap_local(p); -+ } -+#else -+ __bio_for_each_bvec(bv, bio, *iter, *iter) -+ crypto_shash_update(desc, -+ page_address(bv.bv_page) + bv.bv_offset, -+ bv.bv_len); -+#endif -+ crypto_shash_final(desc, digest); -+ -+ memcpy(&ret, digest, bch_crc_bytes[type]); -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio) -+{ -+ struct bvec_iter iter = bio->bi_iter; -+ -+ return __bch2_checksum_bio(c, type, nonce, bio, &iter); -+} -+ -+int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ struct scatterlist sgl[16], *sg = sgl; -+ size_t bytes = 0; -+ int ret = 0; -+ -+ if (!bch2_csum_type_is_encryption(type)) -+ return 0; -+ -+ sg_init_table(sgl, ARRAY_SIZE(sgl)); -+ -+ bio_for_each_segment(bv, bio, iter) { -+ if (sg == sgl + ARRAY_SIZE(sgl)) { -+ sg_mark_end(sg - 1); -+ -+ ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); -+ if (ret) -+ return ret; -+ -+ nonce = nonce_add(nonce, bytes); -+ bytes = 0; -+ -+ sg_init_table(sgl, ARRAY_SIZE(sgl)); -+ sg = sgl; -+ } -+ -+ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); -+ bytes += bv.bv_len; -+ } -+ -+ sg_mark_end(sg - 1); -+ return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); -+} -+ -+struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, -+ struct bch_csum b, size_t b_len) -+{ -+ struct bch2_checksum_state state; -+ -+ state.type = type; -+ bch2_checksum_init(&state); -+ state.seed = (u64 __force) a.lo; -+ -+ BUG_ON(!bch2_checksum_mergeable(type)); -+ -+ while (b_len) { -+ unsigned b = min_t(unsigned, b_len, PAGE_SIZE); -+ -+ bch2_checksum_update(&state, -+ page_address(ZERO_PAGE(0)), b); -+ b_len -= b; -+ } -+ a.lo = (__le64 __force) bch2_checksum_final(&state); -+ a.lo ^= b.lo; -+ a.hi ^= b.hi; -+ return a; -+} -+ -+int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, -+ struct bversion version, -+ struct bch_extent_crc_unpacked crc_old, -+ struct bch_extent_crc_unpacked *crc_a, -+ struct bch_extent_crc_unpacked *crc_b, -+ unsigned len_a, unsigned len_b, -+ unsigned new_csum_type) -+{ -+ struct bvec_iter iter = bio->bi_iter; -+ struct nonce nonce = extent_nonce(version, crc_old); -+ struct bch_csum merged = { 0 }; -+ struct crc_split { -+ struct bch_extent_crc_unpacked *crc; -+ unsigned len; -+ unsigned csum_type; -+ struct bch_csum csum; -+ } splits[3] = { -+ { crc_a, len_a, new_csum_type }, -+ { crc_b, len_b, new_csum_type }, -+ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, -+ }, *i; -+ bool mergeable = crc_old.csum_type == new_csum_type && -+ bch2_checksum_mergeable(new_csum_type); -+ unsigned crc_nonce = crc_old.nonce; -+ -+ BUG_ON(len_a + len_b > bio_sectors(bio)); -+ 
BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); -+ BUG_ON(crc_is_compressed(crc_old)); -+ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != -+ bch2_csum_type_is_encryption(new_csum_type)); -+ -+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { -+ iter.bi_size = i->len << 9; -+ if (mergeable || i->crc) -+ i->csum = __bch2_checksum_bio(c, i->csum_type, -+ nonce, bio, &iter); -+ else -+ bio_advance_iter(bio, &iter, i->len << 9); -+ nonce = nonce_add(nonce, i->len << 9); -+ } -+ -+ if (mergeable) -+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) -+ merged = bch2_checksum_merge(new_csum_type, merged, -+ i->csum, i->len << 9); -+ else -+ merged = bch2_checksum_bio(c, crc_old.csum_type, -+ extent_nonce(version, crc_old), bio); -+ -+ if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) { -+ bch_err(c, "checksum error in %s() (memory corruption or bug?)\n" -+ "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", -+ __func__, -+ crc_old.csum.hi, -+ crc_old.csum.lo, -+ merged.hi, -+ merged.lo, -+ bch2_csum_types[crc_old.csum_type], -+ bch2_csum_types[new_csum_type]); -+ return -EIO; -+ } -+ -+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { -+ if (i->crc) -+ *i->crc = (struct bch_extent_crc_unpacked) { -+ .csum_type = i->csum_type, -+ .compression_type = crc_old.compression_type, -+ .compressed_size = i->len, -+ .uncompressed_size = i->len, -+ .offset = 0, -+ .live_size = i->len, -+ .nonce = crc_nonce, -+ .csum = i->csum, -+ }; -+ -+ if (bch2_csum_type_is_encryption(new_csum_type)) -+ crc_nonce += i->len; -+ } -+ -+ return 0; -+} -+ -+/* BCH_SB_FIELD_crypt: */ -+ -+static int bch2_sb_crypt_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); -+ -+ if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { -+ prt_printf(err, "wrong size (got %zu should be %zu)", -+ vstruct_bytes(&crypt->field), sizeof(*crypt)); -+ return -BCH_ERR_invalid_sb_crypt; -+ } -+ -+ if (BCH_CRYPT_KDF_TYPE(crypt)) { -+ prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); -+ return -BCH_ERR_invalid_sb_crypt; -+ } -+ -+ return 0; -+} -+ -+static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); -+ -+ prt_printf(out, "KFD: %llu", BCH_CRYPT_KDF_TYPE(crypt)); -+ prt_newline(out); -+ prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); -+ prt_newline(out); -+ prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); -+ prt_newline(out); -+ prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); -+ prt_newline(out); -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_crypt = { -+ .validate = bch2_sb_crypt_validate, -+ .to_text = bch2_sb_crypt_to_text, -+}; -+ -+#ifdef __KERNEL__ -+static int __bch2_request_key(char *key_description, struct bch_key *key) -+{ -+ struct key *keyring_key; -+ const struct user_key_payload *ukp; -+ int ret; -+ -+ keyring_key = request_key(&key_type_user, key_description, NULL); -+ if (IS_ERR(keyring_key)) -+ return PTR_ERR(keyring_key); -+ -+ down_read(&keyring_key->sem); -+ ukp = dereference_key_locked(keyring_key); -+ if (ukp->datalen == sizeof(*key)) { -+ memcpy(key, ukp->data, ukp->datalen); -+ ret = 0; -+ } else { -+ ret = -EINVAL; -+ } -+ up_read(&keyring_key->sem); -+ key_put(keyring_key); -+ -+ return ret; -+} -+#else -+#include -+ -+static int __bch2_request_key(char *key_description, struct bch_key *key) -+{ -+ 
key_serial_t key_id; -+ -+ key_id = request_key("user", key_description, NULL, -+ KEY_SPEC_USER_KEYRING); -+ if (key_id < 0) -+ return -errno; -+ -+ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) -+ return -1; -+ -+ return 0; -+} -+#endif -+ -+int bch2_request_key(struct bch_sb *sb, struct bch_key *key) -+{ -+ struct printbuf key_description = PRINTBUF; -+ int ret; -+ -+ prt_printf(&key_description, "bcachefs:"); -+ pr_uuid(&key_description, sb->user_uuid.b); -+ -+ ret = __bch2_request_key(key_description.buf, key); -+ printbuf_exit(&key_description); -+ return ret; -+} -+ -+int bch2_decrypt_sb_key(struct bch_fs *c, -+ struct bch_sb_field_crypt *crypt, -+ struct bch_key *key) -+{ -+ struct bch_encrypted_key sb_key = crypt->key; -+ struct bch_key user_key; -+ int ret = 0; -+ -+ /* is key encrypted? */ -+ if (!bch2_key_is_encrypted(&sb_key)) -+ goto out; -+ -+ ret = bch2_request_key(c->disk_sb.sb, &user_key); -+ if (ret) { -+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ /* decrypt real key: */ -+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), -+ &sb_key, sizeof(sb_key)); -+ if (ret) -+ goto err; -+ -+ if (bch2_key_is_encrypted(&sb_key)) { -+ bch_err(c, "incorrect encryption key"); -+ ret = -EINVAL; -+ goto err; -+ } -+out: -+ *key = sb_key.key; -+err: -+ memzero_explicit(&sb_key, sizeof(sb_key)); -+ memzero_explicit(&user_key, sizeof(user_key)); -+ return ret; -+} -+ -+static int bch2_alloc_ciphers(struct bch_fs *c) -+{ -+ int ret; -+ -+ if (!c->chacha20) -+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); -+ ret = PTR_ERR_OR_ZERO(c->chacha20); -+ -+ if (ret) { -+ bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); -+ return ret; -+ } -+ -+ if (!c->poly1305) -+ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); -+ ret = PTR_ERR_OR_ZERO(c->poly1305); -+ -+ if (ret) { -+ bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+int bch2_disable_encryption(struct bch_fs *c) -+{ -+ struct bch_sb_field_crypt *crypt; -+ struct bch_key key; -+ int ret = -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ crypt = bch2_sb_get_crypt(c->disk_sb.sb); -+ if (!crypt) -+ goto out; -+ -+ /* is key encrypted? */ -+ ret = 0; -+ if (bch2_key_is_encrypted(&crypt->key)) -+ goto out; -+ -+ ret = bch2_decrypt_sb_key(c, crypt, &key); -+ if (ret) -+ goto out; -+ -+ crypt->key.magic = cpu_to_le64(BCH_KEY_MAGIC); -+ crypt->key.key = key; -+ -+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); -+ bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_enable_encryption(struct bch_fs *c, bool keyed) -+{ -+ struct bch_encrypted_key key; -+ struct bch_key user_key; -+ struct bch_sb_field_crypt *crypt; -+ int ret = -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ /* Do we already have an encryption key? 
*/ -+ if (bch2_sb_get_crypt(c->disk_sb.sb)) -+ goto err; -+ -+ ret = bch2_alloc_ciphers(c); -+ if (ret) -+ goto err; -+ -+ key.magic = cpu_to_le64(BCH_KEY_MAGIC); -+ get_random_bytes(&key.key, sizeof(key.key)); -+ -+ if (keyed) { -+ ret = bch2_request_key(c->disk_sb.sb, &user_key); -+ if (ret) { -+ bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), -+ &key, sizeof(key)); -+ if (ret) -+ goto err; -+ } -+ -+ ret = crypto_skcipher_setkey(&c->chacha20->base, -+ (void *) &key.key, sizeof(key.key)); -+ if (ret) -+ goto err; -+ -+ crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); -+ if (!crypt) { -+ ret = -BCH_ERR_ENOSPC_sb_crypt; -+ goto err; -+ } -+ -+ crypt->key = key; -+ -+ /* write superblock */ -+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); -+ bch2_write_super(c); -+err: -+ mutex_unlock(&c->sb_lock); -+ memzero_explicit(&user_key, sizeof(user_key)); -+ memzero_explicit(&key, sizeof(key)); -+ return ret; -+} -+ -+void bch2_fs_encryption_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->poly1305)) -+ crypto_free_shash(c->poly1305); -+ if (!IS_ERR_OR_NULL(c->chacha20)) -+ crypto_free_sync_skcipher(c->chacha20); -+ if (!IS_ERR_OR_NULL(c->sha256)) -+ crypto_free_shash(c->sha256); -+} -+ -+int bch2_fs_encryption_init(struct bch_fs *c) -+{ -+ struct bch_sb_field_crypt *crypt; -+ struct bch_key key; -+ int ret = 0; -+ -+ c->sha256 = crypto_alloc_shash("sha256", 0, 0); -+ ret = PTR_ERR_OR_ZERO(c->sha256); -+ if (ret) { -+ bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); -+ goto out; -+ } -+ -+ crypt = bch2_sb_get_crypt(c->disk_sb.sb); -+ if (!crypt) -+ goto out; -+ -+ ret = bch2_alloc_ciphers(c); -+ if (ret) -+ goto out; -+ -+ ret = bch2_decrypt_sb_key(c, crypt, &key); -+ if (ret) -+ goto out; -+ -+ ret = crypto_skcipher_setkey(&c->chacha20->base, -+ (void *) &key.key, sizeof(key.key)); -+ if (ret) -+ goto out; -+out: -+ memzero_explicit(&key, sizeof(key)); -+ return ret; -+} -diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h -new file mode 100644 -index 000000000..c7b1a8fca ---- /dev/null -+++ b/fs/bcachefs/checksum.h -@@ -0,0 +1,211 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CHECKSUM_H -+#define _BCACHEFS_CHECKSUM_H -+ -+#include "bcachefs.h" -+#include "extents_types.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+static inline bool bch2_checksum_mergeable(unsigned type) -+{ -+ -+ switch (type) { -+ case BCH_CSUM_none: -+ case BCH_CSUM_crc32c: -+ case BCH_CSUM_crc64: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, -+ struct bch_csum, size_t); -+ -+#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) -+#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) -+#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) -+#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) -+#define BCH_NONCE_POLY cpu_to_le32(1 << 31) -+ -+struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, -+ const void *, size_t); -+ -+/* -+ * This is used for various on disk data structures - bch_sb, prio_set, bset, -+ * jset: The checksum is _always_ the first field of these structs -+ */ -+#define csum_vstruct(_c, _type, _nonce, _i) \ -+({ \ -+ const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ -+ const void *end = vstruct_end(_i); \ -+ \ -+ bch2_checksum(_c, _type, _nonce, start, end - start); \ -+}) -+ -+int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, 
void *, size_t); -+int bch2_request_key(struct bch_sb *, struct bch_key *); -+ -+int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, -+ void *data, size_t); -+ -+struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); -+ -+int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, -+ struct bch_extent_crc_unpacked, -+ struct bch_extent_crc_unpacked *, -+ struct bch_extent_crc_unpacked *, -+ unsigned, unsigned, unsigned); -+ -+int __bch2_encrypt_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); -+ -+static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio) -+{ -+ return bch2_csum_type_is_encryption(type) -+ ? __bch2_encrypt_bio(c, type, nonce, bio) -+ : 0; -+} -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; -+ -+int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, -+ struct bch_key *); -+ -+int bch2_disable_encryption(struct bch_fs *); -+int bch2_enable_encryption(struct bch_fs *, bool); -+ -+void bch2_fs_encryption_exit(struct bch_fs *); -+int bch2_fs_encryption_init(struct bch_fs *); -+ -+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, -+ bool data) -+{ -+ switch (type) { -+ case BCH_CSUM_OPT_none: -+ return BCH_CSUM_none; -+ case BCH_CSUM_OPT_crc32c: -+ return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; -+ case BCH_CSUM_OPT_crc64: -+ return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; -+ case BCH_CSUM_OPT_xxhash: -+ return BCH_CSUM_xxhash; -+ default: -+ BUG(); -+ } -+} -+ -+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, -+ struct bch_io_opts opts) -+{ -+ if (opts.nocow) -+ return 0; -+ -+ if (c->sb.encryption_type) -+ return c->opts.wide_macs -+ ? BCH_CSUM_chacha20_poly1305_128 -+ : BCH_CSUM_chacha20_poly1305_80; -+ -+ return bch2_csum_opt_to_type(opts.data_checksum, true); -+} -+ -+static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) -+{ -+ if (c->sb.encryption_type) -+ return BCH_CSUM_chacha20_poly1305_128; -+ -+ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); -+} -+ -+static inline bool bch2_checksum_type_valid(const struct bch_fs *c, -+ unsigned type) -+{ -+ if (type >= BCH_CSUM_NR) -+ return false; -+ -+ if (bch2_csum_type_is_encryption(type) && !c->chacha20) -+ return false; -+ -+ return true; -+} -+ -+/* returns true if not equal */ -+static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) -+{ -+ /* -+ * XXX: need some way of preventing the compiler from optimizing this -+ * into a form that isn't constant time.. -+ */ -+ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; -+} -+ -+/* for skipping ahead and encrypting/decrypting at an offset: */ -+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) -+{ -+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); -+ -+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); -+ return nonce; -+} -+ -+static inline struct nonce null_nonce(void) -+{ -+ struct nonce ret; -+ -+ memset(&ret, 0, sizeof(ret)); -+ return ret; -+} -+ -+static inline struct nonce extent_nonce(struct bversion version, -+ struct bch_extent_crc_unpacked crc) -+{ -+ unsigned compression_type = crc_is_compressed(crc) -+ ? crc.compression_type -+ : 0; -+ unsigned size = compression_type ? 
crc.uncompressed_size : 0; -+ struct nonce nonce = (struct nonce) {{ -+ [0] = cpu_to_le32(size << 22), -+ [1] = cpu_to_le32(version.lo), -+ [2] = cpu_to_le32(version.lo >> 32), -+ [3] = cpu_to_le32(version.hi| -+ (compression_type << 24))^BCH_NONCE_EXTENT, -+ }}; -+ -+ return nonce_add(nonce, crc.nonce << 9); -+} -+ -+static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) -+{ -+ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; -+} -+ -+static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) -+{ -+ __le64 magic = __bch2_sb_magic(sb); -+ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = 0, -+ [2] = ((__le32 *) &magic)[0], -+ [3] = ((__le32 *) &magic)[1], -+ }}; -+} -+ -+static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) -+{ -+ __le64 magic = bch2_sb_magic(c); -+ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = 0, -+ [2] = ((__le32 *) &magic)[0], -+ [3] = ((__le32 *) &magic)[1], -+ }}; -+} -+ -+#endif /* _BCACHEFS_CHECKSUM_H */ -diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c -new file mode 100644 -index 000000000..f41889093 ---- /dev/null -+++ b/fs/bcachefs/clock.c -@@ -0,0 +1,193 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "clock.h" -+ -+#include -+#include -+#include -+ -+static inline long io_timer_cmp(io_timer_heap *h, -+ struct io_timer *l, -+ struct io_timer *r) -+{ -+ return l->expire - r->expire; -+} -+ -+void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) -+{ -+ size_t i; -+ -+ spin_lock(&clock->timer_lock); -+ -+ if (time_after_eq((unsigned long) atomic64_read(&clock->now), -+ timer->expire)) { -+ spin_unlock(&clock->timer_lock); -+ timer->fn(timer); -+ return; -+ } -+ -+ for (i = 0; i < clock->timers.used; i++) -+ if (clock->timers.data[i] == timer) -+ goto out; -+ -+ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); -+out: -+ spin_unlock(&clock->timer_lock); -+} -+ -+void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) -+{ -+ size_t i; -+ -+ spin_lock(&clock->timer_lock); -+ -+ for (i = 0; i < clock->timers.used; i++) -+ if (clock->timers.data[i] == timer) { -+ heap_del(&clock->timers, i, io_timer_cmp, NULL); -+ break; -+ } -+ -+ spin_unlock(&clock->timer_lock); -+} -+ -+struct io_clock_wait { -+ struct io_timer io_timer; -+ struct timer_list cpu_timer; -+ struct task_struct *task; -+ int expired; -+}; -+ -+static void io_clock_wait_fn(struct io_timer *timer) -+{ -+ struct io_clock_wait *wait = container_of(timer, -+ struct io_clock_wait, io_timer); -+ -+ wait->expired = 1; -+ wake_up_process(wait->task); -+} -+ -+static void io_clock_cpu_timeout(struct timer_list *timer) -+{ -+ struct io_clock_wait *wait = container_of(timer, -+ struct io_clock_wait, cpu_timer); -+ -+ wait->expired = 1; -+ wake_up_process(wait->task); -+} -+ -+void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) -+{ -+ struct io_clock_wait wait; -+ -+ /* XXX: calculate sleep time rigorously */ -+ wait.io_timer.expire = until; -+ wait.io_timer.fn = io_clock_wait_fn; -+ wait.task = current; -+ wait.expired = 0; -+ bch2_io_timer_add(clock, &wait.io_timer); -+ -+ schedule(); -+ -+ bch2_io_timer_del(clock, &wait.io_timer); -+} -+ -+void bch2_kthread_io_clock_wait(struct io_clock *clock, -+ unsigned long io_until, -+ unsigned long cpu_timeout) -+{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ struct io_clock_wait wait; -+ -+ wait.io_timer.expire = io_until; -+ wait.io_timer.fn = io_clock_wait_fn; -+ wait.task = current; -+ wait.expired = 0; -+ 
bch2_io_timer_add(clock, &wait.io_timer); -+ -+ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); -+ -+ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) -+ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); -+ -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ if (kthread && kthread_should_stop()) -+ break; -+ -+ if (wait.expired) -+ break; -+ -+ schedule(); -+ try_to_freeze(); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+ del_timer_sync(&wait.cpu_timer); -+ destroy_timer_on_stack(&wait.cpu_timer); -+ bch2_io_timer_del(clock, &wait.io_timer); -+} -+ -+static struct io_timer *get_expired_timer(struct io_clock *clock, -+ unsigned long now) -+{ -+ struct io_timer *ret = NULL; -+ -+ spin_lock(&clock->timer_lock); -+ -+ if (clock->timers.used && -+ time_after_eq(now, clock->timers.data[0]->expire)) -+ heap_pop(&clock->timers, ret, io_timer_cmp, NULL); -+ -+ spin_unlock(&clock->timer_lock); -+ -+ return ret; -+} -+ -+void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) -+{ -+ struct io_timer *timer; -+ unsigned long now = atomic64_add_return(sectors, &clock->now); -+ -+ while ((timer = get_expired_timer(clock, now))) -+ timer->fn(timer); -+} -+ -+void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) -+{ -+ unsigned long now; -+ unsigned i; -+ -+ out->atomic++; -+ spin_lock(&clock->timer_lock); -+ now = atomic64_read(&clock->now); -+ -+ for (i = 0; i < clock->timers.used; i++) -+ prt_printf(out, "%ps:\t%li\n", -+ clock->timers.data[i]->fn, -+ clock->timers.data[i]->expire - now); -+ spin_unlock(&clock->timer_lock); -+ --out->atomic; -+} -+ -+void bch2_io_clock_exit(struct io_clock *clock) -+{ -+ free_heap(&clock->timers); -+ free_percpu(clock->pcpu_buf); -+} -+ -+int bch2_io_clock_init(struct io_clock *clock) -+{ -+ atomic64_set(&clock->now, 0); -+ spin_lock_init(&clock->timer_lock); -+ -+ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); -+ -+ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); -+ if (!clock->pcpu_buf) -+ return -BCH_ERR_ENOMEM_io_clock_init; -+ -+ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) -+ return -BCH_ERR_ENOMEM_io_clock_init; -+ -+ return 0; -+} -diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h -new file mode 100644 -index 000000000..70a0f7436 ---- /dev/null -+++ b/fs/bcachefs/clock.h -@@ -0,0 +1,38 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CLOCK_H -+#define _BCACHEFS_CLOCK_H -+ -+void bch2_io_timer_add(struct io_clock *, struct io_timer *); -+void bch2_io_timer_del(struct io_clock *, struct io_timer *); -+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, -+ unsigned long); -+ -+void __bch2_increment_clock(struct io_clock *, unsigned); -+ -+static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, -+ int rw) -+{ -+ struct io_clock *clock = &c->io_clock[rw]; -+ -+ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= -+ IO_CLOCK_PCPU_SECTORS)) -+ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); -+} -+ -+void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); -+ -+#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ -+({ \ -+ long __ret = timeout; \ -+ might_sleep(); \ -+ if (!___wait_cond_timeout(condition)) \ -+ __ret = __wait_event_timeout(wq, condition, timeout); \ -+ __ret; \ -+}) -+ -+void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); -+ -+void bch2_io_clock_exit(struct io_clock *); -+int bch2_io_clock_init(struct io_clock *); -+ -+#endif /* 
_BCACHEFS_CLOCK_H */ -diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h -new file mode 100644 -index 000000000..5fae0012d ---- /dev/null -+++ b/fs/bcachefs/clock_types.h -@@ -0,0 +1,37 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CLOCK_TYPES_H -+#define _BCACHEFS_CLOCK_TYPES_H -+ -+#include "util.h" -+ -+#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) -+ -+/* -+ * Clocks/timers in units of sectors of IO: -+ * -+ * Note - they use percpu batching, so they're only approximate. -+ */ -+ -+struct io_timer; -+typedef void (*io_timer_fn)(struct io_timer *); -+ -+struct io_timer { -+ io_timer_fn fn; -+ unsigned long expire; -+}; -+ -+/* Amount to buffer up on a percpu counter */ -+#define IO_CLOCK_PCPU_SECTORS 128 -+ -+typedef HEAP(struct io_timer *) io_timer_heap; -+ -+struct io_clock { -+ atomic64_t now; -+ u16 __percpu *pcpu_buf; -+ unsigned max_slop; -+ -+ spinlock_t timer_lock; -+ io_timer_heap timers; -+}; -+ -+#endif /* _BCACHEFS_CLOCK_TYPES_H */ -diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c -new file mode 100644 -index 000000000..6b17f7cc5 ---- /dev/null -+++ b/fs/bcachefs/compress.c -@@ -0,0 +1,714 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "checksum.h" -+#include "compress.h" -+#include "extents.h" -+#include "io.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+ -+/* Bounce buffer: */ -+struct bbuf { -+ void *b; -+ enum { -+ BB_NONE, -+ BB_VMAP, -+ BB_KMALLOC, -+ BB_MEMPOOL, -+ } type; -+ int rw; -+}; -+ -+static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) -+{ -+ void *b; -+ -+ BUG_ON(size > c->opts.encoded_extent_max); -+ -+ b = kmalloc(size, GFP_NOFS|__GFP_NOWARN); -+ if (b) -+ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; -+ -+ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS); -+ if (b) -+ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; -+ -+ BUG(); -+} -+ -+static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ void *expected_start = NULL; -+ -+ __bio_for_each_bvec(bv, bio, iter, start) { -+ if (expected_start && -+ expected_start != page_address(bv.bv_page) + bv.bv_offset) -+ return false; -+ -+ expected_start = page_address(bv.bv_page) + -+ bv.bv_offset + bv.bv_len; -+ } -+ -+ return true; -+} -+ -+static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, -+ struct bvec_iter start, int rw) -+{ -+ struct bbuf ret; -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ unsigned nr_pages = 0; -+ struct page *stack_pages[16]; -+ struct page **pages = NULL; -+ void *data; -+ -+ BUG_ON(start.bi_size > c->opts.encoded_extent_max); -+ -+ if (!PageHighMem(bio_iter_page(bio, start)) && -+ bio_phys_contig(bio, start)) -+ return (struct bbuf) { -+ .b = page_address(bio_iter_page(bio, start)) + -+ bio_iter_offset(bio, start), -+ .type = BB_NONE, .rw = rw -+ }; -+ -+ /* check if we can map the pages contiguously: */ -+ __bio_for_each_segment(bv, bio, iter, start) { -+ if (iter.bi_size != start.bi_size && -+ bv.bv_offset) -+ goto bounce; -+ -+ if (bv.bv_len < iter.bi_size && -+ bv.bv_offset + bv.bv_len < PAGE_SIZE) -+ goto bounce; -+ -+ nr_pages++; -+ } -+ -+ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); -+ -+ pages = nr_pages > ARRAY_SIZE(stack_pages) -+ ? 
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS) -+ : stack_pages; -+ if (!pages) -+ goto bounce; -+ -+ nr_pages = 0; -+ __bio_for_each_segment(bv, bio, iter, start) -+ pages[nr_pages++] = bv.bv_page; -+ -+ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); -+ if (pages != stack_pages) -+ kfree(pages); -+ -+ if (data) -+ return (struct bbuf) { -+ .b = data + bio_iter_offset(bio, start), -+ .type = BB_VMAP, .rw = rw -+ }; -+bounce: -+ ret = __bounce_alloc(c, start.bi_size, rw); -+ -+ if (rw == READ) -+ memcpy_from_bio(ret.b, bio, start); -+ -+ return ret; -+} -+ -+static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) -+{ -+ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); -+} -+ -+static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) -+{ -+ switch (buf.type) { -+ case BB_NONE: -+ break; -+ case BB_VMAP: -+ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); -+ break; -+ case BB_KMALLOC: -+ kfree(buf.b); -+ break; -+ case BB_MEMPOOL: -+ mempool_free(buf.b, &c->compression_bounce[buf.rw]); -+ break; -+ } -+} -+ -+static inline void zlib_set_workspace(z_stream *strm, void *workspace) -+{ -+#ifdef __KERNEL__ -+ strm->workspace = workspace; -+#endif -+} -+ -+static int __bio_uncompress(struct bch_fs *c, struct bio *src, -+ void *dst_data, struct bch_extent_crc_unpacked crc) -+{ -+ struct bbuf src_data = { NULL }; -+ size_t src_len = src->bi_iter.bi_size; -+ size_t dst_len = crc.uncompressed_size << 9; -+ void *workspace; -+ int ret; -+ -+ src_data = bio_map_or_bounce(c, src, READ); -+ -+ switch (crc.compression_type) { -+ case BCH_COMPRESSION_TYPE_lz4_old: -+ case BCH_COMPRESSION_TYPE_lz4: -+ ret = LZ4_decompress_safe_partial(src_data.b, dst_data, -+ src_len, dst_len, dst_len); -+ if (ret != dst_len) -+ goto err; -+ break; -+ case BCH_COMPRESSION_TYPE_gzip: { -+ z_stream strm = { -+ .next_in = src_data.b, -+ .avail_in = src_len, -+ .next_out = dst_data, -+ .avail_out = dst_len, -+ }; -+ -+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); -+ -+ zlib_set_workspace(&strm, workspace); -+ zlib_inflateInit2(&strm, -MAX_WBITS); -+ ret = zlib_inflate(&strm, Z_FINISH); -+ -+ mempool_free(workspace, &c->decompress_workspace); -+ -+ if (ret != Z_STREAM_END) -+ goto err; -+ break; -+ } -+ case BCH_COMPRESSION_TYPE_zstd: { -+ ZSTD_DCtx *ctx; -+ size_t real_src_len = le32_to_cpup(src_data.b); -+ -+ if (real_src_len > src_len - 4) -+ goto err; -+ -+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); -+ ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); -+ -+ ret = zstd_decompress_dctx(ctx, -+ dst_data, dst_len, -+ src_data.b + 4, real_src_len); -+ -+ mempool_free(workspace, &c->decompress_workspace); -+ -+ if (ret != dst_len) -+ goto err; -+ break; -+ } -+ default: -+ BUG(); -+ } -+ ret = 0; -+out: -+ bio_unmap_or_unbounce(c, src_data); -+ return ret; -+err: -+ ret = -EIO; -+ goto out; -+} -+ -+int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, -+ struct bch_extent_crc_unpacked *crc) -+{ -+ struct bbuf data = { NULL }; -+ size_t dst_len = crc->uncompressed_size << 9; -+ -+ /* bio must own its pages: */ -+ BUG_ON(!bio->bi_vcnt); -+ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); -+ -+ if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || -+ crc->compressed_size << 9 > c->opts.encoded_extent_max) { -+ bch_err(c, "error rewriting existing data: extent too big"); -+ return -EIO; -+ } -+ -+ data = __bounce_alloc(c, dst_len, WRITE); -+ -+ if (__bio_uncompress(c, bio, data.b, 
*crc)) { -+ if (!c->opts.no_data_io) -+ bch_err(c, "error rewriting existing data: decompression error"); -+ bio_unmap_or_unbounce(c, data); -+ return -EIO; -+ } -+ -+ /* -+ * XXX: don't have a good way to assert that the bio was allocated with -+ * enough space, we depend on bch2_move_extent doing the right thing -+ */ -+ bio->bi_iter.bi_size = crc->live_size << 9; -+ -+ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); -+ -+ crc->csum_type = 0; -+ crc->compression_type = 0; -+ crc->compressed_size = crc->live_size; -+ crc->uncompressed_size = crc->live_size; -+ crc->offset = 0; -+ crc->csum = (struct bch_csum) { 0, 0 }; -+ -+ bio_unmap_or_unbounce(c, data); -+ return 0; -+} -+ -+int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, -+ struct bio *dst, struct bvec_iter dst_iter, -+ struct bch_extent_crc_unpacked crc) -+{ -+ struct bbuf dst_data = { NULL }; -+ size_t dst_len = crc.uncompressed_size << 9; -+ int ret; -+ -+ if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || -+ crc.compressed_size << 9 > c->opts.encoded_extent_max) -+ return -EIO; -+ -+ dst_data = dst_len == dst_iter.bi_size -+ ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) -+ : __bounce_alloc(c, dst_len, WRITE); -+ -+ ret = __bio_uncompress(c, src, dst_data.b, crc); -+ if (ret) -+ goto err; -+ -+ if (dst_data.type != BB_NONE && -+ dst_data.type != BB_VMAP) -+ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); -+err: -+ bio_unmap_or_unbounce(c, dst_data); -+ return ret; -+} -+ -+static int attempt_compress(struct bch_fs *c, -+ void *workspace, -+ void *dst, size_t dst_len, -+ void *src, size_t src_len, -+ struct bch_compression_opt compression) -+{ -+ enum bch_compression_type compression_type = -+ __bch2_compression_opt_to_type[compression.type]; -+ -+ switch (compression_type) { -+ case BCH_COMPRESSION_TYPE_lz4: -+ if (compression.level < LZ4HC_MIN_CLEVEL) { -+ int len = src_len; -+ int ret = LZ4_compress_destSize( -+ src, dst, -+ &len, dst_len, -+ workspace); -+ if (len < src_len) -+ return -len; -+ -+ return ret; -+ } else { -+ int ret = LZ4_compress_HC( -+ src, dst, -+ src_len, dst_len, -+ compression.level, -+ workspace); -+ -+ return ret ?: -1; -+ } -+ case BCH_COMPRESSION_TYPE_gzip: { -+ z_stream strm = { -+ .next_in = src, -+ .avail_in = src_len, -+ .next_out = dst, -+ .avail_out = dst_len, -+ }; -+ -+ zlib_set_workspace(&strm, workspace); -+ zlib_deflateInit2(&strm, -+ compression.level -+ ? clamp_t(unsigned, compression.level, -+ Z_BEST_SPEED, Z_BEST_COMPRESSION) -+ : Z_DEFAULT_COMPRESSION, -+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, -+ Z_DEFAULT_STRATEGY); -+ -+ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) -+ return 0; -+ -+ if (zlib_deflateEnd(&strm) != Z_OK) -+ return 0; -+ -+ return strm.total_out; -+ } -+ case BCH_COMPRESSION_TYPE_zstd: { -+ /* -+ * rescale: -+ * zstd max compression level is 22, our max level is 15 -+ */ -+ unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); -+ ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); -+ ZSTD_CCtx *ctx = zstd_init_cctx(workspace, -+ zstd_cctx_workspace_bound(¶ms.cParams)); -+ -+ /* -+ * ZSTD requires that when we decompress we pass in the exact -+ * compressed size - rounding it up to the nearest sector -+ * doesn't work, so we use the first 4 bytes of the buffer for -+ * that. 
-+ * -+ * Additionally, the ZSTD code seems to have a bug where it will -+ * write just past the end of the buffer - so subtract a fudge -+ * factor (7 bytes) from the dst buffer size to account for -+ * that. -+ */ -+ size_t len = zstd_compress_cctx(ctx, -+ dst + 4, dst_len - 4 - 7, -+ src, src_len, -+ &c->zstd_params); -+ if (zstd_is_error(len)) -+ return 0; -+ -+ *((__le32 *) dst) = cpu_to_le32(len); -+ return len + 4; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+static unsigned __bio_compress(struct bch_fs *c, -+ struct bio *dst, size_t *dst_len, -+ struct bio *src, size_t *src_len, -+ struct bch_compression_opt compression) -+{ -+ struct bbuf src_data = { NULL }, dst_data = { NULL }; -+ void *workspace; -+ enum bch_compression_type compression_type = -+ __bch2_compression_opt_to_type[compression.type]; -+ unsigned pad; -+ int ret = 0; -+ -+ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); -+ BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); -+ -+ /* If it's only one block, don't bother trying to compress: */ -+ if (src->bi_iter.bi_size <= c->opts.block_size) -+ return BCH_COMPRESSION_TYPE_incompressible; -+ -+ dst_data = bio_map_or_bounce(c, dst, WRITE); -+ src_data = bio_map_or_bounce(c, src, READ); -+ -+ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS); -+ -+ *src_len = src->bi_iter.bi_size; -+ *dst_len = dst->bi_iter.bi_size; -+ -+ /* -+ * XXX: this algorithm sucks when the compression code doesn't tell us -+ * how much would fit, like LZ4 does: -+ */ -+ while (1) { -+ if (*src_len <= block_bytes(c)) { -+ ret = -1; -+ break; -+ } -+ -+ ret = attempt_compress(c, workspace, -+ dst_data.b, *dst_len, -+ src_data.b, *src_len, -+ compression); -+ if (ret > 0) { -+ *dst_len = ret; -+ ret = 0; -+ break; -+ } -+ -+ /* Didn't fit: should we retry with a smaller amount? 
*/ -+ if (*src_len <= *dst_len) { -+ ret = -1; -+ break; -+ } -+ -+ /* -+ * If ret is negative, it's a hint as to how much data would fit -+ */ -+ BUG_ON(-ret >= *src_len); -+ -+ if (ret < 0) -+ *src_len = -ret; -+ else -+ *src_len -= (*src_len - *dst_len) / 2; -+ *src_len = round_down(*src_len, block_bytes(c)); -+ } -+ -+ mempool_free(workspace, &c->compress_workspace[compression_type]); -+ -+ if (ret) -+ goto err; -+ -+ /* Didn't get smaller: */ -+ if (round_up(*dst_len, block_bytes(c)) >= *src_len) -+ goto err; -+ -+ pad = round_up(*dst_len, block_bytes(c)) - *dst_len; -+ -+ memset(dst_data.b + *dst_len, 0, pad); -+ *dst_len += pad; -+ -+ if (dst_data.type != BB_NONE && -+ dst_data.type != BB_VMAP) -+ memcpy_to_bio(dst, dst->bi_iter, dst_data.b); -+ -+ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); -+ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); -+ BUG_ON(*dst_len & (block_bytes(c) - 1)); -+ BUG_ON(*src_len & (block_bytes(c) - 1)); -+ ret = compression_type; -+out: -+ bio_unmap_or_unbounce(c, src_data); -+ bio_unmap_or_unbounce(c, dst_data); -+ return ret; -+err: -+ ret = BCH_COMPRESSION_TYPE_incompressible; -+ goto out; -+} -+ -+unsigned bch2_bio_compress(struct bch_fs *c, -+ struct bio *dst, size_t *dst_len, -+ struct bio *src, size_t *src_len, -+ unsigned compression_opt) -+{ -+ unsigned orig_dst = dst->bi_iter.bi_size; -+ unsigned orig_src = src->bi_iter.bi_size; -+ unsigned compression_type; -+ -+ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ -+ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, -+ c->opts.encoded_extent_max); -+ /* Don't generate a bigger output than input: */ -+ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); -+ -+ compression_type = -+ __bio_compress(c, dst, dst_len, src, src_len, -+ bch2_compression_decode(compression_opt)); -+ -+ dst->bi_iter.bi_size = orig_dst; -+ src->bi_iter.bi_size = orig_src; -+ return compression_type; -+} -+ -+static int __bch2_fs_compress_init(struct bch_fs *, u64); -+ -+#define BCH_FEATURE_none 0 -+ -+static const unsigned bch2_compression_opt_to_feature[] = { -+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+}; -+ -+#undef BCH_FEATURE_none -+ -+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) -+{ -+ int ret = 0; -+ -+ if ((c->sb.features & f) == f) -+ return 0; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if ((c->sb.features & f) == f) { -+ mutex_unlock(&c->sb_lock); -+ return 0; -+ } -+ -+ ret = __bch2_fs_compress_init(c, c->sb.features|f); -+ if (ret) { -+ mutex_unlock(&c->sb_lock); -+ return ret; -+ } -+ -+ c->disk_sb.sb->features[0] |= cpu_to_le64(f); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+int bch2_check_set_has_compressed_data(struct bch_fs *c, -+ unsigned compression_opt) -+{ -+ unsigned compression_type = bch2_compression_decode(compression_opt).type; -+ -+ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); -+ -+ return compression_type -+ ? 
__bch2_check_set_has_compressed_data(c, -+ 1ULL << bch2_compression_opt_to_feature[compression_type]) -+ : 0; -+} -+ -+void bch2_fs_compress_exit(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ mempool_exit(&c->decompress_workspace); -+ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) -+ mempool_exit(&c->compress_workspace[i]); -+ mempool_exit(&c->compression_bounce[WRITE]); -+ mempool_exit(&c->compression_bounce[READ]); -+} -+ -+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) -+{ -+ size_t decompress_workspace_size = 0; -+ bool decompress_workspace_needed; -+ ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), -+ c->opts.encoded_extent_max); -+ struct { -+ unsigned feature; -+ enum bch_compression_type type; -+ size_t compress_workspace; -+ size_t decompress_workspace; -+ } compression_types[] = { -+ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, -+ max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, -+ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, -+ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), -+ zlib_inflate_workspacesize(), }, -+ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, -+ zstd_cctx_workspace_bound(¶ms.cParams), -+ zstd_dctx_workspace_bound() }, -+ }, *i; -+ bool have_compressed = false; -+ -+ c->zstd_params = params; -+ -+ for (i = compression_types; -+ i < compression_types + ARRAY_SIZE(compression_types); -+ i++) -+ have_compressed |= (features & (1 << i->feature)) != 0; -+ -+ if (!have_compressed) -+ return 0; -+ -+ if (!mempool_initialized(&c->compression_bounce[READ]) && -+ mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], -+ 1, c->opts.encoded_extent_max)) -+ return -BCH_ERR_ENOMEM_compression_bounce_read_init; -+ -+ if (!mempool_initialized(&c->compression_bounce[WRITE]) && -+ mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], -+ 1, c->opts.encoded_extent_max)) -+ return -BCH_ERR_ENOMEM_compression_bounce_write_init; -+ -+ for (i = compression_types; -+ i < compression_types + ARRAY_SIZE(compression_types); -+ i++) { -+ decompress_workspace_size = -+ max(decompress_workspace_size, i->decompress_workspace); -+ -+ if (!(features & (1 << i->feature))) -+ continue; -+ -+ if (i->decompress_workspace) -+ decompress_workspace_needed = true; -+ -+ if (mempool_initialized(&c->compress_workspace[i->type])) -+ continue; -+ -+ if (mempool_init_kvpmalloc_pool( -+ &c->compress_workspace[i->type], -+ 1, i->compress_workspace)) -+ return -BCH_ERR_ENOMEM_compression_workspace_init; -+ } -+ -+ if (!mempool_initialized(&c->decompress_workspace) && -+ mempool_init_kvpmalloc_pool(&c->decompress_workspace, -+ 1, decompress_workspace_size)) -+ return -BCH_ERR_ENOMEM_decompression_workspace_init; -+ -+ return 0; -+} -+ -+static u64 compression_opt_to_feature(unsigned v) -+{ -+ unsigned type = bch2_compression_decode(v).type; -+ -+ return BIT_ULL(bch2_compression_opt_to_feature[type]); -+} -+ -+int bch2_fs_compress_init(struct bch_fs *c) -+{ -+ u64 f = c->sb.features; -+ -+ f |= compression_opt_to_feature(c->opts.compression); -+ f |= compression_opt_to_feature(c->opts.background_compression); -+ -+ return __bch2_fs_compress_init(c, f); -+} -+ -+int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, -+ struct printbuf *err) -+{ -+ char *val = kstrdup(_val, GFP_KERNEL); -+ char *p = val, *type_str, *level_str; -+ struct bch_compression_opt opt = { 0 }; -+ int ret; -+ -+ if (!val) -+ return -ENOMEM; -+ -+ type_str = strsep(&p, ":"); -+ level_str = p; -+ -+ ret = match_string(bch2_compression_opts, -1, type_str); -+ if 
(ret < 0 && err) -+ prt_str(err, "invalid compression type"); -+ if (ret < 0) -+ goto err; -+ -+ opt.type = ret; -+ -+ if (level_str) { -+ unsigned level; -+ -+ ret = kstrtouint(level_str, 10, &level); -+ if (!ret && !opt.type && level) -+ ret = -EINVAL; -+ if (!ret && level > 15) -+ ret = -EINVAL; -+ if (ret < 0 && err) -+ prt_str(err, "invalid compression level"); -+ if (ret < 0) -+ goto err; -+ -+ opt.level = level; -+ } -+ -+ *res = bch2_compression_encode(opt); -+err: -+ kfree(val); -+ return ret; -+} -+ -+void bch2_opt_compression_to_text(struct printbuf *out, -+ struct bch_fs *c, -+ struct bch_sb *sb, -+ u64 v) -+{ -+ struct bch_compression_opt opt = bch2_compression_decode(v); -+ -+ prt_str(out, bch2_compression_opts[opt.type]); -+ if (opt.level) -+ prt_printf(out, ":%u", opt.level); -+} -diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h -new file mode 100644 -index 000000000..052ea3032 ---- /dev/null -+++ b/fs/bcachefs/compress.h -@@ -0,0 +1,55 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_COMPRESS_H -+#define _BCACHEFS_COMPRESS_H -+ -+#include "extents_types.h" -+ -+struct bch_compression_opt { -+ u8 type:4, -+ level:4; -+}; -+ -+static inline struct bch_compression_opt bch2_compression_decode(unsigned v) -+{ -+ return (struct bch_compression_opt) { -+ .type = v & 15, -+ .level = v >> 4, -+ }; -+} -+ -+static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) -+{ -+ return opt.type|(opt.level << 4); -+} -+ -+static const unsigned __bch2_compression_opt_to_type[] = { -+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+}; -+ -+static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) -+{ -+ return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; -+} -+ -+int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, -+ struct bch_extent_crc_unpacked *); -+int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, -+ struct bvec_iter, struct bch_extent_crc_unpacked); -+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, -+ struct bio *, size_t *, unsigned); -+ -+int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); -+void bch2_fs_compress_exit(struct bch_fs *); -+int bch2_fs_compress_init(struct bch_fs *); -+ -+int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); -+void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); -+ -+#define bch2_opt_compression (struct bch_opt_fn) { \ -+ .parse = bch2_opt_compression_parse, \ -+ .to_text = bch2_opt_compression_to_text, \ -+} -+ -+#endif /* _BCACHEFS_COMPRESS_H */ -diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c -new file mode 100644 -index 000000000..442a9b806 ---- /dev/null -+++ b/fs/bcachefs/counters.c -@@ -0,0 +1,107 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "super-io.h" -+#include "counters.h" -+ -+/* BCH_SB_FIELD_counters */ -+ -+static const char * const bch2_counter_names[] = { -+#define x(t, n, ...) 
(#t), -+ BCH_PERSISTENT_COUNTERS() -+#undef x -+ NULL -+}; -+ -+static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) -+{ -+ if (!ctrs) -+ return 0; -+ -+ return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; -+}; -+ -+static int bch2_sb_counters_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ return 0; -+}; -+ -+static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_counters *ctrs = field_to_type(f, counters); -+ unsigned int i; -+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs); -+ -+ for (i = 0; i < nr; i++) { -+ if (i < BCH_COUNTER_NR) -+ prt_printf(out, "%s ", bch2_counter_names[i]); -+ else -+ prt_printf(out, "(unknown)"); -+ -+ prt_tab(out); -+ prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); -+ prt_newline(out); -+ }; -+}; -+ -+int bch2_sb_counters_to_cpu(struct bch_fs *c) -+{ -+ struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); -+ unsigned int i; -+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs); -+ u64 val = 0; -+ -+ for (i = 0; i < BCH_COUNTER_NR; i++) -+ c->counters_on_mount[i] = 0; -+ -+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { -+ val = le64_to_cpu(ctrs->d[i]); -+ percpu_u64_set(&c->counters[i], val); -+ c->counters_on_mount[i] = val; -+ } -+ return 0; -+}; -+ -+int bch2_sb_counters_from_cpu(struct bch_fs *c) -+{ -+ struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); -+ struct bch_sb_field_counters *ret; -+ unsigned int i; -+ unsigned int nr = bch2_sb_counter_nr_entries(ctrs); -+ -+ if (nr < BCH_COUNTER_NR) { -+ ret = bch2_sb_resize_counters(&c->disk_sb, -+ sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); -+ -+ if (ret) { -+ ctrs = ret; -+ nr = bch2_sb_counter_nr_entries(ctrs); -+ } -+ } -+ -+ -+ for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) -+ ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); -+ return 0; -+} -+ -+void bch2_fs_counters_exit(struct bch_fs *c) -+{ -+ free_percpu(c->counters); -+} -+ -+int bch2_fs_counters_init(struct bch_fs *c) -+{ -+ c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); -+ if (!c->counters) -+ return -BCH_ERR_ENOMEM_fs_counters_init; -+ -+ return bch2_sb_counters_to_cpu(c); -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_counters = { -+ .validate = bch2_sb_counters_validate, -+ .to_text = bch2_sb_counters_to_text, -+}; -diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h -new file mode 100644 -index 000000000..4778aa19b ---- /dev/null -+++ b/fs/bcachefs/counters.h -@@ -0,0 +1,17 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_COUNTERS_H -+#define _BCACHEFS_COUNTERS_H -+ -+#include "bcachefs.h" -+#include "super-io.h" -+ -+ -+int bch2_sb_counters_to_cpu(struct bch_fs *); -+int bch2_sb_counters_from_cpu(struct bch_fs *); -+ -+void bch2_fs_counters_exit(struct bch_fs *); -+int bch2_fs_counters_init(struct bch_fs *); -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_counters; -+ -+#endif // _BCACHEFS_COUNTERS_H -diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h -new file mode 100644 -index 000000000..114f86b45 ---- /dev/null -+++ b/fs/bcachefs/darray.h -@@ -0,0 +1,87 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_DARRAY_H -+#define _BCACHEFS_DARRAY_H -+ -+/* -+ * Dynamic arrays: -+ * -+ * Inspired by CCAN's darray -+ */ -+ -+#include "util.h" -+#include -+ -+#define DARRAY(type) \ -+struct { \ -+ size_t nr, size; \ -+ type 
*data; \ -+} -+ -+typedef DARRAY(void) darray_void; -+ -+static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp) -+{ -+ if (d->nr + more > d->size) { -+ size_t new_size = roundup_pow_of_two(d->nr + more); -+ void *data = krealloc_array(d->data, new_size, t_size, gfp); -+ -+ if (!data) -+ return -ENOMEM; -+ -+ d->data = data; -+ d->size = new_size; -+ } -+ -+ return 0; -+} -+ -+#define darray_make_room_gfp(_d, _more, _gfp) \ -+ __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more), _gfp) -+ -+#define darray_make_room(_d, _more) \ -+ darray_make_room_gfp(_d, _more, GFP_KERNEL) -+ -+#define darray_top(_d) ((_d).data[(_d).nr]) -+ -+#define darray_push_gfp(_d, _item, _gfp) \ -+({ \ -+ int _ret = darray_make_room_gfp((_d), 1, _gfp); \ -+ \ -+ if (!_ret) \ -+ (_d)->data[(_d)->nr++] = (_item); \ -+ _ret; \ -+}) -+ -+#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL) -+ -+#define darray_pop(_d) ((_d)->data[--(_d)->nr]) -+ -+#define darray_first(_d) ((_d).data[0]) -+#define darray_last(_d) ((_d).data[(_d).nr - 1]) -+ -+#define darray_insert_item(_d, pos, _item) \ -+({ \ -+ size_t _pos = (pos); \ -+ int _ret = darray_make_room((_d), 1); \ -+ \ -+ if (!_ret) \ -+ array_insert_item((_d)->data, (_d)->nr, _pos, (_item)); \ -+ _ret; \ -+}) -+ -+#define darray_for_each(_d, _i) \ -+ for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) -+ -+#define darray_init(_d) \ -+do { \ -+ (_d)->data = NULL; \ -+ (_d)->nr = (_d)->size = 0; \ -+} while (0) -+ -+#define darray_exit(_d) \ -+do { \ -+ kfree((_d)->data); \ -+ darray_init(_d); \ -+} while (0) -+ -+#endif /* _BCACHEFS_DARRAY_H */ -diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c -new file mode 100644 -index 000000000..81518f20d ---- /dev/null -+++ b/fs/bcachefs/data_update.c -@@ -0,0 +1,562 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_buf.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "data_update.h" -+#include "ec.h" -+#include "error.h" -+#include "extents.h" -+#include "io.h" -+#include "keylist.h" -+#include "move.h" -+#include "nocow_locking.h" -+#include "subvolume.h" -+#include "trace.h" -+ -+static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (trace_move_extent_finish_enabled()) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf, c, k); -+ trace_move_extent_finish(c, buf.buf); -+ printbuf_exit(&buf); -+ } -+} -+ -+static void trace_move_extent_fail2(struct data_update *m, -+ struct bkey_s_c new, -+ struct bkey_s_c wrote, -+ struct bkey_i *insert, -+ const char *msg) -+{ -+ struct bch_fs *c = m->op.c; -+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_ptr *ptr; -+ struct extent_ptr_decoded p; -+ struct printbuf buf = PRINTBUF; -+ unsigned i, rewrites_found = 0; -+ -+ if (!trace_move_extent_fail_enabled()) -+ return; -+ -+ prt_str(&buf, msg); -+ -+ if (insert) { -+ i = 0; -+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { -+ struct bkey_s new_s; -+ new_s.k = (void *) new.k; -+ new_s.v = (void *) new.v; -+ -+ if (((1U << i) & m->data_opts.rewrite_ptrs) && -+ (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && -+ !ptr->cached) -+ rewrites_found |= 1U << i; -+ i++; -+ } -+ } -+ -+ prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u", -+ (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, -+ (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, -+ 
(m->data_opts.rewrite_ptrs & (1 << 2)) != 0, -+ (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); -+ -+ prt_printf(&buf, "\nrewrites found: %u%u%u%u", -+ (rewrites_found & (1 << 0)) != 0, -+ (rewrites_found & (1 << 1)) != 0, -+ (rewrites_found & (1 << 2)) != 0, -+ (rewrites_found & (1 << 3)) != 0); -+ -+ prt_str(&buf, "\nold: "); -+ bch2_bkey_val_to_text(&buf, c, old); -+ -+ prt_str(&buf, "\nnew: "); -+ bch2_bkey_val_to_text(&buf, c, new); -+ -+ prt_str(&buf, "\nwrote: "); -+ bch2_bkey_val_to_text(&buf, c, wrote); -+ -+ if (insert) { -+ prt_str(&buf, "\ninsert: "); -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); -+ } -+ -+ trace_move_extent_fail(c, buf.buf); -+ printbuf_exit(&buf); -+} -+ -+static int __bch2_data_update_index_update(struct btree_trans *trans, -+ struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct btree_iter iter; -+ struct data_update *m = -+ container_of(op, struct data_update, op); -+ struct keylist *keys = &op->insert_keys; -+ struct bkey_buf _new, _insert; -+ int ret = 0; -+ -+ bch2_bkey_buf_init(&_new); -+ bch2_bkey_buf_init(&_insert); -+ bch2_bkey_buf_realloc(&_insert, c, U8_MAX); -+ -+ bch2_trans_iter_init(trans, &iter, m->btree_id, -+ bkey_start_pos(&bch2_keylist_front(keys)->k), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ while (1) { -+ struct bkey_s_c k; -+ struct bkey_s_c old = bkey_i_to_s_c(m->k.k); -+ struct bkey_i *insert = NULL; -+ struct bkey_i_extent *new; -+ const union bch_extent_entry *entry_c; -+ union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_extent_ptr *ptr; -+ const struct bch_extent_ptr *ptr_c; -+ struct bpos next_pos; -+ bool should_check_enospc; -+ s64 i_sectors_delta = 0, disk_sectors_delta = 0; -+ unsigned rewrites_found = 0, durability, i; -+ -+ bch2_trans_begin(trans); -+ -+ k = bch2_btree_iter_peek_slot(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ new = bkey_i_to_extent(bch2_keylist_front(keys)); -+ -+ if (!bch2_extents_match(k, old)) { -+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), -+ NULL, "no match:"); -+ goto nowork; -+ } -+ -+ bkey_reassemble(_insert.k, k); -+ insert = _insert.k; -+ -+ bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); -+ new = bkey_i_to_extent(_new.k); -+ bch2_cut_front(iter.pos, &new->k_i); -+ -+ bch2_cut_front(iter.pos, insert); -+ bch2_cut_back(new->k.p, insert); -+ bch2_cut_back(insert->k.p, &new->k_i); -+ -+ /* -+ * @old: extent that we read from -+ * @insert: key that we're going to update, initialized from -+ * extent currently in btree - same as @old unless we raced with -+ * other updates -+ * @new: extent with new pointers that we'll be adding to @insert -+ * -+ * Fist, drop rewrite_ptrs from @new: -+ */ -+ i = 0; -+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { -+ if (((1U << i) & m->data_opts.rewrite_ptrs) && -+ (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && -+ !ptr->cached) { -+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); -+ /* -+ * See comment below: -+ bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); -+ */ -+ rewrites_found |= 1U << i; -+ } -+ i++; -+ } -+ -+ if (m->data_opts.rewrite_ptrs && -+ !rewrites_found && -+ bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { -+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); -+ goto nowork; -+ } -+ -+ /* -+ * A replica that we just wrote might conflict with a replica -+ * that we want to keep, due to racing with another move: -+ */ -+restart_drop_conflicting_replicas: -+ 
extent_for_each_ptr(extent_i_to_s(new), ptr) -+ if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) && -+ !ptr_c->cached) { -+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr); -+ goto restart_drop_conflicting_replicas; -+ } -+ -+ if (!bkey_val_u64s(&new->k)) { -+ trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); -+ goto nowork; -+ } -+ -+ /* Now, drop pointers that conflict with what we just wrote: */ -+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) -+ if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev))) -+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); -+ -+ durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) + -+ bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); -+ -+ /* Now, drop excess replicas: */ -+restart_drop_extra_replicas: -+ bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { -+ unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); -+ -+ if (!p.ptr.cached && -+ durability - ptr_durability >= m->op.opts.data_replicas) { -+ durability -= ptr_durability; -+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr); -+ /* -+ * Currently, we're dropping unneeded replicas -+ * instead of marking them as cached, since -+ * cached data in stripe buckets prevents them -+ * from being reused: -+ bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); -+ */ -+ goto restart_drop_extra_replicas; -+ } -+ } -+ -+ /* Finally, add the pointers we just wrote: */ -+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) -+ bch2_extent_ptr_decoded_append(insert, &p); -+ -+ bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); -+ bch2_extent_normalize(c, bkey_i_to_s(insert)); -+ -+ ret = bch2_sum_sector_overwrites(trans, &iter, insert, -+ &should_check_enospc, -+ &i_sectors_delta, -+ &disk_sectors_delta); -+ if (ret) -+ goto err; -+ -+ if (disk_sectors_delta > (s64) op->res.sectors) { -+ ret = bch2_disk_reservation_add(c, &op->res, -+ disk_sectors_delta - op->res.sectors, -+ !should_check_enospc -+ ? 
BCH_DISK_RESERVATION_NOFAIL : 0); -+ if (ret) -+ goto out; -+ } -+ -+ next_pos = insert->k.p; -+ -+ ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, -+ k.k->p, bkey_start_pos(&insert->k)) ?: -+ bch2_insert_snapshot_whiteouts(trans, m->btree_id, -+ k.k->p, insert->k.p); -+ if (ret) -+ goto err; -+ -+ ret = bch2_trans_update(trans, &iter, insert, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: -+ bch2_trans_commit(trans, &op->res, -+ NULL, -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ m->data_opts.btree_insert_flags); -+ if (!ret) { -+ bch2_btree_iter_set_pos(&iter, next_pos); -+ -+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); -+ trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); -+ } -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ ret = 0; -+ if (ret) -+ break; -+next: -+ while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) { -+ bch2_keylist_pop_front(keys); -+ if (bch2_keylist_empty(keys)) -+ goto out; -+ } -+ continue; -+nowork: -+ if (m->ctxt && m->ctxt->stats) { -+ BUG_ON(k.k->p.offset <= iter.pos.offset); -+ atomic64_inc(&m->ctxt->stats->keys_raced); -+ atomic64_add(k.k->p.offset - iter.pos.offset, -+ &m->ctxt->stats->sectors_raced); -+ } -+ -+ this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]); -+ -+ bch2_btree_iter_advance(&iter); -+ goto next; -+ } -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ bch2_bkey_buf_exit(&_insert, c); -+ bch2_bkey_buf_exit(&_new, c); -+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); -+ return ret; -+} -+ -+int bch2_data_update_index_update(struct bch_write_op *op) -+{ -+ return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op)); -+} -+ -+void bch2_data_update_read_done(struct data_update *m, -+ struct bch_extent_crc_unpacked crc) -+{ -+ /* write bio must own pages: */ -+ BUG_ON(!m->op.wbio.bio.bi_vcnt); -+ -+ m->op.crc = crc; -+ m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; -+ -+ closure_call(&m->op.cl, bch2_write, NULL, NULL); -+} -+ -+void bch2_data_update_exit(struct data_update *update) -+{ -+ struct bch_fs *c = update->op.c; -+ struct bkey_ptrs_c ptrs = -+ bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ if (c->opts.nocow_enabled) -+ bch2_bucket_nocow_unlock(&c->nocow_locks, -+ PTR_BUCKET_POS(c, ptr), 0); -+ percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref); -+ } -+ -+ bch2_bkey_buf_exit(&update->k, c); -+ bch2_disk_reservation_put(c, &update->op.res); -+ bch2_bio_free_pages_pool(c, &update->op.wbio.bio); -+} -+ -+void bch2_update_unwritten_extent(struct btree_trans *trans, -+ struct data_update *update) -+{ -+ struct bch_fs *c = update->op.c; -+ struct bio *bio = &update->op.wbio.bio; -+ struct bkey_i_extent *e; -+ struct write_point *wp; -+ struct bch_extent_ptr *ptr; -+ struct closure cl; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ closure_init_stack(&cl); -+ bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); -+ -+ while (bio_sectors(bio)) { -+ unsigned sectors = bio_sectors(bio); -+ -+ bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, -+ BTREE_ITER_SLOTS); -+ ret = lockrestart_do(trans, ({ -+ k = bch2_btree_iter_peek_slot(&iter); -+ bkey_err(k); -+ })); -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k))) -+ break; -+ -+ e = bkey_extent_init(update->op.insert_keys.top); -+ e->k.p = update->op.pos; -+ -+ ret = 
bch2_alloc_sectors_start_trans(trans, -+ update->op.target, -+ false, -+ update->op.write_point, -+ &update->op.devs_have, -+ update->op.nr_replicas, -+ update->op.nr_replicas, -+ update->op.watermark, -+ 0, &cl, &wp); -+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ continue; -+ } -+ -+ if (ret) -+ return; -+ -+ sectors = min(sectors, wp->sectors_free); -+ -+ bch2_key_resize(&e->k, sectors); -+ -+ bch2_open_bucket_get(c, wp, &update->op.open_buckets); -+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); -+ bch2_alloc_sectors_done(c, wp); -+ -+ bio_advance(bio, sectors << 9); -+ update->op.pos.offset += sectors; -+ -+ extent_for_each_ptr(extent_i_to_s(e), ptr) -+ ptr->unwritten = true; -+ bch2_keylist_push(&update->op.insert_keys); -+ -+ ret = __bch2_data_update_index_update(trans, &update->op); -+ -+ bch2_open_buckets_put(c, &update->op.open_buckets); -+ -+ if (ret) -+ break; -+ } -+ -+ if (closure_nr_remaining(&cl) != 1) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ } -+} -+ -+int bch2_data_update_init(struct btree_trans *trans, -+ struct moving_context *ctxt, -+ struct data_update *m, -+ struct write_point_specifier wp, -+ struct bch_io_opts io_opts, -+ struct data_update_opts data_opts, -+ enum btree_id btree_id, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ const struct bch_extent_ptr *ptr; -+ unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; -+ unsigned ptrs_locked = 0; -+ int ret; -+ -+ bch2_bkey_buf_init(&m->k); -+ bch2_bkey_buf_reassemble(&m->k, c, k); -+ m->btree_id = btree_id; -+ m->data_opts = data_opts; -+ -+ bch2_write_op_init(&m->op, c, io_opts); -+ m->op.pos = bkey_start_pos(k.k); -+ m->op.version = k.k->version; -+ m->op.target = data_opts.target; -+ m->op.write_point = wp; -+ m->op.nr_replicas = 0; -+ m->op.flags |= BCH_WRITE_PAGES_STABLE| -+ BCH_WRITE_PAGES_OWNED| -+ BCH_WRITE_DATA_ENCODED| -+ BCH_WRITE_MOVE| -+ m->data_opts.write_flags; -+ m->op.compression_opt = io_opts.background_compression ?: io_opts.compression; -+ m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); -+ -+ i = 0; -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ bool locked; -+ -+ if (((1U << i) & m->data_opts.rewrite_ptrs)) { -+ BUG_ON(p.ptr.cached); -+ -+ if (crc_is_compressed(p.crc)) -+ reserve_sectors += k.k->size; -+ -+ m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); -+ } else if (!p.ptr.cached) { -+ bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); -+ } -+ -+ /* -+ * op->csum_type is normally initialized from the fs/file's -+ * current options - but if an extent is encrypted, we require -+ * that it stays encrypted: -+ */ -+ if (bch2_csum_type_is_encryption(p.crc.csum_type)) { -+ m->op.nonce = p.crc.nonce + p.crc.offset; -+ m->op.csum_type = p.crc.csum_type; -+ } -+ -+ if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) -+ m->op.incompressible = true; -+ -+ if (c->opts.nocow_enabled) { -+ if (ctxt) { -+ move_ctxt_wait_event(ctxt, trans, -+ (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, -+ PTR_BUCKET_POS(c, &p.ptr), 0)) || -+ !atomic_read(&ctxt->read_sectors)); -+ -+ if (!locked) -+ bch2_bucket_nocow_lock(&c->nocow_locks, -+ PTR_BUCKET_POS(c, &p.ptr), 0); -+ } else { -+ if 
(!bch2_bucket_nocow_trylock(&c->nocow_locks, -+ PTR_BUCKET_POS(c, &p.ptr), 0)) { -+ ret = -BCH_ERR_nocow_lock_blocked; -+ goto err; -+ } -+ } -+ ptrs_locked |= (1U << i); -+ } -+ -+ i++; -+ } -+ -+ if (reserve_sectors) { -+ ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, -+ m->data_opts.extra_replicas -+ ? 0 -+ : BCH_DISK_RESERVATION_NOFAIL); -+ if (ret) -+ goto err; -+ } -+ -+ m->op.nr_replicas += m->data_opts.extra_replicas; -+ m->op.nr_replicas_required = m->op.nr_replicas; -+ -+ BUG_ON(!m->op.nr_replicas); -+ -+ /* Special handling required: */ -+ if (bkey_extent_is_unwritten(k)) -+ return -BCH_ERR_unwritten_extent_update; -+ return 0; -+err: -+ i = 0; -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if ((1U << i) & ptrs_locked) -+ bch2_bucket_nocow_unlock(&c->nocow_locks, -+ PTR_BUCKET_POS(c, &p.ptr), 0); -+ percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref); -+ i++; -+ } -+ -+ bch2_bkey_buf_exit(&m->k, c); -+ bch2_bio_free_pages_pool(c, &m->op.wbio.bio); -+ return ret; -+} -+ -+void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ unsigned i = 0; -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { -+ opts->kill_ptrs |= 1U << i; -+ opts->rewrite_ptrs ^= 1U << i; -+ } -+ -+ i++; -+ } -+} -diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h -new file mode 100644 -index 000000000..49e9055cb ---- /dev/null -+++ b/fs/bcachefs/data_update.h -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef _BCACHEFS_DATA_UPDATE_H -+#define _BCACHEFS_DATA_UPDATE_H -+ -+#include "bkey_buf.h" -+#include "io_types.h" -+ -+struct moving_context; -+ -+struct data_update_opts { -+ unsigned rewrite_ptrs; -+ unsigned kill_ptrs; -+ u16 target; -+ u8 extra_replicas; -+ unsigned btree_insert_flags; -+ unsigned write_flags; -+}; -+ -+struct data_update { -+ /* extent being updated: */ -+ enum btree_id btree_id; -+ struct bkey_buf k; -+ struct data_update_opts data_opts; -+ struct moving_context *ctxt; -+ struct bch_write_op op; -+}; -+ -+int bch2_data_update_index_update(struct bch_write_op *); -+ -+void bch2_data_update_read_done(struct data_update *, -+ struct bch_extent_crc_unpacked); -+ -+void bch2_data_update_exit(struct data_update *); -+void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *); -+int bch2_data_update_init(struct btree_trans *, struct moving_context *, -+ struct data_update *, -+ struct write_point_specifier, -+ struct bch_io_opts, struct data_update_opts, -+ enum btree_id, struct bkey_s_c); -+void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); -+ -+#endif /* _BCACHEFS_DATA_UPDATE_H */ -diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c -new file mode 100644 -index 000000000..ae47e1854 ---- /dev/null -+++ b/fs/bcachefs/debug.c -@@ -0,0 +1,957 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Assorted bcachefs debug code -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "debug.h" -+#include "error.h" -+#include "extents.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "super.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+static struct dentry *bch_debug; -+ -+static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, -+ struct extent_ptr_decoded pick) -+{ -+ struct btree *v = c->verify_data; -+ struct btree_node *n_ondisk = c->verify_ondisk; -+ struct btree_node *n_sorted = c->verify_data->data; -+ struct bset *sorted, *inmemory = &b->data->keys; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ struct bio *bio; -+ bool failed = false, saw_error = false; -+ -+ if (!bch2_dev_get_ioref(ca, READ)) -+ return false; -+ -+ bio = bio_alloc_bioset(ca->disk_sb.bdev, -+ buf_pages(n_sorted, btree_bytes(c)), -+ REQ_OP_READ|REQ_META, -+ GFP_NOFS, -+ &c->btree_bio); -+ bio->bi_iter.bi_sector = pick.ptr.offset; -+ bch2_bio_map(bio, n_sorted, btree_bytes(c)); -+ -+ submit_bio_wait(bio); -+ -+ bio_put(bio); -+ percpu_ref_put(&ca->io_ref); -+ -+ memcpy(n_ondisk, n_sorted, btree_bytes(c)); -+ -+ v->written = 0; -+ if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) -+ return false; -+ -+ n_sorted = c->verify_data->data; -+ sorted = &n_sorted->keys; -+ -+ if (inmemory->u64s != sorted->u64s || -+ memcmp(inmemory->start, -+ sorted->start, -+ vstruct_end(inmemory) - (void *) inmemory->start)) { -+ unsigned offset = 0, sectors; -+ struct bset *i; -+ unsigned j; -+ -+ console_lock(); -+ -+ printk(KERN_ERR "*** in memory:\n"); -+ bch2_dump_bset(c, b, inmemory, 0); -+ -+ printk(KERN_ERR "*** read back in:\n"); -+ bch2_dump_bset(c, v, sorted, 0); -+ -+ while (offset < v->written) { -+ if (!offset) { -+ i = &n_ondisk->keys; -+ sectors = vstruct_blocks(n_ondisk, c->block_bits) << -+ c->block_bits; -+ } else { -+ struct btree_node_entry *bne = -+ (void *) n_ondisk + (offset << 9); -+ i = &bne->keys; -+ -+ sectors = vstruct_blocks(bne, c->block_bits) << -+ c->block_bits; -+ } -+ -+ printk(KERN_ERR "*** on disk block %u:\n", offset); -+ bch2_dump_bset(c, b, i, offset); -+ -+ offset += sectors; -+ } -+ -+ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) -+ if (inmemory->_data[j] != sorted->_data[j]) -+ break; -+ -+ console_unlock(); -+ bch_err(c, "verify failed at key %u", j); -+ -+ failed = true; -+ } -+ -+ if (v->written != b->written) { -+ bch_err(c, "written wrong: expected %u, got %u", -+ b->written, v->written); -+ failed = true; -+ } -+ -+ return failed; -+} -+ -+void __bch2_btree_verify(struct bch_fs *c, struct btree *b) -+{ -+ struct bkey_ptrs_c ptrs; -+ struct extent_ptr_decoded p; -+ const union bch_extent_entry *entry; -+ struct btree *v; -+ struct bset *inmemory = &b->data->keys; -+ struct bkey_packed *k; -+ bool failed = false; -+ -+ if (c->opts.nochanges) -+ return; -+ -+ bch2_btree_node_io_lock(b); -+ mutex_lock(&c->verify_lock); -+ -+ if (!c->verify_ondisk) { -+ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); -+ if (!c->verify_ondisk) -+ goto out; -+ } -+ -+ if (!c->verify_data) { -+ c->verify_data = __bch2_btree_node_mem_alloc(c); -+ if (!c->verify_data) -+ goto out; -+ -+ list_del_init(&c->verify_data->list); -+ } -+ -+ BUG_ON(b->nsets != 1); -+ -+ for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) -+ if (k->type == 
KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); -+ v->mem_ptr = 0; -+ } -+ -+ v = c->verify_data; -+ bkey_copy(&v->key, &b->key); -+ v->c.level = b->c.level; -+ v->c.btree_id = b->c.btree_id; -+ bch2_btree_keys_init(v); -+ -+ ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); -+ bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) -+ failed |= bch2_btree_verify_replica(c, b, p); -+ -+ if (failed) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); -+ bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); -+ printbuf_exit(&buf); -+ } -+out: -+ mutex_unlock(&c->verify_lock); -+ bch2_btree_node_io_unlock(b); -+} -+ -+void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, -+ const struct btree *b) -+{ -+ struct btree_node *n_ondisk = NULL; -+ struct extent_ptr_decoded pick; -+ struct bch_dev *ca; -+ struct bio *bio = NULL; -+ unsigned offset = 0; -+ int ret; -+ -+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { -+ prt_printf(out, "error getting device to read from: invalid device\n"); -+ return; -+ } -+ -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ if (!bch2_dev_get_ioref(ca, READ)) { -+ prt_printf(out, "error getting device to read from: not online\n"); -+ return; -+ } -+ -+ n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); -+ if (!n_ondisk) { -+ prt_printf(out, "memory allocation failure\n"); -+ goto out; -+ } -+ -+ bio = bio_alloc_bioset(ca->disk_sb.bdev, -+ buf_pages(n_ondisk, btree_bytes(c)), -+ REQ_OP_READ|REQ_META, -+ GFP_NOFS, -+ &c->btree_bio); -+ bio->bi_iter.bi_sector = pick.ptr.offset; -+ bch2_bio_map(bio, n_ondisk, btree_bytes(c)); -+ -+ ret = submit_bio_wait(bio); -+ if (ret) { -+ prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret)); -+ goto out; -+ } -+ -+ while (offset < btree_sectors(c)) { -+ struct bset *i; -+ struct nonce nonce; -+ struct bch_csum csum; -+ struct bkey_packed *k; -+ unsigned sectors; -+ -+ if (!offset) { -+ i = &n_ondisk->keys; -+ -+ if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { -+ prt_printf(out, "unknown checksum type at offset %u: %llu\n", -+ offset, BSET_CSUM_TYPE(i)); -+ goto out; -+ } -+ -+ nonce = btree_nonce(i, offset << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk); -+ -+ if (bch2_crc_cmp(csum, n_ondisk->csum)) { -+ prt_printf(out, "invalid checksum\n"); -+ goto out; -+ } -+ -+ bset_encrypt(c, i, offset << 9); -+ -+ sectors = vstruct_sectors(n_ondisk, c->block_bits); -+ } else { -+ struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9); -+ -+ i = &bne->keys; -+ -+ if (i->seq != n_ondisk->keys.seq) -+ break; -+ -+ if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { -+ prt_printf(out, "unknown checksum type at offset %u: %llu\n", -+ offset, BSET_CSUM_TYPE(i)); -+ goto out; -+ } -+ -+ nonce = btree_nonce(i, offset << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ -+ if (bch2_crc_cmp(csum, bne->csum)) { -+ prt_printf(out, "invalid checksum"); -+ goto out; -+ } -+ -+ bset_encrypt(c, i, offset << 9); -+ -+ sectors = vstruct_sectors(bne, c->block_bits); -+ } -+ -+ prt_printf(out, " offset %u version %u, journal seq %llu\n", -+ offset, -+ le16_to_cpu(i->version), -+ le64_to_cpu(i->journal_seq)); -+ offset += sectors; -+ -+ printbuf_indent_add(out, 4); -+ -+ for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) { -+ struct bkey u; -+ -+ bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u)); -+ prt_newline(out); -+ 
} -+ -+ printbuf_indent_sub(out, 4); -+ } -+out: -+ if (bio) -+ bio_put(bio); -+ kvpfree(n_ondisk, btree_bytes(c)); -+ percpu_ref_put(&ca->io_ref); -+} -+ -+#ifdef CONFIG_DEBUG_FS -+ -+/* XXX: bch_fs refcounting */ -+ -+struct dump_iter { -+ struct bch_fs *c; -+ enum btree_id id; -+ struct bpos from; -+ struct bpos prev_node; -+ u64 iter; -+ -+ struct printbuf buf; -+ -+ char __user *ubuf; /* destination user buffer */ -+ size_t size; /* size of requested read */ -+ ssize_t ret; /* bytes read so far */ -+}; -+ -+static ssize_t flush_buf(struct dump_iter *i) -+{ -+ if (i->buf.pos) { -+ size_t bytes = min_t(size_t, i->buf.pos, i->size); -+ int err = copy_to_user(i->ubuf, i->buf.buf, bytes); -+ -+ if (err) -+ return err; -+ -+ i->ret += bytes; -+ i->ubuf += bytes; -+ i->size -= bytes; -+ i->buf.pos -= bytes; -+ memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); -+ } -+ -+ return i->size ? 0 : i->ret; -+} -+ -+static int bch2_dump_open(struct inode *inode, struct file *file) -+{ -+ struct btree_debug *bd = inode->i_private; -+ struct dump_iter *i; -+ -+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); -+ if (!i) -+ return -ENOMEM; -+ -+ file->private_data = i; -+ i->from = POS_MIN; -+ i->iter = 0; -+ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); -+ i->id = bd->id; -+ i->buf = PRINTBUF; -+ -+ return 0; -+} -+ -+static int bch2_dump_release(struct inode *inode, struct file *file) -+{ -+ struct dump_iter *i = file->private_data; -+ -+ printbuf_exit(&i->buf); -+ kfree(i); -+ return 0; -+} -+ -+static ssize_t bch2_read_btree(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ ssize_t ret; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ ret = flush_buf(i); -+ if (ret) -+ return ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+ ret = for_each_btree_key2(&trans, iter, i->id, i->from, -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS, k, ({ -+ bch2_bkey_val_to_text(&i->buf, i->c, k); -+ prt_newline(&i->buf); -+ drop_locks_do(&trans, flush_buf(i)); -+ })); -+ i->from = iter.pos; -+ -+ bch2_trans_exit(&trans); -+ -+ if (!ret) -+ ret = flush_buf(i); -+ -+ return ret ?: i->ret; -+} -+ -+static const struct file_operations btree_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_btree, -+}; -+ -+static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct btree *b; -+ ssize_t ret; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ ret = flush_buf(i); -+ if (ret) -+ return ret; -+ -+ if (bpos_eq(SPOS_MAX, i->from)) -+ return i->ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) { -+ bch2_btree_node_to_text(&i->buf, i->c, b); -+ i->from = !bpos_eq(SPOS_MAX, b->key.k.p) -+ ? 
bpos_successor(b->key.k.p) -+ : b->key.k.p; -+ -+ ret = drop_locks_do(&trans, flush_buf(i)); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ -+ if (!ret) -+ ret = flush_buf(i); -+ -+ return ret ?: i->ret; -+} -+ -+static const struct file_operations btree_format_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_btree_formats, -+}; -+ -+static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ ssize_t ret; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ ret = flush_buf(i); -+ if (ret) -+ return ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+ -+ ret = for_each_btree_key2(&trans, iter, i->id, i->from, -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS, k, ({ -+ struct btree_path_level *l = &iter.path->l[0]; -+ struct bkey_packed *_k = -+ bch2_btree_node_iter_peek(&l->iter, l->b); -+ -+ if (bpos_gt(l->b->key.k.p, i->prev_node)) { -+ bch2_btree_node_to_text(&i->buf, i->c, l->b); -+ i->prev_node = l->b->key.k.p; -+ } -+ -+ bch2_bfloat_to_text(&i->buf, l->b, _k); -+ drop_locks_do(&trans, flush_buf(i)); -+ })); -+ i->from = iter.pos; -+ -+ bch2_trans_exit(&trans); -+ -+ if (!ret) -+ ret = flush_buf(i); -+ -+ return ret ?: i->ret; -+} -+ -+static const struct file_operations bfloat_failed_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_bfloat_failed, -+}; -+ -+static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, -+ struct btree *b) -+{ -+ if (!out->nr_tabstops) -+ printbuf_tabstop_push(out, 32); -+ -+ prt_printf(out, "%px btree=%s l=%u ", -+ b, -+ bch2_btree_ids[b->c.btree_id], -+ b->c.level); -+ prt_newline(out); -+ -+ printbuf_indent_add(out, 2); -+ -+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -+ prt_newline(out); -+ -+ prt_printf(out, "flags: "); -+ prt_tab(out); -+ prt_bitflags(out, bch2_btree_node_flags, b->flags); -+ prt_newline(out); -+ -+ prt_printf(out, "pcpu read locks: "); -+ prt_tab(out); -+ prt_printf(out, "%u", b->c.lock.readers != NULL); -+ prt_newline(out); -+ -+ prt_printf(out, "written:"); -+ prt_tab(out); -+ prt_printf(out, "%u", b->written); -+ prt_newline(out); -+ -+ prt_printf(out, "writes blocked:"); -+ prt_tab(out); -+ prt_printf(out, "%u", !list_empty_careful(&b->write_blocked)); -+ prt_newline(out); -+ -+ prt_printf(out, "will make reachable:"); -+ prt_tab(out); -+ prt_printf(out, "%lx", b->will_make_reachable); -+ prt_newline(out); -+ -+ prt_printf(out, "journal pin %px:", &b->writes[0].journal); -+ prt_tab(out); -+ prt_printf(out, "%llu", b->writes[0].journal.seq); -+ prt_newline(out); -+ -+ prt_printf(out, "journal pin %px:", &b->writes[1].journal); -+ prt_tab(out); -+ prt_printf(out, "%llu", b->writes[1].journal.seq); -+ prt_newline(out); -+ -+ printbuf_indent_sub(out, 2); -+} -+ -+static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct bch_fs *c = i->c; -+ bool done = false; -+ ssize_t ret = 0; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ do { -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct btree *b; -+ -+ ret = 
flush_buf(i); -+ if (ret) -+ return ret; -+ -+ rcu_read_lock(); -+ i->buf.atomic++; -+ tbl = rht_dereference_rcu(c->btree_cache.table.tbl, -+ &c->btree_cache.table); -+ if (i->iter < tbl->size) { -+ rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) -+ bch2_cached_btree_node_to_text(&i->buf, c, b); -+ i->iter++; -+ } else { -+ done = true; -+ } -+ --i->buf.atomic; -+ rcu_read_unlock(); -+ } while (!done); -+ -+ if (i->buf.allocation_failure) -+ ret = -ENOMEM; -+ -+ if (!ret) -+ ret = flush_buf(i); -+ -+ return ret ?: i->ret; -+} -+ -+static const struct file_operations cached_btree_nodes_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_cached_btree_nodes_read, -+}; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS -+static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct bch_fs *c = i->c; -+ struct btree_trans *trans; -+ ssize_t ret = 0; -+ u32 seq; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+restart: -+ seqmutex_lock(&c->btree_trans_lock); -+ list_for_each_entry(trans, &c->btree_trans_list, list) { -+ if (trans->locking_wait.task->pid <= i->iter) -+ continue; -+ -+ closure_get(&trans->ref); -+ seq = seqmutex_seq(&c->btree_trans_lock); -+ seqmutex_unlock(&c->btree_trans_lock); -+ -+ ret = flush_buf(i); -+ if (ret) { -+ closure_put(&trans->ref); -+ goto unlocked; -+ } -+ -+ bch2_btree_trans_to_text(&i->buf, trans); -+ -+ prt_printf(&i->buf, "backtrace:"); -+ prt_newline(&i->buf); -+ printbuf_indent_add(&i->buf, 2); -+ bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task); -+ printbuf_indent_sub(&i->buf, 2); -+ prt_newline(&i->buf); -+ -+ i->iter = trans->locking_wait.task->pid; -+ -+ closure_put(&trans->ref); -+ -+ if (!seqmutex_relock(&c->btree_trans_lock, seq)) -+ goto restart; -+ } -+ seqmutex_unlock(&c->btree_trans_lock); -+unlocked: -+ if (i->buf.allocation_failure) -+ ret = -ENOMEM; -+ -+ if (!ret) -+ ret = flush_buf(i); -+ -+ return ret ?: i->ret; -+} -+ -+static const struct file_operations btree_transactions_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_btree_transactions_read, -+}; -+#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ -+ -+static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct bch_fs *c = i->c; -+ bool done = false; -+ int err; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ do { -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size) -+ break; -+ -+ done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); -+ i->iter++; -+ } while (!done); -+ -+ if (i->buf.allocation_failure) -+ return -ENOMEM; -+ -+ return i->ret; -+} -+ -+static const struct file_operations journal_pins_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_journal_pins_read, -+}; -+ -+static int lock_held_stats_open(struct inode *inode, struct file *file) -+{ -+ struct bch_fs *c = inode->i_private; -+ struct dump_iter *i; -+ -+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); -+ -+ if (!i) -+ return -ENOMEM; -+ -+ i->iter = 0; -+ i->c = c; -+ i->buf = PRINTBUF; -+ file->private_data = i; -+ -+ return 0; -+} -+ -+static int lock_held_stats_release(struct inode *inode, struct file *file) -+{ -+ struct dump_iter *i = file->private_data; -+ -+ 
printbuf_exit(&i->buf); -+ kfree(i); -+ -+ return 0; -+} -+ -+static ssize_t lock_held_stats_read(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct bch_fs *c = i->c; -+ int err; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ while (1) { -+ struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; -+ -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size) -+ break; -+ -+ if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) || -+ !bch2_btree_transaction_fns[i->iter]) -+ break; -+ -+ prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]); -+ prt_newline(&i->buf); -+ printbuf_indent_add(&i->buf, 2); -+ -+ mutex_lock(&s->lock); -+ -+ prt_printf(&i->buf, "Max mem used: %u", s->max_mem); -+ prt_newline(&i->buf); -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { -+ prt_printf(&i->buf, "Lock hold times:"); -+ prt_newline(&i->buf); -+ -+ printbuf_indent_add(&i->buf, 2); -+ bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); -+ printbuf_indent_sub(&i->buf, 2); -+ } -+ -+ if (s->max_paths_text) { -+ prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths); -+ prt_newline(&i->buf); -+ -+ printbuf_indent_add(&i->buf, 2); -+ prt_str_indented(&i->buf, s->max_paths_text); -+ printbuf_indent_sub(&i->buf, 2); -+ } -+ -+ mutex_unlock(&s->lock); -+ -+ printbuf_indent_sub(&i->buf, 2); -+ prt_newline(&i->buf); -+ i->iter++; -+ } -+ -+ if (i->buf.allocation_failure) -+ return -ENOMEM; -+ -+ return i->ret; -+} -+ -+static const struct file_operations lock_held_stats_op = { -+ .owner = THIS_MODULE, -+ .open = lock_held_stats_open, -+ .release = lock_held_stats_release, -+ .read = lock_held_stats_read, -+}; -+ -+static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct bch_fs *c = i->c; -+ struct btree_trans *trans; -+ ssize_t ret = 0; -+ u32 seq; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ if (i->iter) -+ goto out; -+restart: -+ seqmutex_lock(&c->btree_trans_lock); -+ list_for_each_entry(trans, &c->btree_trans_list, list) { -+ if (trans->locking_wait.task->pid <= i->iter) -+ continue; -+ -+ closure_get(&trans->ref); -+ seq = seqmutex_seq(&c->btree_trans_lock); -+ seqmutex_unlock(&c->btree_trans_lock); -+ -+ ret = flush_buf(i); -+ if (ret) { -+ closure_put(&trans->ref); -+ goto out; -+ } -+ -+ bch2_check_for_deadlock(trans, &i->buf); -+ -+ i->iter = trans->locking_wait.task->pid; -+ -+ closure_put(&trans->ref); -+ -+ if (!seqmutex_relock(&c->btree_trans_lock, seq)) -+ goto restart; -+ } -+ seqmutex_unlock(&c->btree_trans_lock); -+out: -+ if (i->buf.allocation_failure) -+ ret = -ENOMEM; -+ -+ if (!ret) -+ ret = flush_buf(i); -+ -+ return ret ?: i->ret; -+} -+ -+static const struct file_operations btree_deadlock_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_btree_deadlock_read, -+}; -+ -+void bch2_fs_debug_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->fs_debug_dir)) -+ debugfs_remove_recursive(c->fs_debug_dir); -+} -+ -+void bch2_fs_debug_init(struct bch_fs *c) -+{ -+ struct btree_debug *bd; -+ char name[100]; -+ -+ if (IS_ERR_OR_NULL(bch_debug)) -+ return; -+ -+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); -+ c->fs_debug_dir = debugfs_create_dir(name, bch_debug); -+ if (IS_ERR_OR_NULL(c->fs_debug_dir)) -+ return; -+ -+ 
debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, -+ c->btree_debug, &cached_btree_nodes_ops); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS -+ debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, -+ c->btree_debug, &btree_transactions_ops); -+#endif -+ -+ debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, -+ c->btree_debug, &journal_pins_ops); -+ -+ debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, -+ c, &lock_held_stats_op); -+ -+ debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, -+ c->btree_debug, &btree_deadlock_ops); -+ -+ c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); -+ if (IS_ERR_OR_NULL(c->btree_debug_dir)) -+ return; -+ -+ for (bd = c->btree_debug; -+ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); -+ bd++) { -+ bd->id = bd - c->btree_debug; -+ debugfs_create_file(bch2_btree_ids[bd->id], -+ 0400, c->btree_debug_dir, bd, -+ &btree_debug_ops); -+ -+ snprintf(name, sizeof(name), "%s-formats", -+ bch2_btree_ids[bd->id]); -+ -+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, -+ &btree_format_debug_ops); -+ -+ snprintf(name, sizeof(name), "%s-bfloat-failed", -+ bch2_btree_ids[bd->id]); -+ -+ debugfs_create_file(name, 0400, c->btree_debug_dir, bd, -+ &bfloat_failed_debug_ops); -+ } -+} -+ -+#endif -+ -+void bch2_debug_exit(void) -+{ -+ if (!IS_ERR_OR_NULL(bch_debug)) -+ debugfs_remove_recursive(bch_debug); -+} -+ -+int __init bch2_debug_init(void) -+{ -+ int ret = 0; -+ -+ bch_debug = debugfs_create_dir("bcachefs", NULL); -+ return ret; -+} -diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h -new file mode 100644 -index 000000000..2c37143b5 ---- /dev/null -+++ b/fs/bcachefs/debug.h -@@ -0,0 +1,32 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_DEBUG_H -+#define _BCACHEFS_DEBUG_H -+ -+#include "bcachefs.h" -+ -+struct bio; -+struct btree; -+struct bch_fs; -+ -+void __bch2_btree_verify(struct bch_fs *, struct btree *); -+void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, -+ const struct btree *); -+ -+static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) -+{ -+ if (bch2_verify_btree_ondisk) -+ __bch2_btree_verify(c, b); -+} -+ -+#ifdef CONFIG_DEBUG_FS -+void bch2_fs_debug_exit(struct bch_fs *); -+void bch2_fs_debug_init(struct bch_fs *); -+#else -+static inline void bch2_fs_debug_exit(struct bch_fs *c) {} -+static inline void bch2_fs_debug_init(struct bch_fs *c) {} -+#endif -+ -+void bch2_debug_exit(void); -+int bch2_debug_init(void); -+ -+#endif /* _BCACHEFS_DEBUG_H */ -diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c -new file mode 100644 -index 000000000..a7559ab03 ---- /dev/null -+++ b/fs/bcachefs/dirent.c -@@ -0,0 +1,590 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_buf.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "extents.h" -+#include "dirent.h" -+#include "fs.h" -+#include "keylist.h" -+#include "str_hash.h" -+#include "subvolume.h" -+ -+#include -+ -+static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) -+{ -+ unsigned bkey_u64s = bkey_val_u64s(d.k); -+ unsigned bkey_bytes = bkey_u64s * sizeof(u64); -+ u64 last_u64 = ((u64*)d.v)[bkey_u64s - 1]; -+#if CPU_BIG_ENDIAN -+ unsigned trailing_nuls = last_u64 ? __builtin_ctzll(last_u64) / 8 : 64 / 8; -+#else -+ unsigned trailing_nuls = last_u64 ? 
__builtin_clzll(last_u64) / 8 : 64 / 8; -+#endif -+ -+ return bkey_bytes - -+ offsetof(struct bch_dirent, d_name) - -+ trailing_nuls; -+} -+ -+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) -+{ -+ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); -+} -+ -+static u64 bch2_dirent_hash(const struct bch_hash_info *info, -+ const struct qstr *name) -+{ -+ struct bch_str_hash_ctx ctx; -+ -+ bch2_str_hash_init(&ctx, info); -+ bch2_str_hash_update(&ctx, info, name->name, name->len); -+ -+ /* [0,2) reserved for dots */ -+ return max_t(u64, bch2_str_hash_end(&ctx, info), 2); -+} -+ -+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) -+{ -+ return bch2_dirent_hash(info, key); -+} -+ -+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ struct qstr name = bch2_dirent_get_name(d); -+ -+ return bch2_dirent_hash(info, &name); -+} -+ -+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) -+{ -+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); -+ const struct qstr l_name = bch2_dirent_get_name(l); -+ const struct qstr *r_name = _r; -+ -+ return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len); -+} -+ -+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); -+ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); -+ const struct qstr l_name = bch2_dirent_get_name(l); -+ const struct qstr r_name = bch2_dirent_get_name(r); -+ -+ return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len); -+} -+ -+static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ -+ if (d.v->d_type == DT_SUBVOL) -+ return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; -+ return true; -+} -+ -+const struct bch_hash_desc bch2_dirent_hash_desc = { -+ .btree_id = BTREE_ID_dirents, -+ .key_type = KEY_TYPE_dirent, -+ .hash_key = dirent_hash_key, -+ .hash_bkey = dirent_hash_bkey, -+ .cmp_key = dirent_cmp_key, -+ .cmp_bkey = dirent_cmp_bkey, -+ .is_visible = dirent_is_visible, -+}; -+ -+int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ struct qstr d_name = bch2_dirent_get_name(d); -+ -+ if (!d_name.len) { -+ prt_printf(err, "empty name"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len)) { -+ prt_printf(err, "value too big (%zu > %u)", -+ bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ /* -+ * Check new keys don't exceed the max length -+ * (older keys may be larger.) 
-+ */ -+ if ((flags & BKEY_INVALID_COMMIT) && d_name.len > BCH_NAME_MAX) { -+ prt_printf(err, "dirent name too big (%u > %u)", -+ d_name.len, BCH_NAME_MAX); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (d_name.len != strnlen(d_name.name, d_name.len)) { -+ prt_printf(err, "dirent has stray data after name's NUL"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (d_name.len == 1 && !memcmp(d_name.name, ".", 1)) { -+ prt_printf(err, "invalid name"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (d_name.len == 2 && !memcmp(d_name.name, "..", 2)) { -+ prt_printf(err, "invalid name"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (memchr(d_name.name, '/', d_name.len)) { -+ prt_printf(err, "invalid name"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (d.v->d_type != DT_SUBVOL && -+ le64_to_cpu(d.v->d_inum) == d.k->p.inode) { -+ prt_printf(err, "dirent points to own directory"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ struct qstr d_name = bch2_dirent_get_name(d); -+ -+ prt_printf(out, "%.*s -> %llu type %s", -+ d_name.len, -+ d_name.name, -+ d.v->d_type != DT_SUBVOL -+ ? le64_to_cpu(d.v->d_inum) -+ : le32_to_cpu(d.v->d_child_subvol), -+ bch2_d_type_str(d.v->d_type)); -+} -+ -+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, -+ subvol_inum dir, u8 type, -+ const struct qstr *name, u64 dst) -+{ -+ struct bkey_i_dirent *dirent; -+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); -+ -+ if (name->len > BCH_NAME_MAX) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ BUG_ON(u64s > U8_MAX); -+ -+ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(dirent)) -+ return dirent; -+ -+ bkey_dirent_init(&dirent->k_i); -+ dirent->k.u64s = u64s; -+ -+ if (type != DT_SUBVOL) { -+ dirent->v.d_inum = cpu_to_le64(dst); -+ } else { -+ dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); -+ dirent->v.d_child_subvol = cpu_to_le32(dst); -+ } -+ -+ dirent->v.d_type = type; -+ -+ memcpy(dirent->v.d_name, name->name, name->len); -+ memset(dirent->v.d_name + name->len, 0, -+ bkey_val_bytes(&dirent->k) - -+ offsetof(struct bch_dirent, d_name) - -+ name->len); -+ -+ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); -+ -+ return dirent; -+} -+ -+int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, -+ const struct bch_hash_info *hash_info, -+ u8 type, const struct qstr *name, u64 dst_inum, -+ u64 *dir_offset, int flags) -+{ -+ struct bkey_i_dirent *dirent; -+ int ret; -+ -+ dirent = dirent_create_key(trans, dir, type, name, dst_inum); -+ ret = PTR_ERR_OR_ZERO(dirent); -+ if (ret) -+ return ret; -+ -+ ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, -+ dir, &dirent->k_i, flags); -+ *dir_offset = dirent->k.p.offset; -+ -+ return ret; -+} -+ -+static void dirent_copy_target(struct bkey_i_dirent *dst, -+ struct bkey_s_c_dirent src) -+{ -+ dst->v.d_inum = src.v->d_inum; -+ dst->v.d_type = src.v->d_type; -+} -+ -+int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, -+ struct bkey_s_c_dirent d, subvol_inum *target) -+{ -+ struct bch_subvolume s; -+ int ret = 0; -+ -+ if (d.v->d_type == DT_SUBVOL && -+ le32_to_cpu(d.v->d_parent_subvol) != dir.subvol) -+ return 1; -+ -+ if (likely(d.v->d_type != DT_SUBVOL)) { -+ target->subvol = dir.subvol; -+ target->inum = le64_to_cpu(d.v->d_inum); -+ } else { -+ target->subvol = le32_to_cpu(d.v->d_child_subvol); -+ -+ ret = 
bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); -+ -+ target->inum = le64_to_cpu(s.inode); -+ } -+ -+ return ret; -+} -+ -+int bch2_dirent_rename(struct btree_trans *trans, -+ subvol_inum src_dir, struct bch_hash_info *src_hash, -+ subvol_inum dst_dir, struct bch_hash_info *dst_hash, -+ const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, -+ const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, -+ enum bch_rename_mode mode) -+{ -+ struct btree_iter src_iter = { NULL }; -+ struct btree_iter dst_iter = { NULL }; -+ struct bkey_s_c old_src, old_dst = bkey_s_c_null; -+ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; -+ struct bpos dst_pos = -+ POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); -+ unsigned src_type = 0, dst_type = 0, src_update_flags = 0; -+ int ret = 0; -+ -+ if (src_dir.subvol != dst_dir.subvol) -+ return -EXDEV; -+ -+ memset(src_inum, 0, sizeof(*src_inum)); -+ memset(dst_inum, 0, sizeof(*dst_inum)); -+ -+ /* Lookup src: */ -+ ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, -+ src_hash, src_dir, src_name, -+ BTREE_ITER_INTENT); -+ if (ret) -+ goto out; -+ -+ old_src = bch2_btree_iter_peek_slot(&src_iter); -+ ret = bkey_err(old_src); -+ if (ret) -+ goto out; -+ -+ ret = bch2_dirent_read_target(trans, src_dir, -+ bkey_s_c_to_dirent(old_src), src_inum); -+ if (ret) -+ goto out; -+ -+ src_type = bkey_s_c_to_dirent(old_src).v->d_type; -+ -+ if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) -+ return -EOPNOTSUPP; -+ -+ -+ /* Lookup dst: */ -+ if (mode == BCH_RENAME) { -+ /* -+ * Note that we're _not_ checking if the target already exists - -+ * we're relying on the VFS to do that check for us for -+ * correctness: -+ */ -+ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, -+ dst_hash, dst_dir, dst_name); -+ if (ret) -+ goto out; -+ } else { -+ ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, -+ dst_hash, dst_dir, dst_name, -+ BTREE_ITER_INTENT); -+ if (ret) -+ goto out; -+ -+ old_dst = bch2_btree_iter_peek_slot(&dst_iter); -+ ret = bkey_err(old_dst); -+ if (ret) -+ goto out; -+ -+ ret = bch2_dirent_read_target(trans, dst_dir, -+ bkey_s_c_to_dirent(old_dst), dst_inum); -+ if (ret) -+ goto out; -+ -+ dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; -+ -+ if (dst_type == DT_SUBVOL) -+ return -EOPNOTSUPP; -+ } -+ -+ if (mode != BCH_RENAME_EXCHANGE) -+ *src_offset = dst_iter.pos.offset; -+ -+ /* Create new dst key: */ -+ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); -+ ret = PTR_ERR_OR_ZERO(new_dst); -+ if (ret) -+ goto out; -+ -+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); -+ new_dst->k.p = dst_iter.pos; -+ -+ /* Create new src key: */ -+ if (mode == BCH_RENAME_EXCHANGE) { -+ new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); -+ ret = PTR_ERR_OR_ZERO(new_src); -+ if (ret) -+ goto out; -+ -+ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); -+ new_src->k.p = src_iter.pos; -+ } else { -+ new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); -+ ret = PTR_ERR_OR_ZERO(new_src); -+ if (ret) -+ goto out; -+ -+ bkey_init(&new_src->k); -+ new_src->k.p = src_iter.pos; -+ -+ if (bkey_le(dst_pos, src_iter.pos) && -+ bkey_lt(src_iter.pos, dst_iter.pos)) { -+ /* -+ * We have a hash collision for the new dst key, -+ * and new_src - the key we're deleting - is between -+ * new_dst's hashed slot and the slot we're going to be -+ * inserting it into - oops. 
This will break the hash -+ * table if we don't deal with it: -+ */ -+ if (mode == BCH_RENAME) { -+ /* -+ * If we're not overwriting, we can just insert -+ * new_dst at the src position: -+ */ -+ new_src = new_dst; -+ new_src->k.p = src_iter.pos; -+ goto out_set_src; -+ } else { -+ /* If we're overwriting, we can't insert new_dst -+ * at a different slot because it has to -+ * overwrite old_dst - just make sure to use a -+ * whiteout when deleting src: -+ */ -+ new_src->k.type = KEY_TYPE_hash_whiteout; -+ } -+ } else { -+ /* Check if we need a whiteout to delete src: */ -+ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, -+ src_hash, &src_iter); -+ if (ret < 0) -+ goto out; -+ -+ if (ret) -+ new_src->k.type = KEY_TYPE_hash_whiteout; -+ } -+ } -+ -+ ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); -+ if (ret) -+ goto out; -+out_set_src: -+ -+ /* -+ * If we're deleting a subvolume, we need to really delete the dirent, -+ * not just emit a whiteout in the current snapshot: -+ */ -+ if (src_type == DT_SUBVOL) { -+ bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); -+ ret = bch2_btree_iter_traverse(&src_iter); -+ if (ret) -+ goto out; -+ -+ new_src->k.p = src_iter.pos; -+ src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; -+ } -+ -+ ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); -+ if (ret) -+ goto out; -+ -+ if (mode == BCH_RENAME_EXCHANGE) -+ *src_offset = new_src->k.p.offset; -+ *dst_offset = new_dst->k.p.offset; -+out: -+ bch2_trans_iter_exit(trans, &src_iter); -+ bch2_trans_iter_exit(trans, &dst_iter); -+ return ret; -+} -+ -+int __bch2_dirent_lookup_trans(struct btree_trans *trans, -+ struct btree_iter *iter, -+ subvol_inum dir, -+ const struct bch_hash_info *hash_info, -+ const struct qstr *name, subvol_inum *inum, -+ unsigned flags) -+{ -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent d; -+ u32 snapshot; -+ int ret; -+ -+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); -+ if (ret) -+ return ret; -+ -+ ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, -+ hash_info, dir, name, flags); -+ if (ret) -+ return ret; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ d = bkey_s_c_to_dirent(k); -+ -+ ret = bch2_dirent_read_target(trans, dir, d, inum); -+ if (ret > 0) -+ ret = -ENOENT; -+err: -+ if (ret) -+ bch2_trans_iter_exit(trans, iter); -+ -+ return ret; -+} -+ -+u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, -+ const struct bch_hash_info *hash_info, -+ const struct qstr *name, subvol_inum *inum) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, -+ name, inum, 0); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ if (!ret) -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u32 snapshot; -+ int ret; -+ -+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); -+ if (ret) -+ return ret; -+ -+ for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, -+ SPOS(dir.inum, 0, snapshot), -+ POS(dir.inum, U64_MAX), 0, k, ret) -+ if (k.k->type == KEY_TYPE_dirent) { -+ ret = -ENOTEMPTY; -+ break; -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+int 
bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent dirent; -+ subvol_inum target; -+ u32 snapshot; -+ struct bkey_buf sk; -+ struct qstr name; -+ int ret; -+ -+ bch2_bkey_buf_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, -+ SPOS(inum.inum, ctx->pos, snapshot), -+ POS(inum.inum, U64_MAX), 0, k, ret) { -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ dirent = bkey_s_c_to_dirent(k); -+ -+ ret = bch2_dirent_read_target(&trans, inum, dirent, &target); -+ if (ret < 0) -+ break; -+ if (ret) -+ continue; -+ -+ /* dir_emit() can fault and block: */ -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ dirent = bkey_i_to_s_c_dirent(sk.k); -+ bch2_trans_unlock(&trans); -+ -+ name = bch2_dirent_get_name(dirent); -+ -+ ctx->pos = dirent.k->p.offset; -+ if (!dir_emit(ctx, name.name, -+ name.len, -+ target.inum, -+ vfs_d_type(dirent.v->d_type))) -+ break; -+ ctx->pos = dirent.k->p.offset + 1; -+ -+ /* -+ * read_target looks up subvolumes, we can overflow paths if the -+ * directory has many subvolumes in it -+ */ -+ ret = btree_trans_too_many_iters(&trans); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&sk, c); -+ -+ return ret; -+} -diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h -new file mode 100644 -index 000000000..e9fa1df38 ---- /dev/null -+++ b/fs/bcachefs/dirent.h -@@ -0,0 +1,70 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_DIRENT_H -+#define _BCACHEFS_DIRENT_H -+ -+#include "str_hash.h" -+ -+enum bkey_invalid_flags; -+extern const struct bch_hash_desc bch2_dirent_hash_desc; -+ -+int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_dirent ((struct bkey_ops) { \ -+ .key_invalid = bch2_dirent_invalid, \ -+ .val_to_text = bch2_dirent_to_text, \ -+ .min_val_size = 16, \ -+}) -+ -+struct qstr; -+struct file; -+struct dir_context; -+struct bch_fs; -+struct bch_hash_info; -+struct bch_inode_info; -+ -+struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); -+ -+static inline unsigned dirent_val_u64s(unsigned len) -+{ -+ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, -+ sizeof(u64)); -+} -+ -+int bch2_dirent_read_target(struct btree_trans *, subvol_inum, -+ struct bkey_s_c_dirent, subvol_inum *); -+ -+int bch2_dirent_create(struct btree_trans *, subvol_inum, -+ const struct bch_hash_info *, u8, -+ const struct qstr *, u64, u64 *, int); -+ -+static inline unsigned vfs_d_type(unsigned type) -+{ -+ return type == DT_SUBVOL ? 
DT_DIR : type; -+} -+ -+enum bch_rename_mode { -+ BCH_RENAME, -+ BCH_RENAME_OVERWRITE, -+ BCH_RENAME_EXCHANGE, -+}; -+ -+int bch2_dirent_rename(struct btree_trans *, -+ subvol_inum, struct bch_hash_info *, -+ subvol_inum, struct bch_hash_info *, -+ const struct qstr *, subvol_inum *, u64 *, -+ const struct qstr *, subvol_inum *, u64 *, -+ enum bch_rename_mode); -+ -+int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, -+ subvol_inum, const struct bch_hash_info *, -+ const struct qstr *, subvol_inum *, unsigned); -+u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, -+ const struct bch_hash_info *, -+ const struct qstr *, subvol_inum *); -+ -+int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); -+int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); -+ -+#endif /* _BCACHEFS_DIRENT_H */ -diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c -new file mode 100644 -index 000000000..f36472c4a ---- /dev/null -+++ b/fs/bcachefs/disk_groups.c -@@ -0,0 +1,556 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "disk_groups.h" -+#include "sb-members.h" -+#include "super-io.h" -+ -+#include -+ -+static int group_cmp(const void *_l, const void *_r) -+{ -+ const struct bch_disk_group *l = _l; -+ const struct bch_disk_group *r = _r; -+ -+ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - -+ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: -+ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - -+ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: -+ strncmp(l->label, r->label, sizeof(l->label)); -+} -+ -+static int bch2_sb_disk_groups_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ field_to_type(f, disk_groups); -+ struct bch_disk_group *g, *sorted = NULL; -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ unsigned nr_groups = disk_groups_nr(groups); -+ unsigned i, len; -+ int ret = 0; -+ -+ for (i = 0; i < sb->nr_devices; i++) { -+ struct bch_member *m = mi->members + i; -+ unsigned g; -+ -+ if (!BCH_MEMBER_GROUP(m)) -+ continue; -+ -+ g = BCH_MEMBER_GROUP(m) - 1; -+ -+ if (g >= nr_groups) { -+ prt_printf(err, "disk %u has invalid label %u (have %u)", -+ i, g, nr_groups); -+ return -BCH_ERR_invalid_sb_disk_groups; -+ } -+ -+ if (BCH_GROUP_DELETED(&groups->entries[g])) { -+ prt_printf(err, "disk %u has deleted label %u", i, g); -+ return -BCH_ERR_invalid_sb_disk_groups; -+ } -+ } -+ -+ if (!nr_groups) -+ return 0; -+ -+ for (i = 0; i < nr_groups; i++) { -+ g = groups->entries + i; -+ -+ if (BCH_GROUP_DELETED(g)) -+ continue; -+ -+ len = strnlen(g->label, sizeof(g->label)); -+ if (!len) { -+ prt_printf(err, "label %u empty", i); -+ return -BCH_ERR_invalid_sb_disk_groups; -+ } -+ } -+ -+ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); -+ if (!sorted) -+ return -BCH_ERR_ENOMEM_disk_groups_validate; -+ -+ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); -+ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); -+ -+ for (g = sorted; g + 1 < sorted + nr_groups; g++) -+ if (!BCH_GROUP_DELETED(g) && -+ !group_cmp(&g[0], &g[1])) { -+ prt_printf(err, "duplicate label %llu.%.*s", -+ BCH_GROUP_PARENT(g), -+ (int) sizeof(g->label), g->label); -+ ret = -BCH_ERR_invalid_sb_disk_groups; -+ goto err; -+ } -+err: -+ kfree(sorted); -+ return ret; -+} -+ -+void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bch_disk_groups_cpu *g; -+ struct bch_dev *ca; -+ int i; -+ unsigned iter; 
-+ -+ out->atomic++; -+ rcu_read_lock(); -+ -+ g = rcu_dereference(c->disk_groups); -+ if (!g) -+ goto out; -+ -+ for (i = 0; i < g->nr; i++) { -+ if (i) -+ prt_printf(out, " "); -+ -+ if (g->entries[i].deleted) { -+ prt_printf(out, "[deleted]"); -+ continue; -+ } -+ -+ prt_printf(out, "[parent %d devs", g->entries[i].parent); -+ for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs) -+ prt_printf(out, " %s", ca->name); -+ prt_printf(out, "]"); -+ } -+ -+out: -+ rcu_read_unlock(); -+ out->atomic--; -+} -+ -+static void bch2_sb_disk_groups_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ field_to_type(f, disk_groups); -+ struct bch_disk_group *g; -+ unsigned nr_groups = disk_groups_nr(groups); -+ -+ for (g = groups->entries; -+ g < groups->entries + nr_groups; -+ g++) { -+ if (g != groups->entries) -+ prt_printf(out, " "); -+ -+ if (BCH_GROUP_DELETED(g)) -+ prt_printf(out, "[deleted]"); -+ else -+ prt_printf(out, "[parent %llu name %s]", -+ BCH_GROUP_PARENT(g), g->label); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { -+ .validate = bch2_sb_disk_groups_validate, -+ .to_text = bch2_sb_disk_groups_to_text -+}; -+ -+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_sb_field_disk_groups *groups; -+ struct bch_disk_groups_cpu *cpu_g, *old_g; -+ unsigned i, g, nr_groups; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ groups = bch2_sb_get_disk_groups(c->disk_sb.sb); -+ nr_groups = disk_groups_nr(groups); -+ -+ if (!groups) -+ return 0; -+ -+ cpu_g = kzalloc(sizeof(*cpu_g) + -+ sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); -+ if (!cpu_g) -+ return -BCH_ERR_ENOMEM_disk_groups_to_cpu; -+ -+ cpu_g->nr = nr_groups; -+ -+ for (i = 0; i < nr_groups; i++) { -+ struct bch_disk_group *src = &groups->entries[i]; -+ struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; -+ -+ dst->deleted = BCH_GROUP_DELETED(src); -+ dst->parent = BCH_GROUP_PARENT(src); -+ } -+ -+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { -+ struct bch_member *m = mi->members + i; -+ struct bch_disk_group_cpu *dst = -+ &cpu_g->entries[BCH_MEMBER_GROUP(m)]; -+ -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ g = BCH_MEMBER_GROUP(m); -+ while (g) { -+ dst = &cpu_g->entries[g - 1]; -+ __set_bit(i, dst->devs.d); -+ g = dst->parent; -+ } -+ } -+ -+ old_g = rcu_dereference_protected(c->disk_groups, -+ lockdep_is_held(&c->sb_lock)); -+ rcu_assign_pointer(c->disk_groups, cpu_g); -+ if (old_g) -+ kfree_rcu(old_g, rcu); -+ -+ return 0; -+} -+ -+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) -+{ -+ struct target t = target_decode(target); -+ struct bch_devs_mask *devs; -+ -+ rcu_read_lock(); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ devs = NULL; -+ break; -+ case TARGET_DEV: { -+ struct bch_dev *ca = t.dev < c->sb.nr_devices -+ ? rcu_dereference(c->devs[t.dev]) -+ : NULL; -+ devs = ca ? &ca->self : NULL; -+ break; -+ } -+ case TARGET_GROUP: { -+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); -+ -+ devs = g && t.group < g->nr && !g->entries[t.group].deleted -+ ? 
&g->entries[t.group].devs -+ : NULL; -+ break; -+ } -+ default: -+ BUG(); -+ } -+ -+ rcu_read_unlock(); -+ -+ return devs; -+} -+ -+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) -+{ -+ struct target t = target_decode(target); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ return false; -+ case TARGET_DEV: -+ return dev == t.dev; -+ case TARGET_GROUP: { -+ struct bch_disk_groups_cpu *g; -+ const struct bch_devs_mask *m; -+ bool ret; -+ -+ rcu_read_lock(); -+ g = rcu_dereference(c->disk_groups); -+ m = g && t.group < g->nr && !g->entries[t.group].deleted -+ ? &g->entries[t.group].devs -+ : NULL; -+ -+ ret = m ? test_bit(dev, m->d) : false; -+ rcu_read_unlock(); -+ -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, -+ unsigned parent, -+ const char *name, unsigned namelen) -+{ -+ unsigned i, nr_groups = disk_groups_nr(groups); -+ -+ if (!namelen || namelen > BCH_SB_LABEL_SIZE) -+ return -EINVAL; -+ -+ for (i = 0; i < nr_groups; i++) { -+ struct bch_disk_group *g = groups->entries + i; -+ -+ if (BCH_GROUP_DELETED(g)) -+ continue; -+ -+ if (!BCH_GROUP_DELETED(g) && -+ BCH_GROUP_PARENT(g) == parent && -+ strnlen(g->label, sizeof(g->label)) == namelen && -+ !memcmp(name, g->label, namelen)) -+ return i; -+ } -+ -+ return -1; -+} -+ -+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, -+ const char *name, unsigned namelen) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); -+ unsigned i, nr_groups = disk_groups_nr(groups); -+ struct bch_disk_group *g; -+ -+ if (!namelen || namelen > BCH_SB_LABEL_SIZE) -+ return -EINVAL; -+ -+ for (i = 0; -+ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); -+ i++) -+ ; -+ -+ if (i == nr_groups) { -+ unsigned u64s = -+ (sizeof(struct bch_sb_field_disk_groups) + -+ sizeof(struct bch_disk_group) * (nr_groups + 1)) / -+ sizeof(u64); -+ -+ groups = bch2_sb_resize_disk_groups(sb, u64s); -+ if (!groups) -+ return -BCH_ERR_ENOSPC_disk_label_add; -+ -+ nr_groups = disk_groups_nr(groups); -+ } -+ -+ BUG_ON(i >= nr_groups); -+ -+ g = &groups->entries[i]; -+ -+ memcpy(g->label, name, namelen); -+ if (namelen < sizeof(g->label)) -+ g->label[namelen] = '\0'; -+ SET_BCH_GROUP_DELETED(g, 0); -+ SET_BCH_GROUP_PARENT(g, parent); -+ SET_BCH_GROUP_DATA_ALLOWED(g, ~0); -+ -+ return i; -+} -+ -+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); -+ int v = -1; -+ -+ do { -+ const char *next = strchrnul(name, '.'); -+ unsigned len = next - name; -+ -+ if (*next == '.') -+ next++; -+ -+ v = __bch2_disk_group_find(groups, v + 1, name, len); -+ name = next; -+ } while (*name && v >= 0); -+ -+ return v; -+} -+ -+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) -+{ -+ struct bch_sb_field_disk_groups *groups; -+ unsigned parent = 0; -+ int v = -1; -+ -+ do { -+ const char *next = strchrnul(name, '.'); -+ unsigned len = next - name; -+ -+ if (*next == '.') -+ next++; -+ -+ groups = bch2_sb_get_disk_groups(sb->sb); -+ -+ v = __bch2_disk_group_find(groups, parent, name, len); -+ if (v < 0) -+ v = __bch2_disk_group_add(sb, parent, name, len); -+ if (v < 0) -+ return v; -+ -+ parent = v + 1; -+ name = next; -+ } while (*name && v >= 0); -+ -+ return v; -+} -+ -+void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ 
bch2_sb_get_disk_groups(sb); -+ struct bch_disk_group *g; -+ unsigned nr = 0; -+ u16 path[32]; -+ -+ while (1) { -+ if (nr == ARRAY_SIZE(path)) -+ goto inval; -+ -+ if (v >= disk_groups_nr(groups)) -+ goto inval; -+ -+ g = groups->entries + v; -+ -+ if (BCH_GROUP_DELETED(g)) -+ goto inval; -+ -+ path[nr++] = v; -+ -+ if (!BCH_GROUP_PARENT(g)) -+ break; -+ -+ v = BCH_GROUP_PARENT(g) - 1; -+ } -+ -+ while (nr) { -+ v = path[--nr]; -+ g = groups->entries + v; -+ -+ prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); -+ if (nr) -+ prt_printf(out, "."); -+ } -+ return; -+inval: -+ prt_printf(out, "invalid label %u", v); -+} -+ -+int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -+{ -+ struct bch_member *mi; -+ int ret, v = -1; -+ -+ if (!strlen(name) || !strcmp(name, "none")) -+ return 0; -+ -+ v = bch2_disk_path_find_or_create(&c->disk_sb, name); -+ if (v < 0) -+ return v; -+ -+ ret = bch2_sb_disk_groups_to_cpu(c); -+ if (ret) -+ return ret; -+ -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ SET_BCH_MEMBER_GROUP(mi, v + 1); -+ return 0; -+} -+ -+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -+{ -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ ret = __bch2_dev_group_set(c, ca, name) ?: -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, -+ struct printbuf *err) -+{ -+ struct bch_dev *ca; -+ int g; -+ -+ if (!val) -+ return -EINVAL; -+ -+ if (!c) -+ return 0; -+ -+ if (!strlen(val) || !strcmp(val, "none")) { -+ *res = 0; -+ return 0; -+ } -+ -+ /* Is it a device? */ -+ ca = bch2_dev_lookup(c, val); -+ if (!IS_ERR(ca)) { -+ *res = dev_to_target(ca->dev_idx); -+ percpu_ref_put(&ca->ref); -+ return 0; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ g = bch2_disk_path_find(&c->disk_sb, val); -+ mutex_unlock(&c->sb_lock); -+ -+ if (g >= 0) { -+ *res = group_to_target(g); -+ return 0; -+ } -+ -+ return -EINVAL; -+} -+ -+void bch2_opt_target_to_text(struct printbuf *out, -+ struct bch_fs *c, -+ struct bch_sb *sb, -+ u64 v) -+{ -+ struct target t = target_decode(v); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ prt_printf(out, "none"); -+ break; -+ case TARGET_DEV: -+ if (c) { -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ ca = t.dev < c->sb.nr_devices -+ ? 
rcu_dereference(c->devs[t.dev]) -+ : NULL; -+ -+ if (ca && percpu_ref_tryget(&ca->io_ref)) { -+ prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); -+ percpu_ref_put(&ca->io_ref); -+ } else if (ca) { -+ prt_printf(out, "offline device %u", t.dev); -+ } else { -+ prt_printf(out, "invalid device %u", t.dev); -+ } -+ -+ rcu_read_unlock(); -+ } else { -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ struct bch_member *m = mi->members + t.dev; -+ -+ if (bch2_dev_exists(sb, mi, t.dev)) { -+ prt_printf(out, "Device "); -+ pr_uuid(out, m->uuid.b); -+ prt_printf(out, " (%u)", t.dev); -+ } else { -+ prt_printf(out, "Bad device %u", t.dev); -+ } -+ } -+ break; -+ case TARGET_GROUP: -+ if (c) { -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); -+ mutex_unlock(&c->sb_lock); -+ } else { -+ bch2_disk_path_to_text(out, sb, t.group); -+ } -+ break; -+ default: -+ BUG(); -+ } -+} -diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h -new file mode 100644 -index 000000000..bd7711767 ---- /dev/null -+++ b/fs/bcachefs/disk_groups.h -@@ -0,0 +1,106 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_DISK_GROUPS_H -+#define _BCACHEFS_DISK_GROUPS_H -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; -+ -+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) -+{ -+ return groups -+ ? (vstruct_end(&groups->field) - -+ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) -+ : 0; -+} -+ -+struct target { -+ enum { -+ TARGET_NULL, -+ TARGET_DEV, -+ TARGET_GROUP, -+ } type; -+ union { -+ unsigned dev; -+ unsigned group; -+ }; -+}; -+ -+#define TARGET_DEV_START 1 -+#define TARGET_GROUP_START (256 + TARGET_DEV_START) -+ -+static inline u16 dev_to_target(unsigned dev) -+{ -+ return TARGET_DEV_START + dev; -+} -+ -+static inline u16 group_to_target(unsigned group) -+{ -+ return TARGET_GROUP_START + group; -+} -+ -+static inline struct target target_decode(unsigned target) -+{ -+ if (target >= TARGET_GROUP_START) -+ return (struct target) { -+ .type = TARGET_GROUP, -+ .group = target - TARGET_GROUP_START -+ }; -+ -+ if (target >= TARGET_DEV_START) -+ return (struct target) { -+ .type = TARGET_DEV, -+ .group = target - TARGET_DEV_START -+ }; -+ -+ return (struct target) { .type = TARGET_NULL }; -+} -+ -+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); -+ -+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, -+ enum bch_data_type data_type, -+ u16 target) -+{ -+ struct bch_devs_mask devs = c->rw_devs[data_type]; -+ const struct bch_devs_mask *t = bch2_target_to_mask(c, target); -+ -+ if (t) -+ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); -+ return devs; -+} -+ -+static inline bool bch2_target_accepts_data(struct bch_fs *c, -+ enum bch_data_type data_type, -+ u16 target) -+{ -+ struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target); -+ return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX); -+} -+ -+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); -+ -+int bch2_disk_path_find(struct bch_sb_handle *, const char *); -+ -+/* Exported for userspace bcachefs-tools: */ -+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); -+ -+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); -+ -+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); -+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); -+ -+#define bch2_opt_target (struct 
bch_opt_fn) { \ -+ .parse = bch2_opt_target_parse, \ -+ .to_text = bch2_opt_target_to_text, \ -+} -+ -+int bch2_sb_disk_groups_to_cpu(struct bch_fs *); -+ -+int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); -+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); -+ -+const char *bch2_sb_validate_disk_groups(struct bch_sb *, -+ struct bch_sb_field *); -+ -+void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *); -+ -+#endif /* _BCACHEFS_DISK_GROUPS_H */ -diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c -new file mode 100644 -index 000000000..f58e84a2b ---- /dev/null -+++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1972 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+/* erasure coding */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "backpointers.h" -+#include "bkey_buf.h" -+#include "bset.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "btree_write_buffer.h" -+#include "buckets.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "io.h" -+#include "keylist.h" -+#include "recovery.h" -+#include "replicas.h" -+#include "super-io.h" -+#include "util.h" -+ -+#include -+ -+#ifdef __KERNEL__ -+ -+#include -+#include -+ -+static void raid5_recov(unsigned disks, unsigned failed_idx, -+ size_t size, void **data) -+{ -+ unsigned i = 2, nr; -+ -+ BUG_ON(failed_idx >= disks); -+ -+ swap(data[0], data[failed_idx]); -+ memcpy(data[0], data[1], size); -+ -+ while (i < disks) { -+ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); -+ xor_blocks(nr, size, data[0], data + i); -+ i += nr; -+ } -+ -+ swap(data[0], data[failed_idx]); -+} -+ -+static void raid_gen(int nd, int np, size_t size, void **v) -+{ -+ if (np >= 1) -+ raid5_recov(nd + np, nd, size, v); -+ if (np >= 2) -+ raid6_call.gen_syndrome(nd + np, size, v); -+ BUG_ON(np > 2); -+} -+ -+static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) -+{ -+ switch (nr) { -+ case 0: -+ break; -+ case 1: -+ if (ir[0] < nd + 1) -+ raid5_recov(nd + 1, ir[0], size, v); -+ else -+ raid6_call.gen_syndrome(nd + np, size, v); -+ break; -+ case 2: -+ if (ir[1] < nd) { -+ /* data+data failure. 
*/ -+ raid6_2data_recov(nd + np, size, ir[0], ir[1], v); -+ } else if (ir[0] < nd) { -+ /* data + p/q failure */ -+ -+ if (ir[1] == nd) /* data + p failure */ -+ raid6_datap_recov(nd + np, size, ir[0], v); -+ else { /* data + q failure */ -+ raid5_recov(nd + 1, ir[0], size, v); -+ raid6_call.gen_syndrome(nd + np, size, v); -+ } -+ } else { -+ raid_gen(nd, np, size, v); -+ } -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+#else -+ -+#include -+ -+#endif -+ -+struct ec_bio { -+ struct bch_dev *ca; -+ struct ec_stripe_buf *buf; -+ size_t idx; -+ struct bio bio; -+}; -+ -+/* Stripes btree keys: */ -+ -+int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; -+ -+ if (bkey_eq(k.k->p, POS_MIN)) { -+ prt_printf(err, "stripe at POS_MIN"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (k.k->p.inode) { -+ prt_printf(err, "nonzero inode field"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { -+ prt_printf(err, "incorrect value size (%zu < %u)", -+ bkey_val_u64s(k.k), stripe_val_u64s(s)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return bch2_bkey_ptrs_invalid(c, k, flags, err); -+} -+ -+void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; -+ unsigned i, nr_data = s->nr_blocks - s->nr_redundant; -+ -+ prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", -+ s->algorithm, -+ le16_to_cpu(s->sectors), -+ nr_data, -+ s->nr_redundant, -+ s->csum_type, -+ 1U << s->csum_granularity_bits); -+ -+ for (i = 0; i < s->nr_blocks; i++) { -+ const struct bch_extent_ptr *ptr = s->ptrs + i; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ u32 offset; -+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); -+ -+ prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); -+ if (i < nr_data) -+ prt_printf(out, "#%u", stripe_blockcount_get(s, i)); -+ if (ptr_stale(ca, ptr)) -+ prt_printf(out, " stale"); -+ } -+} -+ -+/* returns blocknr in stripe that we matched: */ -+static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, -+ struct bkey_s_c k, unsigned *block) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ unsigned i, nr_data = s->nr_blocks - s->nr_redundant; -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ for (i = 0; i < nr_data; i++) -+ if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, -+ le16_to_cpu(s->sectors))) { -+ *block = i; -+ return ptr; -+ } -+ -+ return NULL; -+} -+ -+static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const union bch_extent_entry *entry; -+ -+ extent_for_each_entry(e, entry) -+ if (extent_entry_type(entry) == -+ BCH_EXTENT_ENTRY_stripe_ptr && -+ entry->stripe_ptr.idx == idx) -+ return true; -+ -+ break; -+ } -+ } -+ -+ return false; -+} -+ -+/* Stripe bufs: */ -+ -+static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) -+{ -+ if (buf->key.k.type == KEY_TYPE_stripe) { -+ struct bkey_i_stripe *s = bkey_i_to_stripe(&buf->key); -+ unsigned i; -+ -+ for (i = 0; i < s->v.nr_blocks; i++) { -+ kvpfree(buf->data[i], buf->size << 9); -+ buf->data[i] = NULL; -+ } -+ } -+} -+ -+/* XXX: this is a non-mempoolified memory allocation: */ -+static int ec_stripe_buf_init(struct ec_stripe_buf *buf, -+ unsigned offset, unsigned size) -+{ -+ 
struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; -+ unsigned csum_granularity = 1U << v->csum_granularity_bits; -+ unsigned end = offset + size; -+ unsigned i; -+ -+ BUG_ON(end > le16_to_cpu(v->sectors)); -+ -+ offset = round_down(offset, csum_granularity); -+ end = min_t(unsigned, le16_to_cpu(v->sectors), -+ round_up(end, csum_granularity)); -+ -+ buf->offset = offset; -+ buf->size = end - offset; -+ -+ memset(buf->valid, 0xFF, sizeof(buf->valid)); -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); -+ if (!buf->data[i]) -+ goto err; -+ } -+ -+ return 0; -+err: -+ ec_stripe_buf_exit(buf); -+ return -BCH_ERR_ENOMEM_stripe_buf; -+} -+ -+/* Checksumming: */ -+ -+static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, -+ unsigned block, unsigned offset) -+{ -+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; -+ unsigned csum_granularity = 1 << v->csum_granularity_bits; -+ unsigned end = buf->offset + buf->size; -+ unsigned len = min(csum_granularity, end - offset); -+ -+ BUG_ON(offset >= end); -+ BUG_ON(offset < buf->offset); -+ BUG_ON(offset & (csum_granularity - 1)); -+ BUG_ON(offset + len != le16_to_cpu(v->sectors) && -+ (len & (csum_granularity - 1))); -+ -+ return bch2_checksum(NULL, v->csum_type, -+ null_nonce(), -+ buf->data[block] + ((offset - buf->offset) << 9), -+ len << 9); -+} -+ -+static void ec_generate_checksums(struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; -+ unsigned i, j, csums_per_device = stripe_csums_per_device(v); -+ -+ if (!v->csum_type) -+ return; -+ -+ BUG_ON(buf->offset); -+ BUG_ON(buf->size != le16_to_cpu(v->sectors)); -+ -+ for (i = 0; i < v->nr_blocks; i++) -+ for (j = 0; j < csums_per_device; j++) -+ stripe_csum_set(v, i, j, -+ ec_block_checksum(buf, i, j << v->csum_granularity_bits)); -+} -+ -+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; -+ unsigned csum_granularity = 1 << v->csum_granularity_bits; -+ unsigned i; -+ -+ if (!v->csum_type) -+ return; -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ unsigned offset = buf->offset; -+ unsigned end = buf->offset + buf->size; -+ -+ if (!test_bit(i, buf->valid)) -+ continue; -+ -+ while (offset < end) { -+ unsigned j = offset >> v->csum_granularity_bits; -+ unsigned len = min(csum_granularity, end - offset); -+ struct bch_csum want = stripe_csum_get(v, i, j); -+ struct bch_csum got = ec_block_checksum(buf, i, offset); -+ -+ if (bch2_crc_cmp(want, got)) { -+ struct printbuf buf2 = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key)); -+ -+ bch_err_ratelimited(c, -+ "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", -+ (void *) _RET_IP_, i, j, v->csum_type, -+ want.lo, got.lo, buf2.buf); -+ printbuf_exit(&buf2); -+ clear_bit(i, buf->valid); -+ break; -+ } -+ -+ offset += len; -+ } -+ } -+} -+ -+/* Erasure coding: */ -+ -+static void ec_generate_ec(struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; -+ unsigned nr_data = v->nr_blocks - v->nr_redundant; -+ unsigned bytes = le16_to_cpu(v->sectors) << 9; -+ -+ raid_gen(nr_data, v->nr_redundant, bytes, buf->data); -+} -+ -+static unsigned ec_nr_failed(struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; -+ -+ return v->nr_blocks - bitmap_weight(buf->valid, v->nr_blocks); -+} -+ -+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf 
*buf) -+{ -+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; -+ unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; -+ unsigned nr_data = v->nr_blocks - v->nr_redundant; -+ unsigned bytes = buf->size << 9; -+ -+ if (ec_nr_failed(buf) > v->nr_redundant) { -+ bch_err_ratelimited(c, -+ "error doing reconstruct read: unable to read enough blocks"); -+ return -1; -+ } -+ -+ for (i = 0; i < nr_data; i++) -+ if (!test_bit(i, buf->valid)) -+ failed[nr_failed++] = i; -+ -+ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); -+ return 0; -+} -+ -+/* IO: */ -+ -+static void ec_block_endio(struct bio *bio) -+{ -+ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); -+ struct bch_stripe *v = &bkey_i_to_stripe(&ec_bio->buf->key)->v; -+ struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; -+ struct bch_dev *ca = ec_bio->ca; -+ struct closure *cl = bio->bi_private; -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s", -+ bio_data_dir(bio) ? "write" : "read", -+ bch2_blk_status_to_str(bio->bi_status))) -+ clear_bit(ec_bio->idx, ec_bio->buf->valid); -+ -+ if (ptr_stale(ca, ptr)) { -+ bch_err_ratelimited(ca->fs, -+ "error %s stripe: stale pointer after io", -+ bio_data_dir(bio) == READ ? "reading from" : "writing to"); -+ clear_bit(ec_bio->idx, ec_bio->buf->valid); -+ } -+ -+ bio_put(&ec_bio->bio); -+ percpu_ref_put(&ca->io_ref); -+ closure_put(cl); -+} -+ -+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, -+ blk_opf_t opf, unsigned idx, struct closure *cl) -+{ -+ struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; -+ unsigned offset = 0, bytes = buf->size << 9; -+ struct bch_extent_ptr *ptr = &v->ptrs[idx]; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ enum bch_data_type data_type = idx < v->nr_blocks - v->nr_redundant -+ ? BCH_DATA_user -+ : BCH_DATA_parity; -+ int rw = op_is_write(opf); -+ -+ if (ptr_stale(ca, ptr)) { -+ bch_err_ratelimited(c, -+ "error %s stripe: stale pointer", -+ rw == READ ? 
"reading from" : "writing to"); -+ clear_bit(idx, buf->valid); -+ return; -+ } -+ -+ if (!bch2_dev_get_ioref(ca, rw)) { -+ clear_bit(idx, buf->valid); -+ return; -+ } -+ -+ this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); -+ -+ while (offset < bytes) { -+ unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, -+ DIV_ROUND_UP(bytes, PAGE_SIZE)); -+ unsigned b = min_t(size_t, bytes - offset, -+ nr_iovecs << PAGE_SHIFT); -+ struct ec_bio *ec_bio; -+ -+ ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, -+ nr_iovecs, -+ opf, -+ GFP_KERNEL, -+ &c->ec_bioset), -+ struct ec_bio, bio); -+ -+ ec_bio->ca = ca; -+ ec_bio->buf = buf; -+ ec_bio->idx = idx; -+ -+ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); -+ ec_bio->bio.bi_end_io = ec_block_endio; -+ ec_bio->bio.bi_private = cl; -+ -+ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); -+ -+ closure_get(cl); -+ percpu_ref_get(&ca->io_ref); -+ -+ submit_bio(&ec_bio->bio); -+ -+ offset += b; -+ } -+ -+ percpu_ref_put(&ca->io_ref); -+} -+ -+static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, -+ struct ec_stripe_buf *stripe) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, -+ POS(0, idx), BTREE_ITER_SLOTS); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ if (k.k->type != KEY_TYPE_stripe) { -+ ret = -ENOENT; -+ goto err; -+ } -+ bkey_reassemble(&stripe->key, k); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) -+{ -+ return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe)); -+} -+ -+/* recovery read path: */ -+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) -+{ -+ struct ec_stripe_buf *buf; -+ struct closure cl; -+ struct bch_stripe *v; -+ unsigned i, offset; -+ int ret = 0; -+ -+ closure_init_stack(&cl); -+ -+ BUG_ON(!rbio->pick.has_ec); -+ -+ buf = kzalloc(sizeof(*buf), GFP_NOFS); -+ if (!buf) -+ return -BCH_ERR_ENOMEM_ec_read_extent; -+ -+ ret = get_stripe_key(c, rbio->pick.ec.idx, buf); -+ if (ret) { -+ bch_err_ratelimited(c, -+ "error doing reconstruct read: error %i looking up stripe", ret); -+ kfree(buf); -+ return -EIO; -+ } -+ -+ v = &bkey_i_to_stripe(&buf->key)->v; -+ -+ if (!bch2_ptr_matches_stripe(v, rbio->pick)) { -+ bch_err_ratelimited(c, -+ "error doing reconstruct read: pointer doesn't match stripe"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; -+ if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { -+ bch_err_ratelimited(c, -+ "error doing reconstruct read: read is bigger than stripe"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); -+ if (ret) -+ goto err; -+ -+ for (i = 0; i < v->nr_blocks; i++) -+ ec_block_io(c, buf, REQ_OP_READ, i, &cl); -+ -+ closure_sync(&cl); -+ -+ if (ec_nr_failed(buf) > v->nr_redundant) { -+ bch_err_ratelimited(c, -+ "error doing reconstruct read: unable to read enough blocks"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ ec_validate_checksums(c, buf); -+ -+ ret = ec_do_recov(c, buf); -+ if (ret) -+ goto err; -+ -+ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, -+ buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); -+err: -+ ec_stripe_buf_exit(buf); -+ kfree(buf); -+ return ret; -+} -+ -+/* stripe bucket accounting: */ -+ -+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) 
-+{ -+ ec_stripes_heap n, *h = &c->ec_stripes_heap; -+ -+ if (idx >= h->size) { -+ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) -+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; -+ -+ mutex_lock(&c->ec_stripes_heap_lock); -+ if (n.size > h->size) { -+ memcpy(n.data, h->data, h->used * sizeof(h->data[0])); -+ n.used = h->used; -+ swap(*h, n); -+ } -+ mutex_unlock(&c->ec_stripes_heap_lock); -+ -+ free_heap(&n); -+ } -+ -+ if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) -+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; -+ -+ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && -+ !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) -+ return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; -+ -+ return 0; -+} -+ -+static int ec_stripe_mem_alloc(struct btree_trans *trans, -+ struct btree_iter *iter) -+{ -+ return allocate_dropping_locks_errcode(trans, -+ __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); -+} -+ -+/* -+ * Hash table of open stripes: -+ * Stripes that are being created or modified are kept in a hash table, so that -+ * stripe deletion can skip them. -+ */ -+ -+static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) -+{ -+ unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); -+ struct ec_stripe_new *s; -+ -+ hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash) -+ if (s->idx == idx) -+ return true; -+ return false; -+} -+ -+static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) -+{ -+ bool ret = false; -+ -+ spin_lock(&c->ec_stripes_new_lock); -+ ret = __bch2_stripe_is_open(c, idx); -+ spin_unlock(&c->ec_stripes_new_lock); -+ -+ return ret; -+} -+ -+static bool bch2_try_open_stripe(struct bch_fs *c, -+ struct ec_stripe_new *s, -+ u64 idx) -+{ -+ bool ret; -+ -+ spin_lock(&c->ec_stripes_new_lock); -+ ret = !__bch2_stripe_is_open(c, idx); -+ if (ret) { -+ unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); -+ -+ s->idx = idx; -+ hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); -+ } -+ spin_unlock(&c->ec_stripes_new_lock); -+ -+ return ret; -+} -+ -+static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) -+{ -+ BUG_ON(!s->idx); -+ -+ spin_lock(&c->ec_stripes_new_lock); -+ hlist_del_init(&s->hash); -+ spin_unlock(&c->ec_stripes_new_lock); -+ -+ s->idx = 0; -+} -+ -+/* Heap of all existing stripes, ordered by blocks_nonempty */ -+ -+static u64 stripe_idx_to_delete(struct bch_fs *c) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ -+ lockdep_assert_held(&c->ec_stripes_heap_lock); -+ -+ if (h->used && -+ h->data[0].blocks_nonempty == 0 && -+ !bch2_stripe_is_open(c, h->data[0].idx)) -+ return h->data[0].idx; -+ -+ return 0; -+} -+ -+static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, -+ struct ec_stripe_heap_entry l, -+ struct ec_stripe_heap_entry r) -+{ -+ return ((l.blocks_nonempty > r.blocks_nonempty) - -+ (l.blocks_nonempty < r.blocks_nonempty)); -+} -+ -+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, -+ size_t i) -+{ -+ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); -+ -+ genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; -+} -+ -+static void heap_verify_backpointer(struct bch_fs *c, size_t idx) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m = genradix_ptr(&c->stripes, idx); -+ -+ BUG_ON(m->heap_idx >= h->used); -+ BUG_ON(h->data[m->heap_idx].idx != idx); -+} -+ -+void bch2_stripes_heap_del(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ mutex_lock(&c->ec_stripes_heap_lock); -+ heap_verify_backpointer(c, idx); -+ -+ 
heap_del(&c->ec_stripes_heap, m->heap_idx, -+ ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ mutex_unlock(&c->ec_stripes_heap_lock); -+} -+ -+void bch2_stripes_heap_insert(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ mutex_lock(&c->ec_stripes_heap_lock); -+ BUG_ON(heap_full(&c->ec_stripes_heap)); -+ -+ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { -+ .idx = idx, -+ .blocks_nonempty = m->blocks_nonempty, -+ }), -+ ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ -+ heap_verify_backpointer(c, idx); -+ mutex_unlock(&c->ec_stripes_heap_lock); -+} -+ -+void bch2_stripes_heap_update(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ bool do_deletes; -+ size_t i; -+ -+ mutex_lock(&c->ec_stripes_heap_lock); -+ heap_verify_backpointer(c, idx); -+ -+ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; -+ -+ i = m->heap_idx; -+ heap_sift_up(h, i, ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ heap_sift_down(h, i, ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ -+ heap_verify_backpointer(c, idx); -+ -+ do_deletes = stripe_idx_to_delete(c) != 0; -+ mutex_unlock(&c->ec_stripes_heap_lock); -+ -+ if (do_deletes) -+ bch2_do_stripe_deletes(c); -+} -+ -+/* stripe deletion */ -+ -+static int ec_stripe_delete(struct btree_trans *trans, u64 idx) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_stripe s; -+ int ret; -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), -+ BTREE_ITER_INTENT); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (k.k->type != KEY_TYPE_stripe) { -+ bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ s = bkey_s_c_to_stripe(k); -+ for (unsigned i = 0; i < s.v->nr_blocks; i++) -+ if (stripe_blockcount_get(s.v, i)) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf, c, k); -+ bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); -+ printbuf_exit(&buf); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ ret = bch2_btree_delete_at(trans, &iter, 0); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static void ec_stripe_delete_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, ec_stripe_delete_work); -+ struct btree_trans trans; -+ int ret; -+ u64 idx; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ while (1) { -+ mutex_lock(&c->ec_stripes_heap_lock); -+ idx = stripe_idx_to_delete(c); -+ mutex_unlock(&c->ec_stripes_heap_lock); -+ -+ if (!idx) -+ break; -+ -+ ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, -+ ec_stripe_delete(&trans, idx)); -+ if (ret) { -+ bch_err_fn(c, ret); -+ break; -+ } -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); -+} -+ -+void bch2_do_stripe_deletes(struct bch_fs *c) -+{ -+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && -+ !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) -+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); -+} -+ -+/* stripe creation: */ -+ -+static int ec_stripe_key_update(struct btree_trans *trans, -+ struct bkey_i_stripe *new, -+ bool create) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, -+ new->k.p, BTREE_ITER_INTENT); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if 
(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { -+ bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", -+ create ? "creating" : "updating", -+ bch2_bkey_types[k.k->type]); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ if (k.k->type == KEY_TYPE_stripe) { -+ const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; -+ unsigned i; -+ -+ if (old->nr_blocks != new->v.nr_blocks) { -+ bch_err(c, "error updating stripe: nr_blocks does not match"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ for (i = 0; i < new->v.nr_blocks; i++) { -+ unsigned v = stripe_blockcount_get(old, i); -+ -+ BUG_ON(v && -+ (old->ptrs[i].dev != new->v.ptrs[i].dev || -+ old->ptrs[i].gen != new->v.ptrs[i].gen || -+ old->ptrs[i].offset != new->v.ptrs[i].offset)); -+ -+ stripe_blockcount_set(&new->v, i, v); -+ } -+ } -+ -+ ret = bch2_trans_update(trans, &iter, &new->k_i, 0); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int ec_stripe_update_extent(struct btree_trans *trans, -+ struct bpos bucket, u8 gen, -+ struct ec_stripe_buf *s, -+ struct bpos *bp_pos) -+{ -+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; -+ struct bch_fs *c = trans->c; -+ struct bch_backpointer bp; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ const struct bch_extent_ptr *ptr_c; -+ struct bch_extent_ptr *ptr, *ec_ptr = NULL; -+ struct bch_extent_stripe_ptr stripe_ptr; -+ struct bkey_i *n; -+ int ret, dev, block; -+ -+ ret = bch2_get_next_backpointer(trans, bucket, gen, -+ bp_pos, &bp, BTREE_ITER_CACHED); -+ if (ret) -+ return ret; -+ if (bpos_eq(*bp_pos, SPOS_MAX)) -+ return 0; -+ -+ if (bp.level) { -+ struct printbuf buf = PRINTBUF; -+ struct btree_iter node_iter; -+ struct btree *b; -+ -+ b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); -+ bch2_trans_iter_exit(trans, &node_iter); -+ -+ if (!b) -+ return 0; -+ -+ prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); -+ bch2_backpointer_to_text(&buf, &bp); -+ -+ bch2_fs_inconsistent(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ return -EIO; -+ } -+ -+ k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ if (!k.k) { -+ /* -+ * extent no longer exists - we could flush the btree -+ * write buffer and retry to verify, but no need: -+ */ -+ return 0; -+ } -+ -+ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) -+ goto out; -+ -+ ptr_c = bkey_matches_stripe(v, k, &block); -+ /* -+ * It doesn't generally make sense to erasure code cached ptrs: -+ * XXX: should we be incrementing a counter? 
-+ */ -+ if (!ptr_c || ptr_c->cached) -+ goto out; -+ -+ dev = v->ptrs[block].dev; -+ -+ n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ goto out; -+ -+ bkey_reassemble(n, k); -+ -+ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); -+ ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); -+ BUG_ON(!ec_ptr); -+ -+ stripe_ptr = (struct bch_extent_stripe_ptr) { -+ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, -+ .block = block, -+ .redundancy = v->nr_redundant, -+ .idx = s->key.k.p.offset, -+ }; -+ -+ __extent_entry_insert(n, -+ (union bch_extent_entry *) ec_ptr, -+ (union bch_extent_entry *) &stripe_ptr); -+ -+ ret = bch2_trans_update(trans, &iter, n, 0); -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, -+ unsigned block) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; -+ struct bch_extent_ptr bucket = v->ptrs[block]; -+ struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); -+ struct bpos bp_pos = POS_MIN; -+ int ret = 0; -+ -+ while (1) { -+ ret = commit_do(trans, NULL, NULL, -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL, -+ ec_stripe_update_extent(trans, bucket_pos, bucket.gen, -+ s, &bp_pos)); -+ if (ret) -+ break; -+ if (bkey_eq(bp_pos, POS_MAX)) -+ break; -+ -+ bp_pos = bpos_nosnap_successor(bp_pos); -+ } -+ -+ return ret; -+} -+ -+static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) -+{ -+ struct btree_trans trans; -+ struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; -+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ ret = bch2_btree_write_buffer_flush(&trans); -+ if (ret) -+ goto err; -+ -+ for (i = 0; i < nr_data; i++) { -+ ret = ec_stripe_update_bucket(&trans, s, i); -+ if (ret) -+ break; -+ } -+err: -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static void zero_out_rest_of_ec_bucket(struct bch_fs *c, -+ struct ec_stripe_new *s, -+ unsigned block, -+ struct open_bucket *ob) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); -+ unsigned offset = ca->mi.bucket_size - ob->sectors_free; -+ int ret; -+ -+ if (!bch2_dev_get_ioref(ca, WRITE)) { -+ s->err = -BCH_ERR_erofs_no_writes; -+ return; -+ } -+ -+ memset(s->new_stripe.data[block] + (offset << 9), -+ 0, -+ ob->sectors_free << 9); -+ -+ ret = blkdev_issue_zeroout(ca->disk_sb.bdev, -+ ob->bucket * ca->mi.bucket_size + offset, -+ ob->sectors_free, -+ GFP_KERNEL, 0); -+ -+ percpu_ref_put(&ca->io_ref); -+ -+ if (ret) -+ s->err = ret; -+} -+ -+void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s) -+{ -+ if (s->idx) -+ bch2_stripe_close(c, s); -+ kfree(s); -+} -+ -+/* -+ * data buckets of new stripe all written: create the stripe -+ */ -+static void ec_stripe_create(struct ec_stripe_new *s) -+{ -+ struct bch_fs *c = s->c; -+ struct open_bucket *ob; -+ struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; -+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; -+ int ret; -+ -+ BUG_ON(s->h->s == s); -+ -+ closure_sync(&s->iodone); -+ -+ if (!s->err) { -+ for (i = 0; i < nr_data; i++) -+ if (s->blocks[i]) { -+ ob = c->open_buckets + s->blocks[i]; -+ -+ if (ob->sectors_free) -+ zero_out_rest_of_ec_bucket(c, s, i, ob); -+ } -+ } -+ -+ if (s->err) { -+ if (!bch2_err_matches(s->err, EROFS)) -+ bch_err(c, "error creating stripe: error writing data buckets"); -+ goto err; -+ } -+ -+ 
if (s->have_existing_stripe) { -+ ec_validate_checksums(c, &s->existing_stripe); -+ -+ if (ec_do_recov(c, &s->existing_stripe)) { -+ bch_err(c, "error creating stripe: error reading existing stripe"); -+ goto err; -+ } -+ -+ for (i = 0; i < nr_data; i++) -+ if (stripe_blockcount_get(&bkey_i_to_stripe(&s->existing_stripe.key)->v, i)) -+ swap(s->new_stripe.data[i], -+ s->existing_stripe.data[i]); -+ -+ ec_stripe_buf_exit(&s->existing_stripe); -+ } -+ -+ BUG_ON(!s->allocated); -+ BUG_ON(!s->idx); -+ -+ ec_generate_ec(&s->new_stripe); -+ -+ ec_generate_checksums(&s->new_stripe); -+ -+ /* write p/q: */ -+ for (i = nr_data; i < v->nr_blocks; i++) -+ ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); -+ closure_sync(&s->iodone); -+ -+ if (ec_nr_failed(&s->new_stripe)) { -+ bch_err(c, "error creating stripe: error writing redundancy buckets"); -+ goto err; -+ } -+ -+ ret = bch2_trans_do(c, &s->res, NULL, -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL, -+ ec_stripe_key_update(&trans, -+ bkey_i_to_stripe(&s->new_stripe.key), -+ !s->have_existing_stripe)); -+ if (ret) { -+ bch_err(c, "error creating stripe: error creating stripe key"); -+ goto err; -+ } -+ -+ ret = ec_stripe_update_extents(c, &s->new_stripe); -+ if (ret) { -+ bch_err(c, "error creating stripe: error updating pointers: %s", -+ bch2_err_str(ret)); -+ goto err; -+ } -+err: -+ bch2_disk_reservation_put(c, &s->res); -+ -+ for (i = 0; i < v->nr_blocks; i++) -+ if (s->blocks[i]) { -+ ob = c->open_buckets + s->blocks[i]; -+ -+ if (i < nr_data) { -+ ob->ec = NULL; -+ __bch2_open_bucket_put(c, ob); -+ } else { -+ bch2_open_bucket_put(c, ob); -+ } -+ } -+ -+ mutex_lock(&c->ec_stripe_new_lock); -+ list_del(&s->list); -+ mutex_unlock(&c->ec_stripe_new_lock); -+ wake_up(&c->ec_stripe_new_wait); -+ -+ ec_stripe_buf_exit(&s->existing_stripe); -+ ec_stripe_buf_exit(&s->new_stripe); -+ closure_debug_destroy(&s->iodone); -+ -+ ec_stripe_new_put(c, s, STRIPE_REF_stripe); -+} -+ -+static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) -+{ -+ struct ec_stripe_new *s; -+ -+ mutex_lock(&c->ec_stripe_new_lock); -+ list_for_each_entry(s, &c->ec_stripe_new_list, list) -+ if (!atomic_read(&s->ref[STRIPE_REF_io])) -+ goto out; -+ s = NULL; -+out: -+ mutex_unlock(&c->ec_stripe_new_lock); -+ -+ return s; -+} -+ -+static void ec_stripe_create_work(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, -+ struct bch_fs, ec_stripe_create_work); -+ struct ec_stripe_new *s; -+ -+ while ((s = get_pending_stripe(c))) -+ ec_stripe_create(s); -+ -+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); -+} -+ -+void bch2_ec_do_stripe_creates(struct bch_fs *c) -+{ -+ bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); -+ -+ if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) -+ bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); -+} -+ -+static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ struct ec_stripe_new *s = h->s; -+ -+ BUG_ON(!s->allocated && !s->err); -+ -+ h->s = NULL; -+ s->pending = true; -+ -+ mutex_lock(&c->ec_stripe_new_lock); -+ list_add(&s->list, &c->ec_stripe_new_list); -+ mutex_unlock(&c->ec_stripe_new_lock); -+ -+ ec_stripe_new_put(c, s, STRIPE_REF_io); -+} -+ -+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct ec_stripe_new *s = ob->ec; -+ -+ s->err = -EIO; -+} -+ -+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) -+{ -+ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); -+ struct bch_dev *ca; -+ unsigned offset; -+ -+ 
if (!ob) -+ return NULL; -+ -+ BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); -+ -+ ca = bch_dev_bkey_exists(c, ob->dev); -+ offset = ca->mi.bucket_size - ob->sectors_free; -+ -+ return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); -+} -+ -+static int unsigned_cmp(const void *_l, const void *_r) -+{ -+ unsigned l = *((const unsigned *) _l); -+ unsigned r = *((const unsigned *) _r); -+ -+ return cmp_int(l, r); -+} -+ -+/* pick most common bucket size: */ -+static unsigned pick_blocksize(struct bch_fs *c, -+ struct bch_devs_mask *devs) -+{ -+ struct bch_dev *ca; -+ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; -+ struct { -+ unsigned nr, size; -+ } cur = { 0, 0 }, best = { 0, 0 }; -+ -+ for_each_member_device_rcu(ca, c, i, devs) -+ sizes[nr++] = ca->mi.bucket_size; -+ -+ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); -+ -+ for (i = 0; i < nr; i++) { -+ if (sizes[i] != cur.size) { -+ if (cur.nr > best.nr) -+ best = cur; -+ -+ cur.nr = 0; -+ cur.size = sizes[i]; -+ } -+ -+ cur.nr++; -+ } -+ -+ if (cur.nr > best.nr) -+ best = cur; -+ -+ return best.size; -+} -+ -+static bool may_create_new_stripe(struct bch_fs *c) -+{ -+ return false; -+} -+ -+static void ec_stripe_key_init(struct bch_fs *c, -+ struct bkey_i *k, -+ unsigned nr_data, -+ unsigned nr_parity, -+ unsigned stripe_size) -+{ -+ struct bkey_i_stripe *s = bkey_stripe_init(k); -+ unsigned u64s; -+ -+ s->v.sectors = cpu_to_le16(stripe_size); -+ s->v.algorithm = 0; -+ s->v.nr_blocks = nr_data + nr_parity; -+ s->v.nr_redundant = nr_parity; -+ s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); -+ s->v.csum_type = BCH_CSUM_crc32c; -+ s->v.pad = 0; -+ -+ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { -+ BUG_ON(1 << s->v.csum_granularity_bits >= -+ le16_to_cpu(s->v.sectors) || -+ s->v.csum_granularity_bits == U8_MAX); -+ s->v.csum_granularity_bits++; -+ } -+ -+ set_bkey_val_u64s(&s->k, u64s); -+} -+ -+static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ struct ec_stripe_new *s; -+ -+ lockdep_assert_held(&h->lock); -+ -+ s = kzalloc(sizeof(*s), GFP_KERNEL); -+ if (!s) -+ return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; -+ -+ mutex_init(&s->lock); -+ closure_init(&s->iodone, NULL); -+ atomic_set(&s->ref[STRIPE_REF_stripe], 1); -+ atomic_set(&s->ref[STRIPE_REF_io], 1); -+ s->c = c; -+ s->h = h; -+ s->nr_data = min_t(unsigned, h->nr_active_devs, -+ BCH_BKEY_PTRS_MAX) - h->redundancy; -+ s->nr_parity = h->redundancy; -+ -+ ec_stripe_key_init(c, &s->new_stripe.key, -+ s->nr_data, s->nr_parity, h->blocksize); -+ -+ h->s = s; -+ return 0; -+} -+ -+static struct ec_stripe_head * -+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, -+ unsigned algo, unsigned redundancy, -+ enum bch_watermark watermark) -+{ -+ struct ec_stripe_head *h; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ h = kzalloc(sizeof(*h), GFP_KERNEL); -+ if (!h) -+ return NULL; -+ -+ mutex_init(&h->lock); -+ BUG_ON(!mutex_trylock(&h->lock)); -+ -+ h->target = target; -+ h->algo = algo; -+ h->redundancy = redundancy; -+ h->watermark = watermark; -+ -+ rcu_read_lock(); -+ h->devs = target_rw_devs(c, BCH_DATA_user, target); -+ -+ for_each_member_device_rcu(ca, c, i, &h->devs) -+ if (!ca->mi.durability) -+ __clear_bit(i, h->devs.d); -+ -+ h->blocksize = pick_blocksize(c, &h->devs); -+ -+ for_each_member_device_rcu(ca, c, i, &h->devs) -+ if (ca->mi.bucket_size == h->blocksize) -+ h->nr_active_devs++; -+ -+ rcu_read_unlock(); -+ list_add(&h->list, &c->ec_stripe_head_list); -+ return h; -+} -+ -+void 
bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ if (h->s && -+ h->s->allocated && -+ bitmap_weight(h->s->blocks_allocated, -+ h->s->nr_data) == h->s->nr_data) -+ ec_stripe_set_pending(c, h); -+ -+ mutex_unlock(&h->lock); -+} -+ -+static struct ec_stripe_head * -+__bch2_ec_stripe_head_get(struct btree_trans *trans, -+ unsigned target, -+ unsigned algo, -+ unsigned redundancy, -+ enum bch_watermark watermark) -+{ -+ struct bch_fs *c = trans->c; -+ struct ec_stripe_head *h; -+ int ret; -+ -+ if (!redundancy) -+ return NULL; -+ -+ ret = bch2_trans_mutex_lock(trans, &c->ec_stripe_head_lock); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ if (test_bit(BCH_FS_GOING_RO, &c->flags)) { -+ h = ERR_PTR(-BCH_ERR_erofs_no_writes); -+ goto found; -+ } -+ -+ list_for_each_entry(h, &c->ec_stripe_head_list, list) -+ if (h->target == target && -+ h->algo == algo && -+ h->redundancy == redundancy && -+ h->watermark == watermark) { -+ ret = bch2_trans_mutex_lock(trans, &h->lock); -+ if (ret) -+ h = ERR_PTR(ret); -+ goto found; -+ } -+ -+ h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); -+found: -+ mutex_unlock(&c->ec_stripe_head_lock); -+ return h; -+} -+ -+static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, -+ enum bch_watermark watermark, struct closure *cl) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_devs_mask devs = h->devs; -+ struct open_bucket *ob; -+ struct open_buckets buckets; -+ struct bch_stripe *v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; -+ unsigned i, j, nr_have_parity = 0, nr_have_data = 0; -+ bool have_cache = true; -+ int ret = 0; -+ -+ BUG_ON(v->nr_blocks != h->s->nr_data + h->s->nr_parity); -+ BUG_ON(v->nr_redundant != h->s->nr_parity); -+ -+ for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) { -+ __clear_bit(v->ptrs[i].dev, devs.d); -+ if (i < h->s->nr_data) -+ nr_have_data++; -+ else -+ nr_have_parity++; -+ } -+ -+ BUG_ON(nr_have_data > h->s->nr_data); -+ BUG_ON(nr_have_parity > h->s->nr_parity); -+ -+ buckets.nr = 0; -+ if (nr_have_parity < h->s->nr_parity) { -+ ret = bch2_bucket_alloc_set_trans(trans, &buckets, -+ &h->parity_stripe, -+ &devs, -+ h->s->nr_parity, -+ &nr_have_parity, -+ &have_cache, 0, -+ BCH_DATA_parity, -+ watermark, -+ cl); -+ -+ open_bucket_for_each(c, &buckets, ob, i) { -+ j = find_next_zero_bit(h->s->blocks_gotten, -+ h->s->nr_data + h->s->nr_parity, -+ h->s->nr_data); -+ BUG_ON(j >= h->s->nr_data + h->s->nr_parity); -+ -+ h->s->blocks[j] = buckets.v[i]; -+ v->ptrs[j] = bch2_ob_ptr(c, ob); -+ __set_bit(j, h->s->blocks_gotten); -+ } -+ -+ if (ret) -+ return ret; -+ } -+ -+ buckets.nr = 0; -+ if (nr_have_data < h->s->nr_data) { -+ ret = bch2_bucket_alloc_set_trans(trans, &buckets, -+ &h->block_stripe, -+ &devs, -+ h->s->nr_data, -+ &nr_have_data, -+ &have_cache, 0, -+ BCH_DATA_user, -+ watermark, -+ cl); -+ -+ open_bucket_for_each(c, &buckets, ob, i) { -+ j = find_next_zero_bit(h->s->blocks_gotten, -+ h->s->nr_data, 0); -+ BUG_ON(j >= h->s->nr_data); -+ -+ h->s->blocks[j] = buckets.v[i]; -+ v->ptrs[j] = bch2_ob_ptr(c, ob); -+ __set_bit(j, h->s->blocks_gotten); -+ } -+ -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* XXX: doesn't obey target: */ -+static s64 get_existing_stripe(struct bch_fs *c, -+ struct ec_stripe_head *head) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m; -+ size_t heap_idx; -+ u64 stripe_idx; -+ s64 ret = -1; -+ -+ if (may_create_new_stripe(c)) -+ return -1; -+ -+ mutex_lock(&c->ec_stripes_heap_lock); -+ for (heap_idx = 0; 
heap_idx < h->used; heap_idx++) { -+ /* No blocks worth reusing, stripe will just be deleted: */ -+ if (!h->data[heap_idx].blocks_nonempty) -+ continue; -+ -+ stripe_idx = h->data[heap_idx].idx; -+ -+ m = genradix_ptr(&c->stripes, stripe_idx); -+ -+ if (m->algorithm == head->algo && -+ m->nr_redundant == head->redundancy && -+ m->sectors == head->blocksize && -+ m->blocks_nonempty < m->nr_blocks - m->nr_redundant && -+ bch2_try_open_stripe(c, head->s, stripe_idx)) { -+ ret = stripe_idx; -+ break; -+ } -+ } -+ mutex_unlock(&c->ec_stripes_heap_lock); -+ return ret; -+} -+ -+static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_stripe *new_v = &bkey_i_to_stripe(&h->s->new_stripe.key)->v; -+ struct bch_stripe *existing_v; -+ unsigned i; -+ s64 idx; -+ int ret; -+ -+ /* -+ * If we can't allocate a new stripe, and there's no stripes with empty -+ * blocks for us to reuse, that means we have to wait on copygc: -+ */ -+ idx = get_existing_stripe(c, h); -+ if (idx < 0) -+ return -BCH_ERR_stripe_alloc_blocked; -+ -+ ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); -+ if (ret) { -+ bch2_stripe_close(c, h->s); -+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret)); -+ return ret; -+ } -+ -+ existing_v = &bkey_i_to_stripe(&h->s->existing_stripe.key)->v; -+ -+ BUG_ON(existing_v->nr_redundant != h->s->nr_parity); -+ h->s->nr_data = existing_v->nr_blocks - -+ existing_v->nr_redundant; -+ -+ ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); -+ if (ret) { -+ bch2_stripe_close(c, h->s); -+ return ret; -+ } -+ -+ BUG_ON(h->s->existing_stripe.size != h->blocksize); -+ BUG_ON(h->s->existing_stripe.size != le16_to_cpu(existing_v->sectors)); -+ -+ /* -+ * Free buckets we initially allocated - they might conflict with -+ * blocks from the stripe we're reusing: -+ */ -+ for_each_set_bit(i, h->s->blocks_gotten, new_v->nr_blocks) { -+ bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); -+ h->s->blocks[i] = 0; -+ } -+ memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); -+ memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); -+ -+ for (i = 0; i < existing_v->nr_blocks; i++) { -+ if (stripe_blockcount_get(existing_v, i)) { -+ __set_bit(i, h->s->blocks_gotten); -+ __set_bit(i, h->s->blocks_allocated); -+ } -+ -+ ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); -+ } -+ -+ bkey_copy(&h->s->new_stripe.key, &h->s->existing_stripe.key); -+ h->s->have_existing_stripe = true; -+ -+ return 0; -+} -+ -+static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bpos min_pos = POS(0, 1); -+ struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); -+ int ret; -+ -+ if (!h->s->res.sectors) { -+ ret = bch2_disk_reservation_get(c, &h->s->res, -+ h->blocksize, -+ h->s->nr_parity, -+ BCH_DISK_RESERVATION_NOFAIL); -+ if (ret) -+ return ret; -+ } -+ -+ for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (bkey_gt(k.k->p, POS(0, U32_MAX))) { -+ if (start_pos.offset) { -+ start_pos = min_pos; -+ bch2_btree_iter_set_pos(&iter, start_pos); -+ continue; -+ } -+ -+ ret = -BCH_ERR_ENOSPC_stripe_create; -+ break; -+ } -+ -+ if (bkey_deleted(k.k) && -+ bch2_try_open_stripe(c, h->s, 
k.k->p.offset)) -+ break; -+ } -+ -+ c->ec_stripe_hint = iter.pos.offset; -+ -+ if (ret) -+ goto err; -+ -+ ret = ec_stripe_mem_alloc(trans, &iter); -+ if (ret) { -+ bch2_stripe_close(c, h->s); -+ goto err; -+ } -+ -+ h->s->new_stripe.key.k.p = iter.pos; -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+err: -+ bch2_disk_reservation_put(c, &h->s->res); -+ goto out; -+} -+ -+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, -+ unsigned target, -+ unsigned algo, -+ unsigned redundancy, -+ enum bch_watermark watermark, -+ struct closure *cl) -+{ -+ struct bch_fs *c = trans->c; -+ struct ec_stripe_head *h; -+ bool waiting = false; -+ int ret; -+ -+ h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); -+ if (!h) -+ bch_err(c, "no stripe head"); -+ if (IS_ERR_OR_NULL(h)) -+ return h; -+ -+ if (!h->s) { -+ ret = ec_new_stripe_alloc(c, h); -+ if (ret) { -+ bch_err(c, "failed to allocate new stripe"); -+ goto err; -+ } -+ } -+ -+ if (h->s->allocated) -+ goto allocated; -+ -+ if (h->s->have_existing_stripe) -+ goto alloc_existing; -+ -+ /* First, try to allocate a full stripe: */ -+ ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: -+ __bch2_ec_stripe_head_reserve(trans, h); -+ if (!ret) -+ goto allocate_buf; -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || -+ bch2_err_matches(ret, ENOMEM)) -+ goto err; -+ -+ /* -+ * Not enough buckets available for a full stripe: we must reuse an -+ * existing stripe: -+ */ -+ while (1) { -+ ret = __bch2_ec_stripe_head_reuse(trans, h); -+ if (!ret) -+ break; -+ if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) -+ goto err; -+ -+ if (watermark == BCH_WATERMARK_copygc) { -+ ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: -+ __bch2_ec_stripe_head_reserve(trans, h); -+ if (ret) -+ goto err; -+ goto allocate_buf; -+ } -+ -+ /* XXX freelist_wait? 
*/ -+ closure_wait(&c->freelist_wait, cl); -+ waiting = true; -+ } -+ -+ if (waiting) -+ closure_wake_up(&c->freelist_wait); -+alloc_existing: -+ /* -+ * Retry allocating buckets, with the watermark for this -+ * particular write: -+ */ -+ ret = new_stripe_alloc_buckets(trans, h, watermark, cl); -+ if (ret) -+ goto err; -+ -+allocate_buf: -+ ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); -+ if (ret) -+ goto err; -+ -+ h->s->allocated = true; -+allocated: -+ BUG_ON(!h->s->idx); -+ BUG_ON(!h->s->new_stripe.data[0]); -+ BUG_ON(trans->restarted); -+ return h; -+err: -+ bch2_ec_stripe_head_put(c, h); -+ return ERR_PTR(ret); -+} -+ -+static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct ec_stripe_head *h; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ mutex_lock(&c->ec_stripe_head_lock); -+ list_for_each_entry(h, &c->ec_stripe_head_list, list) { -+ mutex_lock(&h->lock); -+ if (!h->s) -+ goto unlock; -+ -+ if (!ca) -+ goto found; -+ -+ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) { -+ if (!h->s->blocks[i]) -+ continue; -+ -+ ob = c->open_buckets + h->s->blocks[i]; -+ if (ob->dev == ca->dev_idx) -+ goto found; -+ } -+ goto unlock; -+found: -+ h->s->err = -BCH_ERR_erofs_no_writes; -+ ec_stripe_set_pending(c, h); -+unlock: -+ mutex_unlock(&h->lock); -+ } -+ mutex_unlock(&c->ec_stripe_head_lock); -+} -+ -+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) -+{ -+ __bch2_ec_stop(c, ca); -+} -+ -+void bch2_fs_ec_stop(struct bch_fs *c) -+{ -+ __bch2_ec_stop(c, NULL); -+} -+ -+static bool bch2_fs_ec_flush_done(struct bch_fs *c) -+{ -+ bool ret; -+ -+ mutex_lock(&c->ec_stripe_new_lock); -+ ret = list_empty(&c->ec_stripe_new_list); -+ mutex_unlock(&c->ec_stripe_new_lock); -+ -+ return ret; -+} -+ -+void bch2_fs_ec_flush(struct bch_fs *c) -+{ -+ wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c)); -+} -+ -+int bch2_stripes_read(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ const struct bch_stripe *s; -+ struct stripe *m; -+ unsigned i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (k.k->type != KEY_TYPE_stripe) -+ continue; -+ -+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); -+ if (ret) -+ break; -+ -+ s = bkey_s_c_to_stripe(k).v; -+ -+ m = genradix_ptr(&c->stripes, k.k->p.offset); -+ m->sectors = le16_to_cpu(s->sectors); -+ m->algorithm = s->algorithm; -+ m->nr_blocks = s->nr_blocks; -+ m->nr_redundant = s->nr_redundant; -+ m->blocks_nonempty = 0; -+ -+ for (i = 0; i < s->nr_blocks; i++) -+ m->blocks_nonempty += !!stripe_blockcount_get(s, i); -+ -+ bch2_stripes_heap_insert(c, m, k.k->p.offset); -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ -+ return ret; -+} -+ -+void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m; -+ size_t i; -+ -+ mutex_lock(&c->ec_stripes_heap_lock); -+ for (i = 0; i < min_t(size_t, h->used, 50); i++) { -+ m = genradix_ptr(&c->stripes, h->data[i].idx); -+ -+ prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, -+ h->data[i].blocks_nonempty, -+ m->nr_blocks - m->nr_redundant, -+ m->nr_redundant); -+ if (bch2_stripe_is_open(c, h->data[i].idx)) -+ prt_str(out, " open"); -+ prt_newline(out); -+ } -+ mutex_unlock(&c->ec_stripes_heap_lock); -+} -+ -+void 
bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct ec_stripe_head *h; -+ struct ec_stripe_new *s; -+ -+ mutex_lock(&c->ec_stripe_head_lock); -+ list_for_each_entry(h, &c->ec_stripe_head_list, list) { -+ prt_printf(out, "target %u algo %u redundancy %u %s:\n", -+ h->target, h->algo, h->redundancy, -+ bch2_watermarks[h->watermark]); -+ -+ if (h->s) -+ prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", -+ h->s->idx, h->s->nr_data, h->s->nr_parity, -+ bitmap_weight(h->s->blocks_allocated, -+ h->s->nr_data)); -+ } -+ mutex_unlock(&c->ec_stripe_head_lock); -+ -+ prt_printf(out, "in flight:\n"); -+ -+ mutex_lock(&c->ec_stripe_new_lock); -+ list_for_each_entry(s, &c->ec_stripe_new_list, list) { -+ prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n", -+ s->idx, s->nr_data, s->nr_parity, -+ atomic_read(&s->ref[STRIPE_REF_io]), -+ atomic_read(&s->ref[STRIPE_REF_stripe]), -+ bch2_watermarks[s->h->watermark]); -+ } -+ mutex_unlock(&c->ec_stripe_new_lock); -+} -+ -+void bch2_fs_ec_exit(struct bch_fs *c) -+{ -+ struct ec_stripe_head *h; -+ unsigned i; -+ -+ while (1) { -+ mutex_lock(&c->ec_stripe_head_lock); -+ h = list_first_entry_or_null(&c->ec_stripe_head_list, -+ struct ec_stripe_head, list); -+ if (h) -+ list_del(&h->list); -+ mutex_unlock(&c->ec_stripe_head_lock); -+ if (!h) -+ break; -+ -+ if (h->s) { -+ for (i = 0; i < bkey_i_to_stripe(&h->s->new_stripe.key)->v.nr_blocks; i++) -+ BUG_ON(h->s->blocks[i]); -+ -+ kfree(h->s); -+ } -+ kfree(h); -+ } -+ -+ BUG_ON(!list_empty(&c->ec_stripe_new_list)); -+ -+ free_heap(&c->ec_stripes_heap); -+ genradix_free(&c->stripes); -+ bioset_exit(&c->ec_bioset); -+} -+ -+void bch2_fs_ec_init_early(struct bch_fs *c) -+{ -+ spin_lock_init(&c->ec_stripes_new_lock); -+ mutex_init(&c->ec_stripes_heap_lock); -+ -+ INIT_LIST_HEAD(&c->ec_stripe_head_list); -+ mutex_init(&c->ec_stripe_head_lock); -+ -+ INIT_LIST_HEAD(&c->ec_stripe_new_list); -+ mutex_init(&c->ec_stripe_new_lock); -+ init_waitqueue_head(&c->ec_stripe_new_wait); -+ -+ INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); -+ INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); -+} -+ -+int bch2_fs_ec_init(struct bch_fs *c) -+{ -+ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), -+ BIOSET_NEED_BVECS); -+} -diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h -new file mode 100644 -index 000000000..885ae5d51 ---- /dev/null -+++ b/fs/bcachefs/ec.h -@@ -0,0 +1,260 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EC_H -+#define _BCACHEFS_EC_H -+ -+#include "ec_types.h" -+#include "buckets_types.h" -+#include "extents_types.h" -+ -+enum bkey_invalid_flags; -+ -+int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+#define bch2_bkey_ops_stripe ((struct bkey_ops) { \ -+ .key_invalid = bch2_stripe_invalid, \ -+ .val_to_text = bch2_stripe_to_text, \ -+ .swab = bch2_ptr_swab, \ -+ .trans_trigger = bch2_trans_mark_stripe, \ -+ .atomic_trigger = bch2_mark_stripe, \ -+ .min_val_size = 8, \ -+}) -+ -+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) -+{ -+ return DIV_ROUND_UP(le16_to_cpu(s->sectors), -+ 1 << s->csum_granularity_bits); -+} -+ -+static inline unsigned stripe_csum_offset(const struct bch_stripe *s, -+ unsigned dev, unsigned csum_idx) -+{ -+ unsigned csum_bytes = bch_crc_bytes[s->csum_type]; -+ -+ return sizeof(struct bch_stripe) + -+ sizeof(struct 
bch_extent_ptr) * s->nr_blocks + -+ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; -+} -+ -+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, -+ unsigned idx) -+{ -+ return stripe_csum_offset(s, s->nr_blocks, 0) + -+ sizeof(u16) * idx; -+} -+ -+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, -+ unsigned idx) -+{ -+ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); -+} -+ -+static inline void stripe_blockcount_set(struct bch_stripe *s, -+ unsigned idx, unsigned v) -+{ -+ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); -+ -+ *p = cpu_to_le16(v); -+} -+ -+static inline unsigned stripe_val_u64s(const struct bch_stripe *s) -+{ -+ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), -+ sizeof(u64)); -+} -+ -+static inline void *stripe_csum(struct bch_stripe *s, -+ unsigned block, unsigned csum_idx) -+{ -+ EBUG_ON(block >= s->nr_blocks); -+ EBUG_ON(csum_idx >= stripe_csums_per_device(s)); -+ -+ return (void *) s + stripe_csum_offset(s, block, csum_idx); -+} -+ -+static inline struct bch_csum stripe_csum_get(struct bch_stripe *s, -+ unsigned block, unsigned csum_idx) -+{ -+ struct bch_csum csum = { 0 }; -+ -+ memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); -+ return csum; -+} -+ -+static inline void stripe_csum_set(struct bch_stripe *s, -+ unsigned block, unsigned csum_idx, -+ struct bch_csum csum) -+{ -+ memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); -+} -+ -+static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, -+ const struct bch_extent_ptr *data_ptr, -+ unsigned sectors) -+{ -+ return data_ptr->dev == stripe_ptr->dev && -+ data_ptr->gen == stripe_ptr->gen && -+ data_ptr->offset >= stripe_ptr->offset && -+ data_ptr->offset < stripe_ptr->offset + sectors; -+} -+ -+static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, -+ struct extent_ptr_decoded p) -+{ -+ unsigned nr_data = s->nr_blocks - s->nr_redundant; -+ -+ BUG_ON(!p.has_ec); -+ -+ if (p.ec.block >= nr_data) -+ return false; -+ -+ return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr, -+ le16_to_cpu(s->sectors)); -+} -+ -+static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, -+ struct extent_ptr_decoded p) -+{ -+ unsigned nr_data = m->nr_blocks - m->nr_redundant; -+ -+ BUG_ON(!p.has_ec); -+ -+ if (p.ec.block >= nr_data) -+ return false; -+ -+ return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr, -+ m->sectors); -+} -+ -+struct bch_read_bio; -+ -+struct ec_stripe_buf { -+ /* might not be buffering the entire stripe: */ -+ unsigned offset; -+ unsigned size; -+ unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; -+ -+ void *data[BCH_BKEY_PTRS_MAX]; -+ -+ __BKEY_PADDED(key, 255); -+}; -+ -+struct ec_stripe_head; -+ -+enum ec_stripe_ref { -+ STRIPE_REF_io, -+ STRIPE_REF_stripe, -+ STRIPE_REF_NR -+}; -+ -+struct ec_stripe_new { -+ struct bch_fs *c; -+ struct ec_stripe_head *h; -+ struct mutex lock; -+ struct list_head list; -+ -+ struct hlist_node hash; -+ u64 idx; -+ -+ struct closure iodone; -+ -+ atomic_t ref[STRIPE_REF_NR]; -+ -+ int err; -+ -+ u8 nr_data; -+ u8 nr_parity; -+ bool allocated; -+ bool pending; -+ bool have_existing_stripe; -+ -+ unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; -+ unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; -+ open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; -+ struct disk_reservation res; -+ -+ struct ec_stripe_buf new_stripe; -+ struct 
ec_stripe_buf existing_stripe; -+}; -+ -+struct ec_stripe_head { -+ struct list_head list; -+ struct mutex lock; -+ -+ unsigned target; -+ unsigned algo; -+ unsigned redundancy; -+ enum bch_watermark watermark; -+ -+ struct bch_devs_mask devs; -+ unsigned nr_active_devs; -+ -+ unsigned blocksize; -+ -+ struct dev_stripe_state block_stripe; -+ struct dev_stripe_state parity_stripe; -+ -+ struct ec_stripe_new *s; -+}; -+ -+int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); -+ -+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -+ -+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); -+ -+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); -+ -+void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); -+struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, -+ unsigned, unsigned, unsigned, -+ enum bch_watermark, struct closure *); -+ -+void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); -+void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -+void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); -+ -+void bch2_do_stripe_deletes(struct bch_fs *); -+void bch2_ec_do_stripe_creates(struct bch_fs *); -+void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); -+ -+static inline void ec_stripe_new_get(struct ec_stripe_new *s, -+ enum ec_stripe_ref ref) -+{ -+ atomic_inc(&s->ref[ref]); -+} -+ -+static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, -+ enum ec_stripe_ref ref) -+{ -+ BUG_ON(atomic_read(&s->ref[ref]) <= 0); -+ -+ if (atomic_dec_and_test(&s->ref[ref])) -+ switch (ref) { -+ case STRIPE_REF_stripe: -+ bch2_ec_stripe_new_free(c, s); -+ break; -+ case STRIPE_REF_io: -+ bch2_ec_do_stripe_creates(c); -+ break; -+ default: -+ unreachable(); -+ } -+} -+ -+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); -+void bch2_fs_ec_stop(struct bch_fs *); -+void bch2_fs_ec_flush(struct bch_fs *); -+ -+int bch2_stripes_read(struct bch_fs *); -+ -+void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); -+void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_fs_ec_exit(struct bch_fs *); -+void bch2_fs_ec_init_early(struct bch_fs *); -+int bch2_fs_ec_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_EC_H */ -diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h -new file mode 100644 -index 000000000..e2b02a82d ---- /dev/null -+++ b/fs/bcachefs/ec_types.h -@@ -0,0 +1,41 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EC_TYPES_H -+#define _BCACHEFS_EC_TYPES_H -+ -+#include "bcachefs_format.h" -+ -+struct bch_replicas_padded { -+ struct bch_replicas_entry e; -+ u8 pad[BCH_BKEY_PTRS_MAX]; -+}; -+ -+struct stripe { -+ size_t heap_idx; -+ u16 sectors; -+ u8 algorithm; -+ u8 nr_blocks; -+ u8 nr_redundant; -+ u8 blocks_nonempty; -+}; -+ -+struct gc_stripe { -+ u16 sectors; -+ -+ u8 nr_blocks; -+ u8 nr_redundant; -+ -+ unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ -+ u16 block_sectors[BCH_BKEY_PTRS_MAX]; -+ struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; -+ -+ struct bch_replicas_padded r; -+}; -+ -+struct ec_stripe_heap_entry { -+ size_t idx; -+ unsigned blocks_nonempty; -+}; -+ -+typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; -+ -+#endif /* _BCACHEFS_EC_TYPES_H */ -diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c -new file mode 100644 -index 000000000..dc906fc91 ---- /dev/null -+++ b/fs/bcachefs/errcode.c -@@ -0,0 +1,63 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "errcode.h" -+ -+#include -+ -+static const char * const bch2_errcode_strs[] = { -+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err, -+ BCH_ERRCODES() -+#undef x -+ NULL -+}; -+ -+#define BCH_ERR_0 0 -+ -+static unsigned bch2_errcode_parents[] = { -+#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, -+ BCH_ERRCODES() -+#undef x -+}; -+ -+const char *bch2_err_str(int err) -+{ -+ const char *errstr; -+ -+ err = abs(err); -+ -+ BUG_ON(err >= BCH_ERR_MAX); -+ -+ if (err >= BCH_ERR_START) -+ errstr = bch2_errcode_strs[err - BCH_ERR_START]; -+ else if (err) -+ errstr = errname(err); -+ else -+ errstr = "(No error)"; -+ return errstr ?: "(Invalid error)"; -+} -+ -+bool __bch2_err_matches(int err, int class) -+{ -+ err = abs(err); -+ class = abs(class); -+ -+ BUG_ON(err >= BCH_ERR_MAX); -+ BUG_ON(class >= BCH_ERR_MAX); -+ -+ while (err >= BCH_ERR_START && err != class) -+ err = bch2_errcode_parents[err - BCH_ERR_START]; -+ -+ return err == class; -+} -+ -+int __bch2_err_class(int err) -+{ -+ err = -err; -+ BUG_ON((unsigned) err >= BCH_ERR_MAX); -+ -+ while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START]) -+ err = bch2_errcode_parents[err - BCH_ERR_START]; -+ -+ return -err; -+} -diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h -new file mode 100644 -index 000000000..f7fa87442 ---- /dev/null -+++ b/fs/bcachefs/errcode.h -@@ -0,0 +1,252 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ERRCODE_H -+#define _BCACHEFS_ERRCODE_H -+ -+#define BCH_ERRCODES() \ -+ x(ENOMEM, ENOMEM_stripe_buf) \ -+ x(ENOMEM, ENOMEM_replicas_table) \ -+ x(ENOMEM, ENOMEM_cpu_replicas) \ -+ x(ENOMEM, ENOMEM_replicas_gc) \ -+ x(ENOMEM, ENOMEM_disk_groups_validate) \ -+ x(ENOMEM, ENOMEM_disk_groups_to_cpu) \ -+ x(ENOMEM, ENOMEM_mark_snapshot) \ -+ x(ENOMEM, ENOMEM_mark_stripe) \ -+ x(ENOMEM, ENOMEM_mark_stripe_ptr) \ -+ x(ENOMEM, ENOMEM_btree_key_cache_create) \ -+ x(ENOMEM, ENOMEM_btree_key_cache_fill) \ -+ x(ENOMEM, ENOMEM_btree_key_cache_insert) \ -+ x(ENOMEM, ENOMEM_trans_kmalloc) \ -+ x(ENOMEM, ENOMEM_trans_log_msg) \ -+ x(ENOMEM, ENOMEM_do_encrypt) \ -+ x(ENOMEM, ENOMEM_ec_read_extent) \ -+ x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \ -+ x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \ -+ x(ENOMEM, ENOMEM_fs_btree_cache_init) \ -+ x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \ -+ x(ENOMEM, ENOMEM_fs_counters_init) \ -+ x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \ -+ x(ENOMEM, ENOMEM_io_clock_init) \ -+ x(ENOMEM, ENOMEM_blacklist_table_init) \ -+ x(ENOMEM, ENOMEM_sb_realloc_injected) \ -+ x(ENOMEM, ENOMEM_sb_bio_realloc) \ -+ x(ENOMEM, ENOMEM_sb_buf_realloc) \ -+ x(ENOMEM, ENOMEM_sb_journal_validate) \ -+ x(ENOMEM, ENOMEM_sb_journal_v2_validate) \ -+ x(ENOMEM, ENOMEM_journal_entry_add) \ -+ x(ENOMEM, ENOMEM_journal_read_buf_realloc) \ -+ x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\ -+ x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \ -+ x(ENOMEM, ENOMEM_bio_read_init) \ -+ x(ENOMEM, 
ENOMEM_bio_read_split_init) \ -+ x(ENOMEM, ENOMEM_bio_write_init) \ -+ x(ENOMEM, ENOMEM_bio_bounce_pages_init) \ -+ x(ENOMEM, ENOMEM_writepage_bioset_init) \ -+ x(ENOMEM, ENOMEM_dio_read_bioset_init) \ -+ x(ENOMEM, ENOMEM_dio_write_bioset_init) \ -+ x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ -+ x(ENOMEM, ENOMEM_promote_table_init) \ -+ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ -+ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ -+ x(ENOMEM, ENOMEM_compression_workspace_init) \ -+ x(ENOMEM, ENOMEM_decompression_workspace_init) \ -+ x(ENOMEM, ENOMEM_bucket_gens) \ -+ x(ENOMEM, ENOMEM_buckets_nouse) \ -+ x(ENOMEM, ENOMEM_usage_init) \ -+ x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \ -+ x(ENOMEM, ENOMEM_btree_node_reclaim) \ -+ x(ENOMEM, ENOMEM_btree_node_mem_alloc) \ -+ x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \ -+ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\ -+ x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \ -+ x(ENOMEM, ENOMEM_set_nr_journal_buckets) \ -+ x(ENOMEM, ENOMEM_dev_journal_init) \ -+ x(ENOMEM, ENOMEM_journal_pin_fifo) \ -+ x(ENOMEM, ENOMEM_journal_buf) \ -+ x(ENOMEM, ENOMEM_gc_start) \ -+ x(ENOMEM, ENOMEM_gc_alloc_start) \ -+ x(ENOMEM, ENOMEM_gc_reflink_start) \ -+ x(ENOMEM, ENOMEM_gc_gens) \ -+ x(ENOMEM, ENOMEM_gc_repair_key) \ -+ x(ENOMEM, ENOMEM_fsck_extent_ends_at) \ -+ x(ENOMEM, ENOMEM_fsck_add_nlink) \ -+ x(ENOMEM, ENOMEM_journal_key_insert) \ -+ x(ENOMEM, ENOMEM_journal_keys_sort) \ -+ x(ENOMEM, ENOMEM_journal_replay) \ -+ x(ENOMEM, ENOMEM_read_superblock_clean) \ -+ x(ENOMEM, ENOMEM_fs_alloc) \ -+ x(ENOMEM, ENOMEM_fs_name_alloc) \ -+ x(ENOMEM, ENOMEM_fs_other_alloc) \ -+ x(ENOMEM, ENOMEM_dev_alloc) \ -+ x(ENOSPC, ENOSPC_disk_reservation) \ -+ x(ENOSPC, ENOSPC_bucket_alloc) \ -+ x(ENOSPC, ENOSPC_disk_label_add) \ -+ x(ENOSPC, ENOSPC_stripe_create) \ -+ x(ENOSPC, ENOSPC_inode_create) \ -+ x(ENOSPC, ENOSPC_str_hash_create) \ -+ x(ENOSPC, ENOSPC_snapshot_create) \ -+ x(ENOSPC, ENOSPC_subvolume_create) \ -+ x(ENOSPC, ENOSPC_sb) \ -+ x(ENOSPC, ENOSPC_sb_journal) \ -+ x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \ -+ x(ENOSPC, ENOSPC_sb_quota) \ -+ x(ENOSPC, ENOSPC_sb_replicas) \ -+ x(ENOSPC, ENOSPC_sb_members) \ -+ x(ENOSPC, ENOSPC_sb_crypt) \ -+ x(ENOSPC, ENOSPC_btree_slot) \ -+ x(ENOSPC, ENOSPC_snapshot_tree) \ -+ x(ENOENT, ENOENT_bkey_type_mismatch) \ -+ x(ENOENT, ENOENT_str_hash_lookup) \ -+ x(ENOENT, ENOENT_str_hash_set_must_replace) \ -+ x(ENOENT, ENOENT_inode) \ -+ x(ENOENT, ENOENT_not_subvol) \ -+ x(ENOENT, ENOENT_directory_dead) \ -+ x(ENOENT, ENOENT_subvolume) \ -+ x(ENOENT, ENOENT_snapshot_tree) \ -+ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ -+ x(ENOENT, ENOENT_dev_not_found) \ -+ x(ENOENT, ENOENT_dev_idx_not_found) \ -+ x(0, open_buckets_empty) \ -+ x(0, freelist_empty) \ -+ x(BCH_ERR_freelist_empty, no_buckets_found) \ -+ x(0, transaction_restart) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_relock) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\ -+ x(BCH_ERR_transaction_restart, 
transaction_restart_mem_realloced) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\ -+ x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ -+ x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ -+ x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ -+ x(BCH_ERR_transaction_restart, transaction_restart_nested) \ -+ x(0, no_btree_node) \ -+ x(BCH_ERR_no_btree_node, no_btree_node_relock) \ -+ x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ -+ x(BCH_ERR_no_btree_node, no_btree_node_drop) \ -+ x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \ -+ x(BCH_ERR_no_btree_node, no_btree_node_up) \ -+ x(BCH_ERR_no_btree_node, no_btree_node_down) \ -+ x(BCH_ERR_no_btree_node, no_btree_node_init) \ -+ x(BCH_ERR_no_btree_node, no_btree_node_cached) \ -+ x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \ -+ x(0, btree_insert_fail) \ -+ x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \ -+ x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ -+ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ -+ x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ -+ x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ -+ x(0, backpointer_to_overwritten_btree_node) \ -+ x(0, lock_fail_root_changed) \ -+ x(0, journal_reclaim_would_deadlock) \ -+ x(EINVAL, fsck) \ -+ x(BCH_ERR_fsck, fsck_fix) \ -+ x(BCH_ERR_fsck, fsck_ignore) \ -+ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ -+ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ -+ x(BCH_ERR_fsck, fsck_repair_impossible) \ -+ x(0, restart_recovery) \ -+ x(0, unwritten_extent_update) \ -+ x(EINVAL, device_state_not_allowed) \ -+ x(EINVAL, member_info_missing) \ -+ x(EINVAL, mismatched_block_size) \ -+ x(EINVAL, block_size_too_small) \ -+ x(EINVAL, bucket_size_too_small) \ -+ x(EINVAL, device_size_too_small) \ -+ x(EINVAL, device_not_a_member_of_filesystem) \ -+ x(EINVAL, device_has_been_removed) \ -+ x(EINVAL, device_already_online) \ -+ x(EINVAL, insufficient_devices_to_start) \ -+ x(EINVAL, invalid) \ -+ x(EINVAL, internal_fsck_err) \ -+ x(EROFS, erofs_trans_commit) \ -+ x(EROFS, erofs_no_writes) \ -+ x(EROFS, erofs_journal_err) \ -+ x(EROFS, erofs_sb_err) \ -+ x(EROFS, erofs_unfixed_errors) \ -+ x(EROFS, erofs_norecovery) \ -+ x(EROFS, erofs_nochanges) \ -+ x(EROFS, insufficient_devices) \ -+ x(0, operation_blocked) \ -+ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ -+ x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ -+ x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ -+ x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ -+ x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \ -+ x(BCH_ERR_invalid, invalid_sb) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_magic) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_version) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_features) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_too_big) \ -+ x(BCH_ERR_invalid_sb, 
invalid_sb_csum_type) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_csum) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_field_size) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_layout) \ -+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ -+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ -+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_members) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_journal) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_clean) \ -+ x(BCH_ERR_invalid_sb, invalid_sb_quota) \ -+ x(BCH_ERR_invalid, invalid_bkey) \ -+ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ -+ x(EIO, btree_node_read_err) \ -+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ -+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ -+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ -+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_bad_node) \ -+ x(BCH_ERR_btree_node_read_err, btree_node_read_err_incompatible) -+ -+enum bch_errcode { -+ BCH_ERR_START = 2048, -+#define x(class, err) BCH_ERR_##err, -+ BCH_ERRCODES() -+#undef x -+ BCH_ERR_MAX -+}; -+ -+const char *bch2_err_str(int); -+bool __bch2_err_matches(int, int); -+ -+static inline bool _bch2_err_matches(int err, int class) -+{ -+ return err < 0 && __bch2_err_matches(err, class); -+} -+ -+#define bch2_err_matches(_err, _class) \ -+({ \ -+ BUILD_BUG_ON(!__builtin_constant_p(_class)); \ -+ unlikely(_bch2_err_matches(_err, _class)); \ -+}) -+ -+int __bch2_err_class(int); -+ -+static inline long bch2_err_class(long err) -+{ -+ return err < 0 ? 
__bch2_err_class(err) : err; -+} -+ -+#endif /* _BCACHFES_ERRCODE_H */ -diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c -new file mode 100644 -index 000000000..39009cf0c ---- /dev/null -+++ b/fs/bcachefs/error.c -@@ -0,0 +1,294 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "error.h" -+#include "io.h" -+#include "super.h" -+ -+#define FSCK_ERR_RATELIMIT_NR 10 -+ -+bool bch2_inconsistent_error(struct bch_fs *c) -+{ -+ set_bit(BCH_FS_ERROR, &c->flags); -+ -+ switch (c->opts.errors) { -+ case BCH_ON_ERROR_continue: -+ return false; -+ case BCH_ON_ERROR_ro: -+ if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "inconsistency detected - emergency read only"); -+ return true; -+ case BCH_ON_ERROR_panic: -+ panic(bch2_fmt(c, "panic after error")); -+ return true; -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_topology_error(struct bch_fs *c) -+{ -+ set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); -+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) -+ bch2_inconsistent_error(c); -+} -+ -+void bch2_fatal_error(struct bch_fs *c) -+{ -+ if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "fatal error - emergency read only"); -+} -+ -+void bch2_io_error_work(struct work_struct *work) -+{ -+ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); -+ struct bch_fs *c = ca->fs; -+ bool dev; -+ -+ down_write(&c->state_lock); -+ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, -+ BCH_FORCE_IF_DEGRADED); -+ if (dev -+ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, -+ BCH_FORCE_IF_DEGRADED) -+ : bch2_fs_emergency_read_only(c)) -+ bch_err(ca, -+ "too many IO errors, setting %s RO", -+ dev ? "device" : "filesystem"); -+ up_write(&c->state_lock); -+} -+ -+void bch2_io_error(struct bch_dev *ca) -+{ -+ //queue_work(system_long_wq, &ca->io_error_work); -+} -+ -+enum ask_yn { -+ YN_NO, -+ YN_YES, -+ YN_ALLNO, -+ YN_ALLYES, -+}; -+ -+#ifdef __KERNEL__ -+#define bch2_fsck_ask_yn() YN_NO -+#else -+ -+#include "tools-util.h" -+ -+enum ask_yn bch2_fsck_ask_yn(void) -+{ -+ char *buf = NULL; -+ size_t buflen = 0; -+ bool ret; -+ -+ while (true) { -+ fputs(" (y,n, or Y,N for all errors of this type) ", stdout); -+ fflush(stdout); -+ -+ if (getline(&buf, &buflen, stdin) < 0) -+ die("error reading from standard input"); -+ -+ strim(buf); -+ if (strlen(buf) != 1) -+ continue; -+ -+ switch (buf[0]) { -+ case 'n': -+ return YN_NO; -+ case 'y': -+ return YN_YES; -+ case 'N': -+ return YN_ALLNO; -+ case 'Y': -+ return YN_ALLYES; -+ } -+ } -+ -+ free(buf); -+ return ret; -+} -+ -+#endif -+ -+static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) -+{ -+ struct fsck_err_state *s; -+ -+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) -+ return NULL; -+ -+ list_for_each_entry(s, &c->fsck_errors, list) -+ if (s->fmt == fmt) { -+ /* -+ * move it to the head of the list: repeated fsck errors -+ * are common -+ */ -+ list_move(&s->list, &c->fsck_errors); -+ return s; -+ } -+ -+ s = kzalloc(sizeof(*s), GFP_NOFS); -+ if (!s) { -+ if (!c->fsck_alloc_err) -+ bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); -+ c->fsck_alloc_err = true; -+ return NULL; -+ } -+ -+ INIT_LIST_HEAD(&s->list); -+ s->fmt = fmt; -+ list_add(&s->list, &c->fsck_errors); -+ return s; -+} -+ -+int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) 
-+{ -+ struct fsck_err_state *s = NULL; -+ va_list args; -+ bool print = true, suppressing = false, inconsistent = false; -+ struct printbuf buf = PRINTBUF, *out = &buf; -+ int ret = -BCH_ERR_fsck_ignore; -+ -+ va_start(args, fmt); -+ prt_vprintf(out, fmt, args); -+ va_end(args); -+ -+ mutex_lock(&c->fsck_error_lock); -+ s = fsck_err_get(c, fmt); -+ if (s) { -+ /* -+ * We may be called multiple times for the same error on -+ * transaction restart - this memoizes instead of asking the user -+ * multiple times for the same error: -+ */ -+ if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { -+ ret = s->ret; -+ mutex_unlock(&c->fsck_error_lock); -+ printbuf_exit(&buf); -+ return ret; -+ } -+ -+ kfree(s->last_msg); -+ s->last_msg = kstrdup(buf.buf, GFP_KERNEL); -+ -+ if (c->opts.ratelimit_errors && -+ !(flags & FSCK_NO_RATELIMIT) && -+ s->nr >= FSCK_ERR_RATELIMIT_NR) { -+ if (s->nr == FSCK_ERR_RATELIMIT_NR) -+ suppressing = true; -+ else -+ print = false; -+ } -+ -+ s->nr++; -+ } -+ -+#ifdef BCACHEFS_LOG_PREFIX -+ if (!strncmp(fmt, "bcachefs:", 9)) -+ prt_printf(out, bch2_log_msg(c, "")); -+#endif -+ -+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { -+ if (c->opts.errors != BCH_ON_ERROR_continue || -+ !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { -+ prt_str(out, ", shutting down"); -+ inconsistent = true; -+ ret = -BCH_ERR_fsck_errors_not_fixed; -+ } else if (flags & FSCK_CAN_FIX) { -+ prt_str(out, ", fixing"); -+ ret = -BCH_ERR_fsck_fix; -+ } else { -+ prt_str(out, ", continuing"); -+ ret = -BCH_ERR_fsck_ignore; -+ } -+ } else if (c->opts.fix_errors == FSCK_FIX_exit) { -+ prt_str(out, ", exiting"); -+ ret = -BCH_ERR_fsck_errors_not_fixed; -+ } else if (flags & FSCK_CAN_FIX) { -+ int fix = s && s->fix -+ ? s->fix -+ : c->opts.fix_errors; -+ -+ if (fix == FSCK_FIX_ask) { -+ int ask; -+ -+ prt_str(out, ": fix?"); -+ bch2_print_string_as_lines(KERN_ERR, out->buf); -+ print = false; -+ -+ ask = bch2_fsck_ask_yn(); -+ -+ if (ask >= YN_ALLNO && s) -+ s->fix = ask == YN_ALLNO -+ ? FSCK_FIX_no -+ : FSCK_FIX_yes; -+ -+ ret = ask & 1 -+ ? 
-BCH_ERR_fsck_fix -+ : -BCH_ERR_fsck_ignore; -+ } else if (fix == FSCK_FIX_yes || -+ (c->opts.nochanges && -+ !(flags & FSCK_CAN_IGNORE))) { -+ prt_str(out, ", fixing"); -+ ret = -BCH_ERR_fsck_fix; -+ } else { -+ prt_str(out, ", not fixing"); -+ } -+ } else if (flags & FSCK_NEED_FSCK) { -+ prt_str(out, " (run fsck to correct)"); -+ } else { -+ prt_str(out, " (repair unimplemented)"); -+ } -+ -+ if (ret == -BCH_ERR_fsck_ignore && -+ (c->opts.fix_errors == FSCK_FIX_exit || -+ !(flags & FSCK_CAN_IGNORE))) -+ ret = -BCH_ERR_fsck_errors_not_fixed; -+ -+ if (print) -+ bch2_print_string_as_lines(KERN_ERR, out->buf); -+ -+ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) && -+ (ret != -BCH_ERR_fsck_fix && -+ ret != -BCH_ERR_fsck_ignore)) -+ bch_err(c, "Unable to continue, halting"); -+ else if (suppressing) -+ bch_err(c, "Ratelimiting new instances of previous error"); -+ -+ if (s) -+ s->ret = ret; -+ -+ mutex_unlock(&c->fsck_error_lock); -+ -+ printbuf_exit(&buf); -+ -+ if (inconsistent) -+ bch2_inconsistent_error(c); -+ -+ if (ret == -BCH_ERR_fsck_fix) { -+ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); -+ } else { -+ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); -+ set_bit(BCH_FS_ERROR, &c->flags); -+ } -+ -+ return ret; -+} -+ -+void bch2_flush_fsck_errs(struct bch_fs *c) -+{ -+ struct fsck_err_state *s, *n; -+ -+ mutex_lock(&c->fsck_error_lock); -+ -+ list_for_each_entry_safe(s, n, &c->fsck_errors, list) { -+ if (s->ratelimited && s->last_msg) -+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); -+ -+ list_del(&s->list); -+ kfree(s->last_msg); -+ kfree(s); -+ } -+ -+ mutex_unlock(&c->fsck_error_lock); -+} -diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h -new file mode 100644 -index 000000000..7ce954005 ---- /dev/null -+++ b/fs/bcachefs/error.h -@@ -0,0 +1,206 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ERROR_H -+#define _BCACHEFS_ERROR_H -+ -+#include -+#include -+ -+struct bch_dev; -+struct bch_fs; -+struct work_struct; -+ -+/* -+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag -+ * superblock as such -+ */ -+ -+/* Error messages: */ -+ -+/* -+ * Inconsistency errors: The on disk data is inconsistent. If these occur during -+ * initial recovery, they don't indicate a bug in the running code - we walk all -+ * the metadata before modifying anything. If they occur at runtime, they -+ * indicate either a bug in the running code or (less likely) data is being -+ * silently corrupted under us. -+ * -+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in -+ * BCH_ON_ERROR_CONTINUE mode -+ */ -+ -+bool bch2_inconsistent_error(struct bch_fs *); -+ -+void bch2_topology_error(struct bch_fs *); -+ -+#define bch2_fs_inconsistent(c, ...) \ -+({ \ -+ bch_err(c, __VA_ARGS__); \ -+ bch2_inconsistent_error(c); \ -+}) -+ -+#define bch2_fs_inconsistent_on(cond, c, ...) \ -+({ \ -+ bool _ret = unlikely(!!(cond)); \ -+ \ -+ if (_ret) \ -+ bch2_fs_inconsistent(c, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * Later we might want to mark only the particular device inconsistent, not the -+ * entire filesystem: -+ */ -+ -+#define bch2_dev_inconsistent(ca, ...) \ -+do { \ -+ bch_err(ca, __VA_ARGS__); \ -+ bch2_inconsistent_error((ca)->fs); \ -+} while (0) -+ -+#define bch2_dev_inconsistent_on(cond, ca, ...) 
\ -+({ \ -+ bool _ret = unlikely(!!(cond)); \ -+ \ -+ if (_ret) \ -+ bch2_dev_inconsistent(ca, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * When a transaction update discovers or is causing a fs inconsistency, it's -+ * helpful to also dump the pending updates: -+ */ -+#define bch2_trans_inconsistent(trans, ...) \ -+({ \ -+ bch_err(trans->c, __VA_ARGS__); \ -+ bch2_dump_trans_updates(trans); \ -+ bch2_inconsistent_error(trans->c); \ -+}) -+ -+#define bch2_trans_inconsistent_on(cond, trans, ...) \ -+({ \ -+ bool _ret = unlikely(!!(cond)); \ -+ \ -+ if (_ret) \ -+ bch2_trans_inconsistent(trans, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * Fsck errors: inconsistency errors we detect at mount time, and should ideally -+ * be able to repair: -+ */ -+ -+struct fsck_err_state { -+ struct list_head list; -+ const char *fmt; -+ u64 nr; -+ bool ratelimited; -+ int ret; -+ int fix; -+ char *last_msg; -+}; -+ -+#define FSCK_CAN_FIX (1 << 0) -+#define FSCK_CAN_IGNORE (1 << 1) -+#define FSCK_NEED_FSCK (1 << 2) -+#define FSCK_NO_RATELIMIT (1 << 3) -+ -+__printf(3, 4) __cold -+int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); -+void bch2_flush_fsck_errs(struct bch_fs *); -+ -+#define __fsck_err(c, _flags, msg, ...) \ -+({ \ -+ int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \ -+ \ -+ if (_ret != -BCH_ERR_fsck_fix && \ -+ _ret != -BCH_ERR_fsck_ignore) { \ -+ ret = _ret; \ -+ goto fsck_err; \ -+ } \ -+ \ -+ _ret == -BCH_ERR_fsck_fix; \ -+}) -+ -+/* These macros return true if error should be fixed: */ -+ -+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ -+ -+#define __fsck_err_on(cond, c, _flags, ...) \ -+ (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) -+ -+#define need_fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) -+ -+#define need_fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) -+ -+#define mustfix_fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) -+ -+#define mustfix_fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) -+ -+#define fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) -+ -+#define fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) -+ -+/* -+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW -+ * mode - pretty much just due to metadata IO errors: -+ */ -+ -+void bch2_fatal_error(struct bch_fs *); -+ -+#define bch2_fs_fatal_error(c, ...) \ -+do { \ -+ bch_err(c, __VA_ARGS__); \ -+ bch2_fatal_error(c); \ -+} while (0) -+ -+#define bch2_fs_fatal_err_on(cond, c, ...) \ -+({ \ -+ bool _ret = unlikely(!!(cond)); \ -+ \ -+ if (_ret) \ -+ bch2_fs_fatal_error(c, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * IO errors: either recoverable metadata IO (because we have replicas), or data -+ * IO - we need to log it and print out a message, but we don't (necessarily) -+ * want to shut down the fs: -+ */ -+ -+void bch2_io_error_work(struct work_struct *); -+ -+/* Does the error handling without logging a message */ -+void bch2_io_error(struct bch_dev *); -+ -+#define bch2_dev_io_err_on(cond, ca, ...) \ -+({ \ -+ bool _ret = (cond); \ -+ \ -+ if (_ret) { \ -+ bch_err_dev_ratelimited(ca, __VA_ARGS__); \ -+ bch2_io_error(ca); \ -+ } \ -+ _ret; \ -+}) -+ -+#define bch2_dev_inum_io_err_on(cond, ca, ...) 
\ -+({ \ -+ bool _ret = (cond); \ -+ \ -+ if (_ret) { \ -+ bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ -+ bch2_io_error(ca); \ -+ } \ -+ _ret; \ -+}) -+ -+#endif /* _BCACHEFS_ERROR_H */ -diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c -new file mode 100644 -index 000000000..21af6fb8c ---- /dev/null -+++ b/fs/bcachefs/extent_update.c -@@ -0,0 +1,173 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "debug.h" -+#include "extents.h" -+#include "extent_update.h" -+ -+/* -+ * This counts the number of iterators to the alloc & ec btrees we'll need -+ * inserting/removing this extent: -+ */ -+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ unsigned ret = 0, lru = 0; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ switch (__extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ /* Might also be updating LRU btree */ -+ if (entry->ptr.cached) -+ lru++; -+ -+ fallthrough; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ ret++; -+ } -+ } -+ -+ /* -+ * Updating keys in the alloc btree may also update keys in the -+ * freespace or discard btrees: -+ */ -+ return lru + ret * 2; -+} -+ -+static int count_iters_for_insert(struct btree_trans *trans, -+ struct bkey_s_c k, -+ unsigned offset, -+ struct bpos *end, -+ unsigned *nr_iters, -+ unsigned max_iters) -+{ -+ int ret = 0, ret2 = 0; -+ -+ if (*nr_iters >= max_iters) { -+ *end = bpos_min(*end, k.k->p); -+ ret = 1; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ *nr_iters += bch2_bkey_nr_alloc_ptrs(k); -+ -+ if (*nr_iters >= max_iters) { -+ *end = bpos_min(*end, k.k->p); -+ ret = 1; -+ } -+ -+ break; -+ case KEY_TYPE_reflink_p: { -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ u64 idx = le64_to_cpu(p.v->idx); -+ unsigned sectors = bpos_min(*end, p.k->p).offset - -+ bkey_start_offset(p.k); -+ struct btree_iter iter; -+ struct bkey_s_c r_k; -+ -+ for_each_btree_key_norestart(trans, iter, -+ BTREE_ID_reflink, POS(0, idx + offset), -+ BTREE_ITER_SLOTS, r_k, ret2) { -+ if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) -+ break; -+ -+ /* extent_update_to_keys(), for the reflink_v update */ -+ *nr_iters += 1; -+ -+ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); -+ -+ if (*nr_iters >= max_iters) { -+ struct bpos pos = bkey_start_pos(k.k); -+ pos.offset += min_t(u64, k.k->size, -+ r_k.k->p.offset - idx); -+ -+ *end = bpos_min(*end, pos); -+ ret = 1; -+ break; -+ } -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ break; -+ } -+ } -+ -+ return ret2 ?: ret; -+} -+ -+#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) -+ -+int bch2_extent_atomic_end(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ struct bpos *end) -+{ -+ struct btree_iter copy; -+ struct bkey_s_c k; -+ unsigned nr_iters = 0; -+ int ret; -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return ret; -+ -+ *end = insert->k.p; -+ -+ /* extent_update_to_keys(): */ -+ nr_iters += 1; -+ -+ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, -+ &nr_iters, EXTENT_ITERS_MAX / 2); -+ if (ret < 0) -+ return ret; -+ -+ bch2_trans_copy_iter(©, iter); -+ -+ for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) { -+ unsigned offset = 0; -+ -+ if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) -+ offset = 
bkey_start_offset(&insert->k) - -+ bkey_start_offset(k.k); -+ -+ /* extent_handle_overwrites(): */ -+ switch (bch2_extent_overlap(&insert->k, k.k)) { -+ case BCH_EXTENT_OVERLAP_ALL: -+ case BCH_EXTENT_OVERLAP_FRONT: -+ nr_iters += 1; -+ break; -+ case BCH_EXTENT_OVERLAP_BACK: -+ case BCH_EXTENT_OVERLAP_MIDDLE: -+ nr_iters += 2; -+ break; -+ } -+ -+ ret = count_iters_for_insert(trans, k, offset, end, -+ &nr_iters, EXTENT_ITERS_MAX); -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_iter_exit(trans, ©); -+ return ret < 0 ? ret : 0; -+} -+ -+int bch2_extent_trim_atomic(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *k) -+{ -+ struct bpos end; -+ int ret; -+ -+ ret = bch2_extent_atomic_end(trans, iter, k, &end); -+ if (ret) -+ return ret; -+ -+ bch2_cut_back(end, k); -+ return 0; -+} -diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h -new file mode 100644 -index 000000000..6f5cf4493 ---- /dev/null -+++ b/fs/bcachefs/extent_update.h -@@ -0,0 +1,12 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENT_UPDATE_H -+#define _BCACHEFS_EXTENT_UPDATE_H -+ -+#include "bcachefs.h" -+ -+int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, struct bpos *); -+int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *); -+ -+#endif /* _BCACHEFS_EXTENT_UPDATE_H */ -diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c -new file mode 100644 -index 000000000..1b25f84e4 ---- /dev/null -+++ b/fs/bcachefs/extents.c -@@ -0,0 +1,1403 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Copyright (C) 2010 Kent Overstreet -+ * -+ * Code for managing the extent btree and dynamically updating the writeback -+ * dirty sector count. -+ */ -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_gc.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "error.h" -+#include "extents.h" -+#include "inode.h" -+#include "journal.h" -+#include "replicas.h" -+#include "super.h" -+#include "super-io.h" -+#include "trace.h" -+#include "util.h" -+ -+static unsigned bch2_crc_field_size_max[] = { -+ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, -+ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, -+ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, -+}; -+ -+static void bch2_extent_crc_pack(union bch_extent_crc *, -+ struct bch_extent_crc_unpacked, -+ enum bch_extent_entry_type); -+ -+static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, -+ unsigned dev) -+{ -+ struct bch_dev_io_failures *i; -+ -+ for (i = f->devs; i < f->devs + f->nr; i++) -+ if (i->dev == dev) -+ return i; -+ -+ return NULL; -+} -+ -+void bch2_mark_io_failure(struct bch_io_failures *failed, -+ struct extent_ptr_decoded *p) -+{ -+ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); -+ -+ if (!f) { -+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); -+ -+ f = &failed->devs[failed->nr++]; -+ f->dev = p->ptr.dev; -+ f->idx = p->idx; -+ f->nr_failed = 1; -+ f->nr_retries = 0; -+ } else if (p->idx != f->idx) { -+ f->idx = p->idx; -+ f->nr_failed = 1; -+ f->nr_retries = 0; -+ } else { -+ f->nr_failed++; -+ } -+} -+ -+/* -+ * returns true if p1 is better than p2: -+ */ -+static inline bool ptr_better(struct bch_fs *c, -+ const struct extent_ptr_decoded p1, -+ const struct extent_ptr_decoded p2) -+{ -+ if (likely(!p1.idx && !p2.idx)) { -+ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); -+ struct 
bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); -+ -+ u64 l1 = atomic64_read(&dev1->cur_latency[READ]); -+ u64 l2 = atomic64_read(&dev2->cur_latency[READ]); -+ -+ /* Pick at random, biased in favor of the faster device: */ -+ -+ return bch2_rand_range(l1 + l2) > l1; -+ } -+ -+ if (bch2_force_reconstruct_read) -+ return p1.idx > p2.idx; -+ -+ return p1.idx < p2.idx; -+} -+ -+/* -+ * This picks a non-stale pointer, preferably from a device other than @avoid. -+ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to -+ * other devices, it will still pick a pointer from avoid. -+ */ -+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, -+ struct bch_io_failures *failed, -+ struct extent_ptr_decoded *pick) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_dev_io_failures *f; -+ struct bch_dev *ca; -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_error) -+ return -EIO; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ /* -+ * Unwritten extent: no need to actually read, treat it as a -+ * hole and return 0s: -+ */ -+ if (p.ptr.unwritten) -+ return 0; -+ -+ ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ -+ /* -+ * If there are any dirty pointers it's an error if we can't -+ * read: -+ */ -+ if (!ret && !p.ptr.cached) -+ ret = -EIO; -+ -+ if (p.ptr.cached && ptr_stale(ca, &p.ptr)) -+ continue; -+ -+ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; -+ if (f) -+ p.idx = f->nr_failed < f->nr_retries -+ ? f->idx -+ : f->idx + 1; -+ -+ if (!p.idx && -+ !bch2_dev_is_readable(ca)) -+ p.idx++; -+ -+ if (bch2_force_reconstruct_read && -+ !p.idx && p.has_ec) -+ p.idx++; -+ -+ if (p.idx >= (unsigned) p.has_ec + 1) -+ continue; -+ -+ if (ret > 0 && !ptr_better(c, p, *pick)) -+ continue; -+ -+ *pick = p; -+ ret = 1; -+ } -+ -+ return ret; -+} -+ -+/* KEY_TYPE_btree_ptr: */ -+ -+int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { -+ prt_printf(err, "value too big (%zu > %u)", -+ bkey_val_u64s(k.k), BCH_REPLICAS_MAX); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return bch2_bkey_ptrs_invalid(c, k, flags, err); -+} -+ -+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { -+ prt_printf(err, "value too big (%zu > %zu)", -+ bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return bch2_bkey_ptrs_invalid(c, k, flags, err); -+} -+ -+void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); -+ -+ prt_printf(out, "seq %llx written %u min_key %s", -+ le64_to_cpu(bp.v->seq), -+ le16_to_cpu(bp.v->sectors_written), -+ BTREE_PTR_RANGE_UPDATED(bp.v) ? 
"R " : ""); -+ -+ bch2_bpos_to_text(out, bp.v->min_key); -+ prt_printf(out, " "); -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, -+ unsigned big_endian, int write, -+ struct bkey_s k) -+{ -+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); -+ -+ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id_is_extents(btree_id) && -+ !bkey_eq(bp.v->min_key, POS_MIN)) -+ bp.v->min_key = write -+ ? bpos_nosnap_predecessor(bp.v->min_key) -+ : bpos_nosnap_successor(bp.v->min_key); -+} -+ -+/* KEY_TYPE_extent: */ -+ -+bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) -+{ -+ struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); -+ struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); -+ union bch_extent_entry *en_l; -+ const union bch_extent_entry *en_r; -+ struct extent_ptr_decoded lp, rp; -+ bool use_right_ptr; -+ struct bch_dev *ca; -+ -+ en_l = l_ptrs.start; -+ en_r = r_ptrs.start; -+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) { -+ if (extent_entry_type(en_l) != extent_entry_type(en_r)) -+ return false; -+ -+ en_l = extent_entry_next(en_l); -+ en_r = extent_entry_next(en_r); -+ } -+ -+ if (en_l < l_ptrs.end || en_r < r_ptrs.end) -+ return false; -+ -+ en_l = l_ptrs.start; -+ en_r = r_ptrs.start; -+ lp.crc = bch2_extent_crc_unpack(l.k, NULL); -+ rp.crc = bch2_extent_crc_unpack(r.k, NULL); -+ -+ while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && -+ __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { -+ if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != -+ rp.ptr.offset + rp.crc.offset || -+ lp.ptr.dev != rp.ptr.dev || -+ lp.ptr.gen != rp.ptr.gen || -+ lp.ptr.unwritten != rp.ptr.unwritten || -+ lp.has_ec != rp.has_ec) -+ return false; -+ -+ /* Extents may not straddle buckets: */ -+ ca = bch_dev_bkey_exists(c, lp.ptr.dev); -+ if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) -+ return false; -+ -+ if (lp.has_ec != rp.has_ec || -+ (lp.has_ec && -+ (lp.ec.block != rp.ec.block || -+ lp.ec.redundancy != rp.ec.redundancy || -+ lp.ec.idx != rp.ec.idx))) -+ return false; -+ -+ if (lp.crc.compression_type != rp.crc.compression_type || -+ lp.crc.nonce != rp.crc.nonce) -+ return false; -+ -+ if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= -+ lp.crc.uncompressed_size) { -+ /* can use left extent's crc entry */ -+ } else if (lp.crc.live_size <= rp.crc.offset) { -+ /* can use right extent's crc entry */ -+ } else { -+ /* check if checksums can be merged: */ -+ if (lp.crc.csum_type != rp.crc.csum_type || -+ lp.crc.nonce != rp.crc.nonce || -+ crc_is_compressed(lp.crc) || -+ !bch2_checksum_mergeable(lp.crc.csum_type)) -+ return false; -+ -+ if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || -+ rp.crc.offset) -+ return false; -+ -+ if (lp.crc.csum_type && -+ lp.crc.uncompressed_size + -+ rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) -+ return false; -+ } -+ -+ en_l = extent_entry_next(en_l); -+ en_r = extent_entry_next(en_r); -+ } -+ -+ en_l = l_ptrs.start; -+ en_r = r_ptrs.start; -+ while (en_l < l_ptrs.end && en_r < r_ptrs.end) { -+ if (extent_entry_is_crc(en_l)) { -+ struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); -+ struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -+ -+ if (crc_l.uncompressed_size + crc_r.uncompressed_size > -+ bch2_crc_field_size_max[extent_entry_type(en_l)]) -+ 
return false; -+ } -+ -+ en_l = extent_entry_next(en_l); -+ en_r = extent_entry_next(en_r); -+ } -+ -+ use_right_ptr = false; -+ en_l = l_ptrs.start; -+ en_r = r_ptrs.start; -+ while (en_l < l_ptrs.end) { -+ if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && -+ use_right_ptr) -+ en_l->ptr = en_r->ptr; -+ -+ if (extent_entry_is_crc(en_l)) { -+ struct bch_extent_crc_unpacked crc_l = -+ bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); -+ struct bch_extent_crc_unpacked crc_r = -+ bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -+ -+ use_right_ptr = false; -+ -+ if (crc_l.offset + crc_l.live_size + crc_r.live_size <= -+ crc_l.uncompressed_size) { -+ /* can use left extent's crc entry */ -+ } else if (crc_l.live_size <= crc_r.offset) { -+ /* can use right extent's crc entry */ -+ crc_r.offset -= crc_l.live_size; -+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, -+ extent_entry_type(en_l)); -+ use_right_ptr = true; -+ } else { -+ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, -+ crc_l.csum, -+ crc_r.csum, -+ crc_r.uncompressed_size << 9); -+ -+ crc_l.uncompressed_size += crc_r.uncompressed_size; -+ crc_l.compressed_size += crc_r.compressed_size; -+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, -+ extent_entry_type(en_l)); -+ } -+ } -+ -+ en_l = extent_entry_next(en_l); -+ en_r = extent_entry_next(en_r); -+ } -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ return true; -+} -+ -+/* KEY_TYPE_reservation: */ -+ -+int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); -+ -+ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { -+ prt_printf(err, "invalid nr_replicas (%u)", -+ r.v->nr_replicas); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); -+ -+ prt_printf(out, "generation %u replicas %u", -+ le32_to_cpu(r.v->generation), -+ r.v->nr_replicas); -+} -+ -+bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_reservation l = bkey_s_to_reservation(_l); -+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); -+ -+ if (l.v->generation != r.v->generation || -+ l.v->nr_replicas != r.v->nr_replicas) -+ return false; -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ return true; -+} -+ -+/* Extent checksum entries: */ -+ -+/* returns true if not equal */ -+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, -+ struct bch_extent_crc_unpacked r) -+{ -+ return (l.csum_type != r.csum_type || -+ l.compression_type != r.compression_type || -+ l.compressed_size != r.compressed_size || -+ l.uncompressed_size != r.uncompressed_size || -+ l.offset != r.offset || -+ l.live_size != r.live_size || -+ l.nonce != r.nonce || -+ bch2_crc_cmp(l.csum, r.csum)); -+} -+ -+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, -+ struct bch_extent_crc_unpacked n) -+{ -+ return !crc_is_compressed(u) && -+ u.csum_type && -+ u.uncompressed_size > u.live_size && -+ bch2_csum_type_is_encryption(u.csum_type) == -+ bch2_csum_type_is_encryption(n.csum_type); -+} -+ -+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, -+ struct bch_extent_crc_unpacked n) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct bch_extent_crc_unpacked crc; -+ const union bch_extent_entry *i; -+ -+ if (!n.csum_type) 
-+ return false; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, i) -+ if (can_narrow_crc(crc, n)) -+ return true; -+ -+ return false; -+} -+ -+/* -+ * We're writing another replica for this extent, so while we've got the data in -+ * memory we'll be computing a new checksum for the currently live data. -+ * -+ * If there are other replicas we aren't moving, and they are checksummed but -+ * not compressed, we can modify them to point to only the data that is -+ * currently live (so that readers won't have to bounce) while we've got the -+ * checksum we need: -+ */ -+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ struct bch_extent_crc_unpacked u; -+ struct extent_ptr_decoded p; -+ union bch_extent_entry *i; -+ bool ret = false; -+ -+ /* Find a checksum entry that covers only live data: */ -+ if (!n.csum_type) { -+ bkey_for_each_crc(&k->k, ptrs, u, i) -+ if (!crc_is_compressed(u) && -+ u.csum_type && -+ u.live_size == u.uncompressed_size) { -+ n = u; -+ goto found; -+ } -+ return false; -+ } -+found: -+ BUG_ON(crc_is_compressed(n)); -+ BUG_ON(n.offset); -+ BUG_ON(n.live_size != k->k.size); -+ -+restart_narrow_pointers: -+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ -+ bkey_for_each_ptr_decode(&k->k, ptrs, p, i) -+ if (can_narrow_crc(p.crc, n)) { -+ bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); -+ p.ptr.offset += p.crc.offset; -+ p.crc = n; -+ bch2_extent_ptr_decoded_append(k, &p); -+ ret = true; -+ goto restart_narrow_pointers; -+ } -+ -+ return ret; -+} -+ -+static void bch2_extent_crc_pack(union bch_extent_crc *dst, -+ struct bch_extent_crc_unpacked src, -+ enum bch_extent_entry_type type) -+{ -+#define set_common_fields(_dst, _src) \ -+ _dst.type = 1 << type; \ -+ _dst.csum_type = _src.csum_type, \ -+ _dst.compression_type = _src.compression_type, \ -+ _dst._compressed_size = _src.compressed_size - 1, \ -+ _dst._uncompressed_size = _src.uncompressed_size - 1, \ -+ _dst.offset = _src.offset -+ -+ switch (type) { -+ case BCH_EXTENT_ENTRY_crc32: -+ set_common_fields(dst->crc32, src); -+ dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ set_common_fields(dst->crc64, src); -+ dst->crc64.nonce = src.nonce; -+ dst->crc64.csum_lo = (u64 __force) src.csum.lo; -+ dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ set_common_fields(dst->crc128, src); -+ dst->crc128.nonce = src.nonce; -+ dst->crc128.csum = src.csum; -+ break; -+ default: -+ BUG(); -+ } -+#undef set_common_fields -+} -+ -+void bch2_extent_crc_append(struct bkey_i *k, -+ struct bch_extent_crc_unpacked new) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ union bch_extent_crc *crc = (void *) ptrs.end; -+ enum bch_extent_entry_type type; -+ -+ if (bch_crc_bytes[new.csum_type] <= 4 && -+ new.uncompressed_size <= CRC32_SIZE_MAX && -+ new.nonce <= CRC32_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc32; -+ else if (bch_crc_bytes[new.csum_type] <= 10 && -+ new.uncompressed_size <= CRC64_SIZE_MAX && -+ new.nonce <= CRC64_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc64; -+ else if (bch_crc_bytes[new.csum_type] <= 16 && -+ new.uncompressed_size <= CRC128_SIZE_MAX && -+ new.nonce <= CRC128_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc128; -+ else -+ BUG(); -+ -+ bch2_extent_crc_pack(crc, new, type); -+ -+ k->k.u64s += extent_entry_u64s(ptrs.end); -+ -+ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); -+} -+ -+/* Generic code for 
keys with pointers: */ -+ -+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -+{ -+ return bch2_bkey_devs(k).nr; -+} -+ -+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) -+{ -+ return k.k->type == KEY_TYPE_reservation -+ ? bkey_s_c_to_reservation(k).v->nr_replicas -+ : bch2_bkey_dirty_devs(k).nr; -+} -+ -+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) -+{ -+ unsigned ret = 0; -+ -+ if (k.k->type == KEY_TYPE_reservation) { -+ ret = bkey_s_c_to_reservation(k).v->nr_replicas; -+ } else { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ ret += !p.ptr.cached && !crc_is_compressed(p.crc); -+ } -+ -+ return ret; -+} -+ -+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned ret = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && crc_is_compressed(p.crc)) -+ ret += p.crc.compressed_size; -+ -+ return ret; -+} -+ -+bool bch2_bkey_is_incompressible(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, entry) -+ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) -+ return true; -+ return false; -+} -+ -+unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p = { 0 }; -+ unsigned replicas = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if (p.ptr.cached) -+ continue; -+ -+ if (p.has_ec) -+ replicas += p.ec.redundancy; -+ -+ replicas++; -+ -+ } -+ -+ return replicas; -+} -+ -+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) -+{ -+ struct bch_dev *ca; -+ -+ if (p->ptr.cached) -+ return 0; -+ -+ ca = bch_dev_bkey_exists(c, p->ptr.dev); -+ -+ return ca->mi.durability + -+ (p->has_ec -+ ? p->ec.redundancy -+ : 0); -+} -+ -+unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) -+{ -+ struct bch_dev *ca; -+ -+ if (p->ptr.cached) -+ return 0; -+ -+ ca = bch_dev_bkey_exists(c, p->ptr.dev); -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_failed) -+ return 0; -+ -+ return ca->mi.durability + -+ (p->has_ec -+ ? 
p->ec.redundancy -+ : 0); -+} -+ -+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned durability = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ durability += bch2_extent_ptr_durability(c, &p); -+ -+ return durability; -+} -+ -+static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned durability = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) -+ durability += bch2_extent_ptr_durability(c, &p); -+ -+ return durability; -+} -+ -+void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) -+{ -+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); -+ union bch_extent_entry *next = extent_entry_next(entry); -+ -+ memmove_u64s(entry, next, (u64 *) end - (u64 *) next); -+ k->k.u64s -= extent_entry_u64s(entry); -+} -+ -+void bch2_extent_ptr_decoded_append(struct bkey_i *k, -+ struct extent_ptr_decoded *p) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ struct bch_extent_crc_unpacked crc = -+ bch2_extent_crc_unpack(&k->k, NULL); -+ union bch_extent_entry *pos; -+ -+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { -+ pos = ptrs.start; -+ goto found; -+ } -+ -+ bkey_for_each_crc(&k->k, ptrs, crc, pos) -+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { -+ pos = extent_entry_next(pos); -+ goto found; -+ } -+ -+ bch2_extent_crc_append(k, p->crc); -+ pos = bkey_val_end(bkey_i_to_s(k)); -+found: -+ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ __extent_entry_insert(k, pos, to_entry(&p->ptr)); -+ -+ if (p->has_ec) { -+ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; -+ __extent_entry_insert(k, pos, to_entry(&p->ec)); -+ } -+} -+ -+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, -+ union bch_extent_entry *entry) -+{ -+ union bch_extent_entry *i = ptrs.start; -+ -+ if (i == entry) -+ return NULL; -+ -+ while (extent_entry_next(i) != entry) -+ i = extent_entry_next(i); -+ return i; -+} -+ -+static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) -+{ -+ union bch_extent_entry *next = extent_entry_next(entry); -+ -+ /* stripes have ptrs, but their layout doesn't work with this code */ -+ BUG_ON(k.k->type == KEY_TYPE_stripe); -+ -+ memmove_u64s_down(entry, next, -+ (u64 *) bkey_val_end(k) - (u64 *) next); -+ k.k->u64s -= (u64 *) next - (u64 *) entry; -+} -+ -+/* -+ * Returns pointer to the next entry after the one being dropped: -+ */ -+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, -+ struct bch_extent_ptr *ptr) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry = to_entry(ptr), *next; -+ union bch_extent_entry *ret = entry; -+ bool drop_crc = true; -+ -+ EBUG_ON(ptr < &ptrs.start->ptr || -+ ptr >= &ptrs.end->ptr); -+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); -+ -+ for (next = extent_entry_next(entry); -+ next != ptrs.end; -+ next = extent_entry_next(next)) { -+ if (extent_entry_is_crc(next)) { -+ break; -+ } else if (extent_entry_is_ptr(next)) { -+ drop_crc = false; -+ break; -+ } -+ } -+ -+ extent_entry_drop(k, entry); -+ -+ while ((entry = extent_entry_prev(ptrs, entry))) { -+ if (extent_entry_is_ptr(entry)) -+ break; -+ -+ if ((extent_entry_is_crc(entry) && drop_crc) || -+ 
extent_entry_is_stripe_ptr(entry)) { -+ ret = (void *) ret - extent_entry_bytes(entry); -+ extent_entry_drop(k, entry); -+ } -+ } -+ -+ return ret; -+} -+ -+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, -+ struct bch_extent_ptr *ptr) -+{ -+ bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; -+ union bch_extent_entry *ret = -+ bch2_bkey_drop_ptr_noerror(k, ptr); -+ -+ /* -+ * If we deleted all the dirty pointers and there's still cached -+ * pointers, we could set the cached pointers to dirty if they're not -+ * stale - but to do that correctly we'd need to grab an open_bucket -+ * reference so that we don't race with bucket reuse: -+ */ -+ if (have_dirty && -+ !bch2_bkey_dirty_devs(k.s_c).nr) { -+ k.k->type = KEY_TYPE_error; -+ set_bkey_val_u64s(k.k, 0); -+ ret = NULL; -+ } else if (!bch2_bkey_nr_ptrs(k.s_c)) { -+ k.k->type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(k.k, 0); -+ ret = NULL; -+ } -+ -+ return ret; -+} -+ -+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) -+{ -+ struct bch_extent_ptr *ptr; -+ -+ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); -+} -+ -+void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) -+{ -+ struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); -+ -+ if (ptr) -+ bch2_bkey_drop_ptr_noerror(k, ptr); -+} -+ -+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ if (ptr->dev == dev) -+ return ptr; -+ -+ return NULL; -+} -+ -+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ if (bch2_dev_in_target(c, ptr->dev, target) && -+ (!ptr->cached || -+ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) -+ return true; -+ -+ return false; -+} -+ -+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, -+ struct bch_extent_ptr m, u64 offset) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (p.ptr.dev == m.dev && -+ p.ptr.gen == m.gen && -+ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == -+ (s64) m.offset - offset) -+ return true; -+ -+ return false; -+} -+ -+/* -+ * Returns true if two extents refer to the same data: -+ */ -+bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) -+{ -+ if (k1.k->type != k2.k->type) -+ return false; -+ -+ if (bkey_extent_is_direct_data(k1.k)) { -+ struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); -+ struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); -+ const union bch_extent_entry *entry1, *entry2; -+ struct extent_ptr_decoded p1, p2; -+ -+ if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) -+ return false; -+ -+ bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) -+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) -+ if (p1.ptr.dev == p2.ptr.dev && -+ p1.ptr.gen == p2.ptr.gen && -+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == -+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) -+ return true; -+ -+ return false; -+ } else { -+ /* KEY_TYPE_deleted, etc. 
*/ -+ return true; -+ } -+} -+ -+struct bch_extent_ptr * -+bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) -+{ -+ struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); -+ union bch_extent_entry *entry2; -+ struct extent_ptr_decoded p2; -+ -+ bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) -+ if (p1.ptr.dev == p2.ptr.dev && -+ p1.ptr.gen == p2.ptr.gen && -+ (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == -+ (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) -+ return &entry2->ptr; -+ -+ return NULL; -+} -+ -+void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ union bch_extent_entry *ec = NULL; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ if (&entry->ptr == ptr) { -+ ptr->cached = true; -+ if (ec) -+ extent_entry_drop(k, ec); -+ return; -+ } -+ -+ if (extent_entry_is_stripe_ptr(entry)) -+ ec = entry; -+ else if (extent_entry_is_ptr(entry)) -+ ec = NULL; -+ } -+ -+ BUG(); -+} -+ -+/* -+ * bch_extent_normalize - clean up an extent, dropping stale pointers etc. -+ * -+ * Returns true if @k should be dropped entirely -+ * -+ * For existing keys, only called when btree nodes are being rewritten, not when -+ * they're merely being compacted/resorted in memory. -+ */ -+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) -+{ -+ struct bch_extent_ptr *ptr; -+ -+ bch2_bkey_drop_ptrs(k, ptr, -+ ptr->cached && -+ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); -+ -+ return bkey_deleted(k.k); -+} -+ -+void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ const struct bch_extent_ptr *ptr; -+ const struct bch_extent_stripe_ptr *ec; -+ struct bch_dev *ca; -+ bool first = true; -+ -+ if (c) -+ prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ if (!first) -+ prt_printf(out, " "); -+ -+ switch (__extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ ptr = entry_to_ptr(entry); -+ ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] -+ ? bch_dev_bkey_exists(c, ptr->dev) -+ : NULL; -+ -+ if (!ca) { -+ prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, -+ (u64) ptr->offset, ptr->gen, -+ ptr->cached ? 
" cached" : ""); -+ } else { -+ u32 offset; -+ u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); -+ -+ prt_printf(out, "ptr: %u:%llu:%u gen %u", -+ ptr->dev, b, offset, ptr->gen); -+ if (ptr->cached) -+ prt_str(out, " cached"); -+ if (ptr->unwritten) -+ prt_str(out, " unwritten"); -+ if (ca && ptr_stale(ca, ptr)) -+ prt_printf(out, " stale"); -+ } -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); -+ -+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", -+ crc.compressed_size, -+ crc.uncompressed_size, -+ crc.offset, crc.nonce, -+ bch2_csum_types[crc.csum_type], -+ bch2_compression_types[crc.compression_type]); -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ ec = &entry->stripe_ptr; -+ -+ prt_printf(out, "ec: idx %llu block %u", -+ (u64) ec->idx, ec->block); -+ break; -+ default: -+ prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); -+ return; -+ } -+ -+ first = false; -+ } -+} -+ -+static int extent_ptr_invalid(const struct bch_fs *c, -+ struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ const struct bch_extent_ptr *ptr, -+ unsigned size_ondisk, -+ bool metadata, -+ struct printbuf *err) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr2; -+ u64 bucket; -+ u32 bucket_offset; -+ struct bch_dev *ca; -+ -+ if (!bch2_dev_exists2(c, ptr->dev)) { -+ /* -+ * If we're in the write path this key might have already been -+ * overwritten, and we could be seeing a device that doesn't -+ * exist anymore due to racing with device removal: -+ */ -+ if (flags & BKEY_INVALID_WRITE) -+ return 0; -+ -+ prt_printf(err, "pointer to invalid device (%u)", ptr->dev); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ bkey_for_each_ptr(ptrs, ptr2) -+ if (ptr != ptr2 && ptr->dev == ptr2->dev) { -+ prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); -+ -+ if (bucket >= ca->mi.nbuckets) { -+ prt_printf(err, "pointer past last bucket (%llu > %llu)", -+ bucket, ca->mi.nbuckets); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { -+ prt_printf(err, "pointer before first bucket (%llu < %u)", -+ bucket, ca->mi.first_bucket); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (bucket_offset + size_ondisk > ca->mi.bucket_size) { -+ prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", -+ bucket_offset, size_ondisk, ca->mi.bucket_size); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ unsigned size_ondisk = k.k->size; -+ unsigned nonce = UINT_MAX; -+ unsigned nr_ptrs = 0; -+ bool unwritten = false, have_ec = false, crc_since_last_ptr = false; -+ int ret; -+ -+ if (bkey_is_btree_ptr(k.k)) -+ size_ondisk = btree_sectors(c); -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { -+ prt_printf(err, "invalid extent entry type (got %u, max %u)", -+ __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if 
(bkey_is_btree_ptr(k.k) && -+ !extent_entry_is_ptr(entry)) { -+ prt_printf(err, "has non ptr field"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ ret = extent_ptr_invalid(c, k, flags, &entry->ptr, -+ size_ondisk, false, err); -+ if (ret) -+ return ret; -+ -+ if (nr_ptrs && unwritten != entry->ptr.unwritten) { -+ prt_printf(err, "extent with unwritten and written ptrs"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) { -+ prt_printf(err, "has unwritten ptrs"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (entry->ptr.cached && have_ec) { -+ prt_printf(err, "cached, erasure coded ptr"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ unwritten = entry->ptr.unwritten; -+ have_ec = false; -+ crc_since_last_ptr = false; -+ nr_ptrs++; -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); -+ -+ if (crc.offset + crc.live_size > -+ crc.uncompressed_size) { -+ prt_printf(err, "checksum offset + key size > uncompressed size"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ size_ondisk = crc.compressed_size; -+ -+ if (!bch2_checksum_type_valid(c, crc.csum_type)) { -+ prt_printf(err, "invalid checksum type"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { -+ prt_printf(err, "invalid compression type"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (bch2_csum_type_is_encryption(crc.csum_type)) { -+ if (nonce == UINT_MAX) -+ nonce = crc.offset + crc.nonce; -+ else if (nonce != crc.offset + crc.nonce) { -+ prt_printf(err, "incorrect nonce"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ } -+ -+ if (crc_since_last_ptr) { -+ prt_printf(err, "redundant crc entry"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ crc_since_last_ptr = true; -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ if (have_ec) { -+ prt_printf(err, "redundant stripe entry"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ have_ec = true; -+ break; -+ case BCH_EXTENT_ENTRY_rebalance: -+ break; -+ } -+ } -+ -+ if (!nr_ptrs) { -+ prt_str(err, "no ptrs"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { -+ prt_str(err, "too many ptrs"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (crc_since_last_ptr) { -+ prt_printf(err, "redundant crc entry"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (have_ec) { -+ prt_printf(err, "redundant stripe entry"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+void bch2_ptr_swab(struct bkey_s k) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ u64 *d; -+ -+ for (d = (u64 *) ptrs.start; -+ d != (u64 *) ptrs.end; -+ d++) -+ *d = swab64(*d); -+ -+ for (entry = ptrs.start; -+ entry < ptrs.end; -+ entry = extent_entry_next(entry)) { -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ entry->crc32.csum = swab32(entry->crc32.csum); -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); -+ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ entry->crc128.csum.hi = (__force __le64) -+ swab64((__force u64) entry->crc128.csum.hi); -+ entry->crc128.csum.lo = (__force __le64) -+ swab64((__force u64) entry->crc128.csum.lo); -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ break; -+ case BCH_EXTENT_ENTRY_rebalance: -+ break; 
-+ } -+ } -+} -+ -+/* Generic extent code: */ -+ -+int bch2_cut_front_s(struct bpos where, struct bkey_s k) -+{ -+ unsigned new_val_u64s = bkey_val_u64s(k.k); -+ int val_u64s_delta; -+ u64 sub; -+ -+ if (bkey_le(where, bkey_start_pos(k.k))) -+ return 0; -+ -+ EBUG_ON(bkey_gt(where, k.k->p)); -+ -+ sub = where.offset - bkey_start_offset(k.k); -+ -+ k.k->size -= sub; -+ -+ if (!k.k->size) { -+ k.k->type = KEY_TYPE_deleted; -+ new_val_u64s = 0; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: { -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ bool seen_crc = false; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ if (!seen_crc) -+ entry->ptr.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ entry->crc32.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ entry->crc64.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ entry->crc128.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ break; -+ case BCH_EXTENT_ENTRY_rebalance: -+ break; -+ } -+ -+ if (extent_entry_is_crc(entry)) -+ seen_crc = true; -+ } -+ -+ break; -+ } -+ case KEY_TYPE_reflink_p: { -+ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); -+ -+ le64_add_cpu(&p.v->idx, sub); -+ break; -+ } -+ case KEY_TYPE_inline_data: -+ case KEY_TYPE_indirect_inline_data: { -+ void *p = bkey_inline_data_p(k); -+ unsigned bytes = bkey_inline_data_bytes(k.k); -+ -+ sub = min_t(u64, sub << 9, bytes); -+ -+ memmove(p, p + sub, bytes - sub); -+ -+ new_val_u64s -= sub >> 3; -+ break; -+ } -+ } -+ -+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; -+ BUG_ON(val_u64s_delta < 0); -+ -+ set_bkey_val_u64s(k.k, new_val_u64s); -+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); -+ return -val_u64s_delta; -+} -+ -+int bch2_cut_back_s(struct bpos where, struct bkey_s k) -+{ -+ unsigned new_val_u64s = bkey_val_u64s(k.k); -+ int val_u64s_delta; -+ u64 len = 0; -+ -+ if (bkey_ge(where, k.k->p)) -+ return 0; -+ -+ EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); -+ -+ len = where.offset - bkey_start_offset(k.k); -+ -+ k.k->p.offset = where.offset; -+ k.k->size = len; -+ -+ if (!len) { -+ k.k->type = KEY_TYPE_deleted; -+ new_val_u64s = 0; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_inline_data: -+ case KEY_TYPE_indirect_inline_data: -+ new_val_u64s = (bkey_inline_data_offset(k.k) + -+ min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; -+ break; -+ } -+ -+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; -+ BUG_ON(val_u64s_delta < 0); -+ -+ set_bkey_val_u64s(k.k, new_val_u64s); -+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); -+ return -val_u64s_delta; -+} -diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h -new file mode 100644 -index 000000000..7ee8d031b ---- /dev/null -+++ b/fs/bcachefs/extents.h -@@ -0,0 +1,757 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENTS_H -+#define _BCACHEFS_EXTENTS_H -+ -+#include "bcachefs.h" -+#include "bkey.h" -+#include "extents_types.h" -+ -+struct bch_fs; -+struct btree_trans; -+enum bkey_invalid_flags; -+ -+/* extent entries: */ -+ -+#define extent_entry_last(_e) \ -+ ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) -+ -+#define entry_to_ptr(_entry) \ -+({ \ -+ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ -+ \ -+ __builtin_choose_expr( \ -+ type_is_exact(_entry, const union bch_extent_entry *), \ -+ (const struct bch_extent_ptr *) (_entry), \ -+ (struct bch_extent_ptr *) (_entry)); 
\ -+}) -+ -+/* downcast, preserves const */ -+#define to_entry(_entry) \ -+({ \ -+ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ -+ !type_is(_entry, struct bch_extent_ptr *) && \ -+ !type_is(_entry, struct bch_extent_stripe_ptr *)); \ -+ \ -+ __builtin_choose_expr( \ -+ (type_is_exact(_entry, const union bch_extent_crc *) || \ -+ type_is_exact(_entry, const struct bch_extent_ptr *) ||\ -+ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ -+ (const union bch_extent_entry *) (_entry), \ -+ (union bch_extent_entry *) (_entry)); \ -+}) -+ -+#define extent_entry_next(_entry) \ -+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) -+ -+static inline unsigned -+__extent_entry_type(const union bch_extent_entry *e) -+{ -+ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; -+} -+ -+static inline enum bch_extent_entry_type -+extent_entry_type(const union bch_extent_entry *e) -+{ -+ int ret = __ffs(e->type); -+ -+ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); -+ -+ return ret; -+} -+ -+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) -+{ -+ switch (extent_entry_type(entry)) { -+#define x(f, n) \ -+ case BCH_EXTENT_ENTRY_##f: \ -+ return sizeof(struct bch_extent_##f); -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) -+{ -+ return extent_entry_bytes(entry) / sizeof(u64); -+} -+ -+static inline void __extent_entry_insert(struct bkey_i *k, -+ union bch_extent_entry *dst, -+ union bch_extent_entry *new) -+{ -+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); -+ -+ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), -+ dst, (u64 *) end - (u64 *) dst); -+ k->k.u64s += extent_entry_u64s(new); -+ memcpy_u64s_small(dst, new, extent_entry_u64s(new)); -+} -+ -+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) -+{ -+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; -+} -+ -+static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) -+{ -+ return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; -+} -+ -+static inline bool extent_entry_is_crc(const union bch_extent_entry *e) -+{ -+ switch (extent_entry_type(e)) { -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+union bch_extent_crc { -+ u8 type; -+ struct bch_extent_crc32 crc32; -+ struct bch_extent_crc64 crc64; -+ struct bch_extent_crc128 crc128; -+}; -+ -+#define __entry_to_crc(_entry) \ -+ __builtin_choose_expr( \ -+ type_is_exact(_entry, const union bch_extent_entry *), \ -+ (const union bch_extent_crc *) (_entry), \ -+ (union bch_extent_crc *) (_entry)) -+ -+#define entry_to_crc(_entry) \ -+({ \ -+ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ -+ \ -+ __entry_to_crc(_entry); \ -+}) -+ -+static inline struct bch_extent_crc_unpacked -+bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) -+{ -+#define common_fields(_crc) \ -+ .csum_type = _crc.csum_type, \ -+ .compression_type = _crc.compression_type, \ -+ .compressed_size = _crc._compressed_size + 1, \ -+ .uncompressed_size = _crc._uncompressed_size + 1, \ -+ .offset = _crc.offset, \ -+ .live_size = k->size -+ -+ if (!crc) -+ return (struct bch_extent_crc_unpacked) { -+ .compressed_size = k->size, -+ .uncompressed_size = k->size, -+ .live_size = k->size, -+ }; -+ -+ switch (extent_entry_type(to_entry(crc))) { -+ case 
BCH_EXTENT_ENTRY_crc32: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc32), -+ }; -+ -+ *((__le32 *) &ret.csum.lo) = (__le32 __force) crc->crc32.csum; -+ return ret; -+ } -+ case BCH_EXTENT_ENTRY_crc64: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc64), -+ .nonce = crc->crc64.nonce, -+ .csum.lo = (__force __le64) crc->crc64.csum_lo, -+ }; -+ -+ *((__le16 *) &ret.csum.hi) = (__le16 __force) crc->crc64.csum_hi; -+ -+ return ret; -+ } -+ case BCH_EXTENT_ENTRY_crc128: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc128), -+ .nonce = crc->crc128.nonce, -+ .csum = crc->crc128.csum, -+ }; -+ -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+#undef common_fields -+} -+ -+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) -+{ -+ return (crc.compression_type != BCH_COMPRESSION_TYPE_none && -+ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); -+} -+ -+/* bkey_ptrs: generically over any key type that has ptrs */ -+ -+struct bkey_ptrs_c { -+ const union bch_extent_entry *start; -+ const union bch_extent_entry *end; -+}; -+ -+struct bkey_ptrs { -+ union bch_extent_entry *start; -+ union bch_extent_entry *end; -+}; -+ -+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: { -+ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); -+ -+ return (struct bkey_ptrs_c) { -+ to_entry(&e.v->start[0]), -+ to_entry(extent_entry_last(e)) -+ }; -+ } -+ case KEY_TYPE_extent: { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ -+ return (struct bkey_ptrs_c) { -+ e.v->start, -+ extent_entry_last(e) -+ }; -+ } -+ case KEY_TYPE_stripe: { -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ -+ return (struct bkey_ptrs_c) { -+ to_entry(&s.v->ptrs[0]), -+ to_entry(&s.v->ptrs[s.v->nr_blocks]), -+ }; -+ } -+ case KEY_TYPE_reflink_v: { -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ return (struct bkey_ptrs_c) { -+ r.v->start, -+ bkey_val_end(r), -+ }; -+ } -+ case KEY_TYPE_btree_ptr_v2: { -+ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); -+ -+ return (struct bkey_ptrs_c) { -+ to_entry(&e.v->start[0]), -+ to_entry(extent_entry_last(e)) -+ }; -+ } -+ default: -+ return (struct bkey_ptrs_c) { NULL, NULL }; -+ } -+} -+ -+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) -+{ -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); -+ -+ return (struct bkey_ptrs) { -+ (void *) p.start, -+ (void *) p.end -+ }; -+} -+ -+#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ -+ for ((_entry) = (_start); \ -+ (_entry) < (_end); \ -+ (_entry) = extent_entry_next(_entry)) -+ -+#define __bkey_ptr_next(_ptr, _end) \ -+({ \ -+ typeof(_end) _entry; \ -+ \ -+ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ -+ if (extent_entry_is_ptr(_entry)) \ -+ break; \ -+ \ -+ _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ -+}) -+ -+#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ -+ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) -+ -+#define bkey_extent_entry_for_each(_p, _entry) \ -+ bkey_extent_entry_for_each_from(_p, _entry, _p.start) -+ -+#define __bkey_for_each_ptr(_start, _end, _ptr) \ -+ for ((_ptr) = (_start); \ -+ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ -+ (_ptr)++) -+ -+#define bkey_ptr_next(_p, _ptr) \ -+ __bkey_ptr_next(_ptr, (_p).end) -+ -+#define bkey_for_each_ptr(_p, _ptr) \ -+ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) -+ -+#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ -+({ \ -+ __label__ out; \ -+ \ -+ (_ptr).idx = 0; \ -+ (_ptr).has_ec = false; \ -+ \ -+ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ -+ switch (extent_entry_type(_entry)) { \ -+ case BCH_EXTENT_ENTRY_ptr: \ -+ (_ptr).ptr = _entry->ptr; \ -+ goto out; \ -+ case BCH_EXTENT_ENTRY_crc32: \ -+ case BCH_EXTENT_ENTRY_crc64: \ -+ case BCH_EXTENT_ENTRY_crc128: \ -+ (_ptr).crc = bch2_extent_crc_unpack(_k, \ -+ entry_to_crc(_entry)); \ -+ break; \ -+ case BCH_EXTENT_ENTRY_stripe_ptr: \ -+ (_ptr).ec = _entry->stripe_ptr; \ -+ (_ptr).has_ec = true; \ -+ break; \ -+ default: \ -+ /* nothing */ \ -+ break; \ -+ } \ -+out: \ -+ _entry < (_end); \ -+}) -+ -+#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ -+ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ -+ (_entry) = _start; \ -+ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ -+ (_entry) = extent_entry_next(_entry)) -+ -+#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ -+ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ -+ _ptr, _entry) -+ -+#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ -+({ \ -+ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ -+ if (extent_entry_is_crc(_iter)) { \ -+ (_crc) = bch2_extent_crc_unpack(_k, \ -+ entry_to_crc(_iter)); \ -+ break; \ -+ } \ -+ \ -+ (_iter) < (_end); \ -+}) -+ -+#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ -+ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ -+ (_iter) = (_start); \ -+ bkey_crc_next(_k, _start, _end, _crc, _iter); \ -+ (_iter) = extent_entry_next(_iter)) -+ -+#define bkey_for_each_crc(_k, _p, _crc, _iter) \ -+ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) -+ -+/* Iterate over pointers in KEY_TYPE_extent: */ -+ -+#define extent_for_each_entry_from(_e, _entry, _start) \ -+ __bkey_extent_entry_for_each_from(_start, \ -+ extent_entry_last(_e), _entry) -+ -+#define extent_for_each_entry(_e, _entry) \ -+ extent_for_each_entry_from(_e, _entry, (_e).v->start) -+ -+#define extent_ptr_next(_e, _ptr) \ -+ __bkey_ptr_next(_ptr, extent_entry_last(_e)) -+ -+#define extent_for_each_ptr(_e, _ptr) \ -+ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) -+ -+#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ -+ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ -+ extent_entry_last(_e), _ptr, _entry) -+ -+/* utility code common to all keys with pointers: */ -+ -+void bch2_mark_io_failure(struct bch_io_failures *, -+ struct extent_ptr_decoded *); -+int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, -+ struct bch_io_failures *, -+ struct extent_ptr_decoded *); -+ -+/* KEY_TYPE_btree_ptr: */ -+ -+int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+int bch2_btree_ptr_v2_invalid(const 
struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, -+ int, struct bkey_s); -+ -+#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ -+ .key_invalid = bch2_btree_ptr_invalid, \ -+ .val_to_text = bch2_btree_ptr_to_text, \ -+ .swab = bch2_ptr_swab, \ -+ .trans_trigger = bch2_trans_mark_extent, \ -+ .atomic_trigger = bch2_mark_extent, \ -+}) -+ -+#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ -+ .key_invalid = bch2_btree_ptr_v2_invalid, \ -+ .val_to_text = bch2_btree_ptr_v2_to_text, \ -+ .swab = bch2_ptr_swab, \ -+ .compat = bch2_btree_ptr_v2_compat, \ -+ .trans_trigger = bch2_trans_mark_extent, \ -+ .atomic_trigger = bch2_mark_extent, \ -+ .min_val_size = 40, \ -+}) -+ -+/* KEY_TYPE_extent: */ -+ -+bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -+ -+#define bch2_bkey_ops_extent ((struct bkey_ops) { \ -+ .key_invalid = bch2_bkey_ptrs_invalid, \ -+ .val_to_text = bch2_bkey_ptrs_to_text, \ -+ .swab = bch2_ptr_swab, \ -+ .key_normalize = bch2_extent_normalize, \ -+ .key_merge = bch2_extent_merge, \ -+ .trans_trigger = bch2_trans_mark_extent, \ -+ .atomic_trigger = bch2_mark_extent, \ -+}) -+ -+/* KEY_TYPE_reservation: */ -+ -+int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -+ -+#define bch2_bkey_ops_reservation ((struct bkey_ops) { \ -+ .key_invalid = bch2_reservation_invalid, \ -+ .val_to_text = bch2_reservation_to_text, \ -+ .key_merge = bch2_reservation_merge, \ -+ .trans_trigger = bch2_trans_mark_reservation, \ -+ .atomic_trigger = bch2_mark_reservation, \ -+ .min_val_size = 8, \ -+}) -+ -+/* Extent checksum entries: */ -+ -+bool bch2_can_narrow_extent_crcs(struct bkey_s_c, -+ struct bch_extent_crc_unpacked); -+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); -+void bch2_extent_crc_append(struct bkey_i *, -+ struct bch_extent_crc_unpacked); -+ -+/* Generic code for keys with pointers: */ -+ -+static inline bool bkey_is_btree_ptr(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool bkey_extent_is_direct_data(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool bkey_extent_is_inline_data(const struct bkey *k) -+{ -+ return k->type == KEY_TYPE_inline_data || -+ k->type == KEY_TYPE_indirect_inline_data; -+} -+ -+static inline unsigned bkey_inline_data_offset(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_inline_data: -+ return sizeof(struct bch_inline_data); -+ case KEY_TYPE_indirect_inline_data: -+ return sizeof(struct bch_indirect_inline_data); -+ default: -+ BUG(); -+ } -+} -+ -+static inline unsigned bkey_inline_data_bytes(const struct bkey *k) -+{ -+ return bkey_val_bytes(k) - bkey_inline_data_offset(k); -+} -+ -+#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) -+ -+static inline bool bkey_extent_is_data(const struct bkey *k) -+{ -+ return 
bkey_extent_is_direct_data(k) || -+ bkey_extent_is_inline_data(k) || -+ k->type == KEY_TYPE_reflink_p; -+} -+ -+/* -+ * Should extent be counted under inode->i_sectors? -+ */ -+static inline bool bkey_extent_is_allocation(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reservation: -+ case KEY_TYPE_reflink_p: -+ case KEY_TYPE_reflink_v: -+ case KEY_TYPE_inline_data: -+ case KEY_TYPE_indirect_inline_data: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ if (ptr->unwritten) -+ return true; -+ return false; -+} -+ -+static inline bool bkey_extent_is_reservation(struct bkey_s_c k) -+{ -+ return k.k->type == KEY_TYPE_reservation || -+ bkey_extent_is_unwritten(k); -+} -+ -+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ if (!ptr->cached) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ if (ptr->cached) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ return BCH_DATA_btree; -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return BCH_DATA_user; -+ case KEY_TYPE_stripe: { -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ -+ BUG_ON(ptr < s.v->ptrs || -+ ptr >= s.v->ptrs + s.v->nr_blocks); -+ -+ return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant -+ ? 
BCH_DATA_parity -+ : BCH_DATA_user; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); -+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); -+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); -+bool bch2_bkey_is_incompressible(struct bkey_s_c); -+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -+ -+unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); -+unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *); -+unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); -+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); -+ -+void bch2_bkey_drop_device(struct bkey_s, unsigned); -+void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); -+ -+const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); -+ -+static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) -+{ -+ return (void *) bch2_bkey_has_device_c(k.s_c, dev); -+} -+ -+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); -+ -+void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); -+ -+static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) -+{ -+ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); -+ -+ switch (k->k.type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); -+ -+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ -+ memcpy((void *) &k->v + bkey_val_bytes(&k->k), -+ &ptr, -+ sizeof(ptr)); -+ k->k.u64s++; -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_extent_ptr_decoded_append(struct bkey_i *, -+ struct extent_ptr_decoded *); -+union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s, -+ struct bch_extent_ptr *); -+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, -+ struct bch_extent_ptr *); -+ -+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ -+do { \ -+ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ -+ \ -+ _ptr = &_ptrs.start->ptr; \ -+ \ -+ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ -+ if (_cond) { \ -+ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ -+ _ptrs = bch2_bkey_ptrs(_k); \ -+ continue; \ -+ } \ -+ \ -+ (_ptr)++; \ -+ } \ -+} while (0) -+ -+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, -+ struct bch_extent_ptr, u64); -+bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); -+struct bch_extent_ptr * -+bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); -+ -+void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); -+ -+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); -+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+ -+void bch2_ptr_swab(struct bkey_s); -+ -+/* Generic extent code: */ -+ -+enum bch_extent_overlap { -+ BCH_EXTENT_OVERLAP_ALL = 0, -+ BCH_EXTENT_OVERLAP_BACK = 1, -+ BCH_EXTENT_OVERLAP_FRONT = 2, -+ BCH_EXTENT_OVERLAP_MIDDLE = 3, -+}; -+ -+/* Returns how k overlaps with m */ -+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, -+ const struct bkey *m) -+{ -+ int cmp1 = bkey_lt(k->p, m->p); -+ int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m)); -+ -+ return (cmp1 << 1) + cmp2; -+} -+ -+int bch2_cut_front_s(struct bpos, struct bkey_s); -+int 
bch2_cut_back_s(struct bpos, struct bkey_s); -+ -+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) -+{ -+ bch2_cut_front_s(where, bkey_i_to_s(k)); -+} -+ -+static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) -+{ -+ bch2_cut_back_s(where, bkey_i_to_s(k)); -+} -+ -+/** -+ * bch_key_resize - adjust size of @k -+ * -+ * bkey_start_offset(k) will be preserved, modifies where the extent ends -+ */ -+static inline void bch2_key_resize(struct bkey *k, unsigned new_size) -+{ -+ k->p.offset -= k->size; -+ k->p.offset += new_size; -+ k->size = new_size; -+} -+ -+/* -+ * In extent_sort_fix_overlapping(), insert_fixup_extent(), -+ * extent_merge_inline() - we're modifying keys in place that are packed. To do -+ * that we have to unpack the key, modify the unpacked key - then this -+ * copies/repacks the unpacked to the original as necessary. -+ */ -+static inline void extent_save(struct btree *b, struct bkey_packed *dst, -+ struct bkey *src) -+{ -+ struct bkey_format *f = &b->format; -+ struct bkey_i *dst_unpacked; -+ -+ if ((dst_unpacked = packed_to_bkey(dst))) -+ dst_unpacked->k = *src; -+ else -+ BUG_ON(!bch2_bkey_pack_key(dst, src, f)); -+} -+ -+#endif /* _BCACHEFS_EXTENTS_H */ -diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h -new file mode 100644 -index 000000000..43d6c341e ---- /dev/null -+++ b/fs/bcachefs/extents_types.h -@@ -0,0 +1,40 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENTS_TYPES_H -+#define _BCACHEFS_EXTENTS_TYPES_H -+ -+#include "bcachefs_format.h" -+ -+struct bch_extent_crc_unpacked { -+ u32 compressed_size; -+ u32 uncompressed_size; -+ u32 live_size; -+ -+ u8 csum_type; -+ u8 compression_type; -+ -+ u16 offset; -+ -+ u16 nonce; -+ -+ struct bch_csum csum; -+}; -+ -+struct extent_ptr_decoded { -+ unsigned idx; -+ bool has_ec; -+ struct bch_extent_crc_unpacked crc; -+ struct bch_extent_ptr ptr; -+ struct bch_extent_stripe_ptr ec; -+}; -+ -+struct bch_io_failures { -+ u8 nr; -+ struct bch_dev_io_failures { -+ u8 dev; -+ u8 idx; -+ u8 nr_failed; -+ u8 nr_retries; -+ } devs[BCH_REPLICAS_MAX]; -+}; -+ -+#endif /* _BCACHEFS_EXTENTS_TYPES_H */ -diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h -new file mode 100644 -index 000000000..05429c963 ---- /dev/null -+++ b/fs/bcachefs/eytzinger.h -@@ -0,0 +1,281 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _EYTZINGER_H -+#define _EYTZINGER_H -+ -+#include -+#include -+ -+#include "util.h" -+ -+/* -+ * Traversal for trees in eytzinger layout - a full binary tree layed out in an -+ * array -+ */ -+ -+/* -+ * One based indexing version: -+ * -+ * With one based indexing each level of the tree starts at a power of two - -+ * good for cacheline alignment: -+ */ -+ -+static inline unsigned eytzinger1_child(unsigned i, unsigned child) -+{ -+ EBUG_ON(child > 1); -+ -+ return (i << 1) + child; -+} -+ -+static inline unsigned eytzinger1_left_child(unsigned i) -+{ -+ return eytzinger1_child(i, 0); -+} -+ -+static inline unsigned eytzinger1_right_child(unsigned i) -+{ -+ return eytzinger1_child(i, 1); -+} -+ -+static inline unsigned eytzinger1_first(unsigned size) -+{ -+ return rounddown_pow_of_two(size); -+} -+ -+static inline unsigned eytzinger1_last(unsigned size) -+{ -+ return rounddown_pow_of_two(size + 1) - 1; -+} -+ -+/* -+ * eytzinger1_next() and eytzinger1_prev() have the nice properties that -+ * -+ * eytzinger1_next(0) == eytzinger1_first()) -+ * eytzinger1_prev(0) == eytzinger1_last()) -+ * -+ * eytzinger1_prev(eytzinger1_first()) == 0 -+ * 
eytzinger1_next(eytzinger1_last()) == 0 -+ */ -+ -+static inline unsigned eytzinger1_next(unsigned i, unsigned size) -+{ -+ EBUG_ON(i > size); -+ -+ if (eytzinger1_right_child(i) <= size) { -+ i = eytzinger1_right_child(i); -+ -+ i <<= __fls(size + 1) - __fls(i); -+ i >>= i > size; -+ } else { -+ i >>= ffz(i) + 1; -+ } -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_prev(unsigned i, unsigned size) -+{ -+ EBUG_ON(i > size); -+ -+ if (eytzinger1_left_child(i) <= size) { -+ i = eytzinger1_left_child(i) + 1; -+ -+ i <<= __fls(size + 1) - __fls(i); -+ i -= 1; -+ i >>= i > size; -+ } else { -+ i >>= __ffs(i) + 1; -+ } -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_extra(unsigned size) -+{ -+ return (size + 1 - rounddown_pow_of_two(size)) << 1; -+} -+ -+static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ unsigned b = __fls(i); -+ unsigned shift = __fls(size) - b; -+ int s; -+ -+ EBUG_ON(!i || i > size); -+ -+ i ^= 1U << b; -+ i <<= 1; -+ i |= 1; -+ i <<= shift; -+ -+ /* -+ * sign bit trick: -+ * -+ * if (i > extra) -+ * i -= (i - extra) >> 1; -+ */ -+ s = extra - i; -+ i += (s >> 1) & (s >> 31); -+ -+ return i; -+} -+ -+static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ unsigned shift; -+ int s; -+ -+ EBUG_ON(!i || i > size); -+ -+ /* -+ * sign bit trick: -+ * -+ * if (i > extra) -+ * i += i - extra; -+ */ -+ s = extra - i; -+ i -= s & (s >> 31); -+ -+ shift = __ffs(i); -+ -+ i >>= shift + 1; -+ i |= 1U << (__fls(size) - shift); -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) -+{ -+ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); -+} -+ -+static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) -+{ -+ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); -+} -+ -+#define eytzinger1_for_each(_i, _size) \ -+ for ((_i) = eytzinger1_first((_size)); \ -+ (_i) != 0; \ -+ (_i) = eytzinger1_next((_i), (_size))) -+ -+/* Zero based indexing version: */ -+ -+static inline unsigned eytzinger0_child(unsigned i, unsigned child) -+{ -+ EBUG_ON(child > 1); -+ -+ return (i << 1) + 1 + child; -+} -+ -+static inline unsigned eytzinger0_left_child(unsigned i) -+{ -+ return eytzinger0_child(i, 0); -+} -+ -+static inline unsigned eytzinger0_right_child(unsigned i) -+{ -+ return eytzinger0_child(i, 1); -+} -+ -+static inline unsigned eytzinger0_first(unsigned size) -+{ -+ return eytzinger1_first(size) - 1; -+} -+ -+static inline unsigned eytzinger0_last(unsigned size) -+{ -+ return eytzinger1_last(size) - 1; -+} -+ -+static inline unsigned eytzinger0_next(unsigned i, unsigned size) -+{ -+ return eytzinger1_next(i + 1, size) - 1; -+} -+ -+static inline unsigned eytzinger0_prev(unsigned i, unsigned size) -+{ -+ return eytzinger1_prev(i + 1, size) - 1; -+} -+ -+static inline unsigned eytzinger0_extra(unsigned size) -+{ -+ return eytzinger1_extra(size); -+} -+ -+static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ return __eytzinger1_to_inorder(i + 1, size, extra) - 1; -+} -+ -+static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ return __inorder_to_eytzinger1(i + 1, size, extra) - 1; -+} -+ -+static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) -+{ -+ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); -+} -+ -+static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) -+{ -+ 
return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); -+} -+ -+#define eytzinger0_for_each(_i, _size) \ -+ for ((_i) = eytzinger0_first((_size)); \ -+ (_i) != -1; \ -+ (_i) = eytzinger0_next((_i), (_size))) -+ -+typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); -+ -+/* return greatest node <= @search, or -1 if not found */ -+static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, -+ eytzinger_cmp_fn cmp, const void *search) -+{ -+ unsigned i, n = 0; -+ -+ if (!nr) -+ return -1; -+ -+ do { -+ i = n; -+ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); -+ } while (n < nr); -+ -+ if (n & 1) { -+ /* @i was greater than @search, return previous node: */ -+ -+ if (i == eytzinger0_first(nr)) -+ return -1; -+ -+ return eytzinger0_prev(i, nr); -+ } else { -+ return i; -+ } -+} -+ -+#define eytzinger0_find(base, nr, size, _cmp, search) \ -+({ \ -+ void *_base = (base); \ -+ void *_search = (search); \ -+ size_t _nr = (nr); \ -+ size_t _size = (size); \ -+ size_t _i = 0; \ -+ int _res; \ -+ \ -+ while (_i < _nr && \ -+ (_res = _cmp(_search, _base + _i * _size, _size))) \ -+ _i = eytzinger0_child(_i, _res > 0); \ -+ _i; \ -+}) -+ -+void eytzinger0_sort(void *, size_t, size_t, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t)); -+ -+#endif /* _EYTZINGER_H */ -diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h -new file mode 100644 -index 000000000..66b945be1 ---- /dev/null -+++ b/fs/bcachefs/fifo.h -@@ -0,0 +1,127 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FIFO_H -+#define _BCACHEFS_FIFO_H -+ -+#include "util.h" -+ -+#define FIFO(type) \ -+struct { \ -+ size_t front, back, size, mask; \ -+ type *data; \ -+} -+ -+#define DECLARE_FIFO(type, name) FIFO(type) name -+ -+#define fifo_buf_size(fifo) \ -+ ((fifo)->size \ -+ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ -+ : 0) -+ -+#define init_fifo(fifo, _size, _gfp) \ -+({ \ -+ (fifo)->front = (fifo)->back = 0; \ -+ (fifo)->size = (_size); \ -+ (fifo)->mask = (fifo)->size \ -+ ? roundup_pow_of_two((fifo)->size) - 1 \ -+ : 0; \ -+ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ -+}) -+ -+#define free_fifo(fifo) \ -+do { \ -+ kvpfree((fifo)->data, fifo_buf_size(fifo)); \ -+ (fifo)->data = NULL; \ -+} while (0) -+ -+#define fifo_swap(l, r) \ -+do { \ -+ swap((l)->front, (r)->front); \ -+ swap((l)->back, (r)->back); \ -+ swap((l)->size, (r)->size); \ -+ swap((l)->mask, (r)->mask); \ -+ swap((l)->data, (r)->data); \ -+} while (0) -+ -+#define fifo_move(dest, src) \ -+do { \ -+ typeof(*((dest)->data)) _t; \ -+ while (!fifo_full(dest) && \ -+ fifo_pop(src, _t)) \ -+ fifo_push(dest, _t); \ -+} while (0) -+ -+#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) -+#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) -+ -+#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) -+#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) -+ -+#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) -+#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) -+ -+#define fifo_entry_idx_abs(fifo, p) \ -+ ((((p) >= &fifo_peek_front(fifo) \ -+ ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ -+ (((p) - (fifo)->data))) -+ -+#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) -+#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask]) -+ -+#define fifo_push_back_ref(f) \ -+ (fifo_full((f)) ? 
NULL : &(f)->data[(f)->back++ & (f)->mask]) -+ -+#define fifo_push_front_ref(f) \ -+ (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask]) -+ -+#define fifo_push_back(fifo, new) \ -+({ \ -+ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ -+ if (_r) \ -+ *_r = (new); \ -+ _r != NULL; \ -+}) -+ -+#define fifo_push_front(fifo, new) \ -+({ \ -+ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ -+ if (_r) \ -+ *_r = (new); \ -+ _r != NULL; \ -+}) -+ -+#define fifo_pop_front(fifo, i) \ -+({ \ -+ bool _r = !fifo_empty((fifo)); \ -+ if (_r) \ -+ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ -+ _r; \ -+}) -+ -+#define fifo_pop_back(fifo, i) \ -+({ \ -+ bool _r = !fifo_empty((fifo)); \ -+ if (_r) \ -+ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ -+ _r; \ -+}) -+ -+#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) -+#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) -+#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) -+#define fifo_peek(fifo) fifo_peek_front(fifo) -+ -+#define fifo_for_each_entry(_entry, _fifo, _iter) \ -+ for (typecheck(typeof((_fifo)->front), _iter), \ -+ (_iter) = (_fifo)->front; \ -+ ((_iter != (_fifo)->back) && \ -+ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ -+ (_iter)++) -+ -+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ -+ for (typecheck(typeof((_fifo)->front), _iter), \ -+ (_iter) = (_fifo)->front; \ -+ ((_iter != (_fifo)->back) && \ -+ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ -+ (_iter)++) -+ -+#endif /* _BCACHEFS_FIFO_H */ -diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c -new file mode 100644 -index 000000000..bb5305441 ---- /dev/null -+++ b/fs/bcachefs/fs-common.c -@@ -0,0 +1,501 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "acl.h" -+#include "btree_update.h" -+#include "dirent.h" -+#include "fs-common.h" -+#include "inode.h" -+#include "subvolume.h" -+#include "xattr.h" -+ -+#include -+ -+static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) -+{ -+ return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; -+} -+ -+int bch2_create_trans(struct btree_trans *trans, -+ subvol_inum dir, -+ struct bch_inode_unpacked *dir_u, -+ struct bch_inode_unpacked *new_inode, -+ const struct qstr *name, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct posix_acl *default_acl, -+ struct posix_acl *acl, -+ subvol_inum snapshot_src, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter dir_iter = { NULL }; -+ struct btree_iter inode_iter = { NULL }; -+ subvol_inum new_inum = dir; -+ u64 now = bch2_current_time(c); -+ u64 cpu = raw_smp_processor_id(); -+ u64 dir_target; -+ u32 snapshot; -+ unsigned dir_type = mode_to_type(mode); -+ int ret; -+ -+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); -+ if (ret) -+ goto err; -+ -+ if (!(flags & BCH_CREATE_SNAPSHOT)) { -+ /* Normal create path - allocate a new inode: */ -+ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); -+ -+ if (flags & BCH_CREATE_TMPFILE) -+ new_inode->bi_flags |= BCH_INODE_UNLINKED; -+ -+ ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); -+ if (ret) -+ goto err; -+ -+ snapshot_src = (subvol_inum) { 0 }; -+ } else { -+ /* -+ * Creating a snapshot - we're not allocating a new inode, but -+ * we do have to lookup the root inode of the subvolume we're -+ * snapshotting and update it (in the new 
snapshot): -+ */ -+ -+ if (!snapshot_src.inum) { -+ /* Inode wasn't specified, just snapshot: */ -+ struct bch_subvolume s; -+ -+ ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, -+ BTREE_ITER_CACHED, &s); -+ if (ret) -+ goto err; -+ -+ snapshot_src.inum = le64_to_cpu(s.inode); -+ } -+ -+ ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, -+ BTREE_ITER_INTENT); -+ if (ret) -+ goto err; -+ -+ if (new_inode->bi_subvol != snapshot_src.subvol) { -+ /* Not a subvolume root: */ -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ /* -+ * If we're not root, we have to own the subvolume being -+ * snapshotted: -+ */ -+ if (uid && new_inode->bi_uid != uid) { -+ ret = -EPERM; -+ goto err; -+ } -+ -+ flags |= BCH_CREATE_SUBVOL; -+ } -+ -+ new_inum.inum = new_inode->bi_inum; -+ dir_target = new_inode->bi_inum; -+ -+ if (flags & BCH_CREATE_SUBVOL) { -+ u32 new_subvol, dir_snapshot; -+ -+ ret = bch2_subvolume_create(trans, new_inode->bi_inum, -+ snapshot_src.subvol, -+ &new_subvol, &snapshot, -+ (flags & BCH_CREATE_SNAPSHOT_RO) != 0); -+ if (ret) -+ goto err; -+ -+ new_inode->bi_parent_subvol = dir.subvol; -+ new_inode->bi_subvol = new_subvol; -+ new_inum.subvol = new_subvol; -+ dir_target = new_subvol; -+ dir_type = DT_SUBVOL; -+ -+ ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); -+ if (ret) -+ goto err; -+ -+ bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); -+ ret = bch2_btree_iter_traverse(&dir_iter); -+ if (ret) -+ goto err; -+ } -+ -+ if (!(flags & BCH_CREATE_SNAPSHOT)) { -+ if (default_acl) { -+ ret = bch2_set_acl_trans(trans, new_inum, new_inode, -+ default_acl, ACL_TYPE_DEFAULT); -+ if (ret) -+ goto err; -+ } -+ -+ if (acl) { -+ ret = bch2_set_acl_trans(trans, new_inum, new_inode, -+ acl, ACL_TYPE_ACCESS); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ if (!(flags & BCH_CREATE_TMPFILE)) { -+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); -+ u64 dir_offset; -+ -+ if (is_subdir_for_nlink(new_inode)) -+ dir_u->bi_nlink++; -+ dir_u->bi_mtime = dir_u->bi_ctime = now; -+ -+ ret = bch2_inode_write(trans, &dir_iter, dir_u); -+ if (ret) -+ goto err; -+ -+ ret = bch2_dirent_create(trans, dir, &dir_hash, -+ dir_type, -+ name, -+ dir_target, -+ &dir_offset, -+ BCH_HASH_SET_MUST_CREATE); -+ if (ret) -+ goto err; -+ -+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { -+ new_inode->bi_dir = dir_u->bi_inum; -+ new_inode->bi_dir_offset = dir_offset; -+ } -+ } -+ -+ inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; -+ bch2_btree_iter_set_snapshot(&inode_iter, snapshot); -+ -+ ret = bch2_btree_iter_traverse(&inode_iter) ?: -+ bch2_inode_write(trans, &inode_iter, new_inode); -+err: -+ bch2_trans_iter_exit(trans, &inode_iter); -+ bch2_trans_iter_exit(trans, &dir_iter); -+ return ret; -+} -+ -+int bch2_link_trans(struct btree_trans *trans, -+ subvol_inum dir, struct bch_inode_unpacked *dir_u, -+ subvol_inum inum, struct bch_inode_unpacked *inode_u, -+ const struct qstr *name) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter dir_iter = { NULL }; -+ struct btree_iter inode_iter = { NULL }; -+ struct bch_hash_info dir_hash; -+ u64 now = bch2_current_time(c); -+ u64 dir_offset = 0; -+ int ret; -+ -+ if (dir.subvol != inum.subvol) -+ return -EXDEV; -+ -+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); -+ if (ret) -+ goto err; -+ -+ inode_u->bi_ctime = now; -+ ret = bch2_inode_nlink_inc(inode_u); -+ if (ret) -+ return ret; -+ -+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); -+ if (ret) -+ goto 
err; -+ -+ if (bch2_reinherit_attrs(inode_u, dir_u)) { -+ ret = -EXDEV; -+ goto err; -+ } -+ -+ dir_u->bi_mtime = dir_u->bi_ctime = now; -+ -+ dir_hash = bch2_hash_info_init(c, dir_u); -+ -+ ret = bch2_dirent_create(trans, dir, &dir_hash, -+ mode_to_type(inode_u->bi_mode), -+ name, inum.inum, &dir_offset, -+ BCH_HASH_SET_MUST_CREATE); -+ if (ret) -+ goto err; -+ -+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { -+ inode_u->bi_dir = dir.inum; -+ inode_u->bi_dir_offset = dir_offset; -+ } -+ -+ ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: -+ bch2_inode_write(trans, &inode_iter, inode_u); -+err: -+ bch2_trans_iter_exit(trans, &dir_iter); -+ bch2_trans_iter_exit(trans, &inode_iter); -+ return ret; -+} -+ -+int bch2_unlink_trans(struct btree_trans *trans, -+ subvol_inum dir, -+ struct bch_inode_unpacked *dir_u, -+ struct bch_inode_unpacked *inode_u, -+ const struct qstr *name, -+ bool deleting_snapshot) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter dir_iter = { NULL }; -+ struct btree_iter dirent_iter = { NULL }; -+ struct btree_iter inode_iter = { NULL }; -+ struct bch_hash_info dir_hash; -+ subvol_inum inum; -+ u64 now = bch2_current_time(c); -+ struct bkey_s_c k; -+ int ret; -+ -+ ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); -+ if (ret) -+ goto err; -+ -+ dir_hash = bch2_hash_info_init(c, dir_u); -+ -+ ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, -+ name, &inum, BTREE_ITER_INTENT); -+ if (ret) -+ goto err; -+ -+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, -+ BTREE_ITER_INTENT); -+ if (ret) -+ goto err; -+ -+ if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { -+ ret = bch2_empty_dir_trans(trans, inum); -+ if (ret) -+ goto err; -+ } -+ -+ if (deleting_snapshot && !inode_u->bi_subvol) { -+ ret = -BCH_ERR_ENOENT_not_subvol; -+ goto err; -+ } -+ -+ if (deleting_snapshot || inode_u->bi_subvol) { -+ ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); -+ if (ret) -+ goto err; -+ -+ k = bch2_btree_iter_peek_slot(&dirent_iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ /* -+ * If we're deleting a subvolume, we need to really delete the -+ * dirent, not just emit a whiteout in the current snapshot: -+ */ -+ bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); -+ ret = bch2_btree_iter_traverse(&dirent_iter); -+ if (ret) -+ goto err; -+ } else { -+ bch2_inode_nlink_dec(trans, inode_u); -+ } -+ -+ if (inode_u->bi_dir == dirent_iter.pos.inode && -+ inode_u->bi_dir_offset == dirent_iter.pos.offset) { -+ inode_u->bi_dir = 0; -+ inode_u->bi_dir_offset = 0; -+ } -+ -+ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; -+ dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); -+ -+ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, -+ &dir_hash, &dirent_iter, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: -+ bch2_inode_write(trans, &dir_iter, dir_u) ?: -+ bch2_inode_write(trans, &inode_iter, inode_u); -+err: -+ bch2_trans_iter_exit(trans, &inode_iter); -+ bch2_trans_iter_exit(trans, &dirent_iter); -+ bch2_trans_iter_exit(trans, &dir_iter); -+ return ret; -+} -+ -+bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, -+ struct bch_inode_unpacked *src_u) -+{ -+ u64 src, dst; -+ unsigned id; -+ bool ret = false; -+ -+ for (id = 0; id < Inode_opt_nr; id++) { -+ /* Skip attributes that were explicitly set on this inode */ -+ if (dst_u->bi_fields_set & (1 << id)) -+ continue; -+ -+ src = bch2_inode_opt_get(src_u, id); -+ dst = bch2_inode_opt_get(dst_u, id); -+ -+ if (src == dst) 
-+ continue; -+ -+ bch2_inode_opt_set(dst_u, id, src); -+ ret = true; -+ } -+ -+ return ret; -+} -+ -+int bch2_rename_trans(struct btree_trans *trans, -+ subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, -+ subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, -+ struct bch_inode_unpacked *src_inode_u, -+ struct bch_inode_unpacked *dst_inode_u, -+ const struct qstr *src_name, -+ const struct qstr *dst_name, -+ enum bch_rename_mode mode) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter src_dir_iter = { NULL }; -+ struct btree_iter dst_dir_iter = { NULL }; -+ struct btree_iter src_inode_iter = { NULL }; -+ struct btree_iter dst_inode_iter = { NULL }; -+ struct bch_hash_info src_hash, dst_hash; -+ subvol_inum src_inum, dst_inum; -+ u64 src_offset, dst_offset; -+ u64 now = bch2_current_time(c); -+ int ret; -+ -+ ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, -+ BTREE_ITER_INTENT); -+ if (ret) -+ goto err; -+ -+ src_hash = bch2_hash_info_init(c, src_dir_u); -+ -+ if (dst_dir.inum != src_dir.inum || -+ dst_dir.subvol != src_dir.subvol) { -+ ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, -+ BTREE_ITER_INTENT); -+ if (ret) -+ goto err; -+ -+ dst_hash = bch2_hash_info_init(c, dst_dir_u); -+ } else { -+ dst_dir_u = src_dir_u; -+ dst_hash = src_hash; -+ } -+ -+ ret = bch2_dirent_rename(trans, -+ src_dir, &src_hash, -+ dst_dir, &dst_hash, -+ src_name, &src_inum, &src_offset, -+ dst_name, &dst_inum, &dst_offset, -+ mode); -+ if (ret) -+ goto err; -+ -+ ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, -+ BTREE_ITER_INTENT); -+ if (ret) -+ goto err; -+ -+ if (dst_inum.inum) { -+ ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, -+ BTREE_ITER_INTENT); -+ if (ret) -+ goto err; -+ } -+ -+ if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { -+ src_inode_u->bi_dir = dst_dir_u->bi_inum; -+ src_inode_u->bi_dir_offset = dst_offset; -+ -+ if (mode == BCH_RENAME_EXCHANGE) { -+ dst_inode_u->bi_dir = src_dir_u->bi_inum; -+ dst_inode_u->bi_dir_offset = src_offset; -+ } -+ -+ if (mode == BCH_RENAME_OVERWRITE && -+ dst_inode_u->bi_dir == dst_dir_u->bi_inum && -+ dst_inode_u->bi_dir_offset == src_offset) { -+ dst_inode_u->bi_dir = 0; -+ dst_inode_u->bi_dir_offset = 0; -+ } -+ } -+ -+ if (mode == BCH_RENAME_OVERWRITE) { -+ if (S_ISDIR(src_inode_u->bi_mode) != -+ S_ISDIR(dst_inode_u->bi_mode)) { -+ ret = -ENOTDIR; -+ goto err; -+ } -+ -+ if (S_ISDIR(dst_inode_u->bi_mode) && -+ bch2_empty_dir_trans(trans, dst_inum)) { -+ ret = -ENOTEMPTY; -+ goto err; -+ } -+ } -+ -+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && -+ S_ISDIR(src_inode_u->bi_mode)) { -+ ret = -EXDEV; -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_EXCHANGE && -+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && -+ S_ISDIR(dst_inode_u->bi_mode)) { -+ ret = -EXDEV; -+ goto err; -+ } -+ -+ if (is_subdir_for_nlink(src_inode_u)) { -+ src_dir_u->bi_nlink--; -+ dst_dir_u->bi_nlink++; -+ } -+ -+ if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { -+ dst_dir_u->bi_nlink--; -+ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; -+ } -+ -+ if (mode == BCH_RENAME_OVERWRITE) -+ bch2_inode_nlink_dec(trans, dst_inode_u); -+ -+ src_dir_u->bi_mtime = now; -+ src_dir_u->bi_ctime = now; -+ -+ if (src_dir.inum != dst_dir.inum) { -+ dst_dir_u->bi_mtime = now; -+ dst_dir_u->bi_ctime = now; -+ } -+ -+ src_inode_u->bi_ctime = now; -+ -+ if (dst_inum.inum) -+ dst_inode_u->bi_ctime = now; -+ -+ ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: -+ 
(src_dir.inum != dst_dir.inum -+ ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) -+ : 0) ?: -+ bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: -+ (dst_inum.inum -+ ? bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) -+ : 0); -+err: -+ bch2_trans_iter_exit(trans, &dst_inode_iter); -+ bch2_trans_iter_exit(trans, &src_inode_iter); -+ bch2_trans_iter_exit(trans, &dst_dir_iter); -+ bch2_trans_iter_exit(trans, &src_dir_iter); -+ return ret; -+} -diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h -new file mode 100644 -index 000000000..dde237859 ---- /dev/null -+++ b/fs/bcachefs/fs-common.h -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_COMMON_H -+#define _BCACHEFS_FS_COMMON_H -+ -+struct posix_acl; -+ -+#define BCH_CREATE_TMPFILE (1U << 0) -+#define BCH_CREATE_SUBVOL (1U << 1) -+#define BCH_CREATE_SNAPSHOT (1U << 2) -+#define BCH_CREATE_SNAPSHOT_RO (1U << 3) -+ -+int bch2_create_trans(struct btree_trans *, subvol_inum, -+ struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *, -+ uid_t, gid_t, umode_t, dev_t, -+ struct posix_acl *, -+ struct posix_acl *, -+ subvol_inum, unsigned); -+ -+int bch2_link_trans(struct btree_trans *, -+ subvol_inum, struct bch_inode_unpacked *, -+ subvol_inum, struct bch_inode_unpacked *, -+ const struct qstr *); -+ -+int bch2_unlink_trans(struct btree_trans *, subvol_inum, -+ struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *, bool); -+ -+int bch2_rename_trans(struct btree_trans *, -+ subvol_inum, struct bch_inode_unpacked *, -+ subvol_inum, struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *, -+ const struct qstr *, -+ enum bch_rename_mode); -+ -+bool bch2_reinherit_attrs(struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *); -+ -+#endif /* _BCACHEFS_FS_COMMON_H */ -diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c -new file mode 100644 -index 000000000..dc22182d5 ---- /dev/null -+++ b/fs/bcachefs/fs-io-buffered.c -@@ -0,0 +1,1099 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_buf.h" -+#include "fs-io.h" -+#include "fs-io-buffered.h" -+#include "fs-io-direct.h" -+#include "fs-io-pagecache.h" -+#include "io.h" -+ -+#include -+#include -+#include -+ -+static inline bool bio_full(struct bio *bio, unsigned len) -+{ -+ if (bio->bi_vcnt >= bio->bi_max_vecs) -+ return true; -+ if (bio->bi_iter.bi_size > UINT_MAX - len) -+ return true; -+ return false; -+} -+ -+/* readpage(s): */ -+ -+static void bch2_readpages_end_io(struct bio *bio) -+{ -+ struct folio_iter fi; -+ -+ bio_for_each_folio_all(fi, bio) { -+ if (!bio->bi_status) { -+ folio_mark_uptodate(fi.folio); -+ } else { -+ folio_clear_uptodate(fi.folio); -+ folio_set_error(fi.folio); -+ } -+ folio_unlock(fi.folio); -+ } -+ -+ bio_put(bio); -+} -+ -+struct readpages_iter { -+ struct address_space *mapping; -+ unsigned idx; -+ folios folios; -+}; -+ -+static int readpages_iter_init(struct readpages_iter *iter, -+ struct readahead_control *ractl) -+{ -+ struct folio **fi; -+ int ret; -+ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->mapping = ractl->mapping; -+ -+ ret = bch2_filemap_get_contig_folios_d(iter->mapping, -+ ractl->_index << PAGE_SHIFT, -+ (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, -+ 0, mapping_gfp_mask(iter->mapping), -+ &iter->folios); -+ if (ret) -+ return ret; -+ -+ 
darray_for_each(iter->folios, fi) { -+ ractl->_nr_pages -= 1U << folio_order(*fi); -+ __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); -+ folio_put(*fi); -+ folio_put(*fi); -+ } -+ -+ return 0; -+} -+ -+static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) -+{ -+ if (iter->idx >= iter->folios.nr) -+ return NULL; -+ return iter->folios.data[iter->idx]; -+} -+ -+static inline void readpage_iter_advance(struct readpages_iter *iter) -+{ -+ iter->idx++; -+} -+ -+static bool extent_partial_reads_expensive(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct bch_extent_crc_unpacked crc; -+ const union bch_extent_entry *i; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, i) -+ if (crc.csum_type || crc.compression_type) -+ return true; -+ return false; -+} -+ -+static int readpage_bio_extend(struct btree_trans *trans, -+ struct readpages_iter *iter, -+ struct bio *bio, -+ unsigned sectors_this_extent, -+ bool get_more) -+{ -+ /* Don't hold btree locks while allocating memory: */ -+ bch2_trans_unlock(trans); -+ -+ while (bio_sectors(bio) < sectors_this_extent && -+ bio->bi_vcnt < bio->bi_max_vecs) { -+ struct folio *folio = readpage_iter_peek(iter); -+ int ret; -+ -+ if (folio) { -+ readpage_iter_advance(iter); -+ } else { -+ pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; -+ -+ if (!get_more) -+ break; -+ -+ folio = xa_load(&iter->mapping->i_pages, folio_offset); -+ if (folio && !xa_is_value(folio)) -+ break; -+ -+ folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); -+ if (!folio) -+ break; -+ -+ if (!__bch2_folio_create(folio, GFP_KERNEL)) { -+ folio_put(folio); -+ break; -+ } -+ -+ ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); -+ if (ret) { -+ __bch2_folio_release(folio); -+ folio_put(folio); -+ break; -+ } -+ -+ folio_put(folio); -+ } -+ -+ BUG_ON(folio_sector(folio) != bio_end_sector(bio)); -+ -+ BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); -+ } -+ -+ return bch2_trans_relock(trans); -+} -+ -+static void bchfs_read(struct btree_trans *trans, -+ struct bch_read_bio *rbio, -+ subvol_inum inum, -+ struct readpages_iter *readpages_iter) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_buf sk; -+ int flags = BCH_READ_RETRY_IF_STALE| -+ BCH_READ_MAY_PROMOTE; -+ u32 snapshot; -+ int ret = 0; -+ -+ rbio->c = c; -+ rbio->start_time = local_clock(); -+ rbio->subvol = inum.subvol; -+ -+ bch2_bkey_buf_init(&sk); -+retry: -+ bch2_trans_begin(trans); -+ iter = (struct btree_iter) { NULL }; -+ -+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, -+ SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), -+ BTREE_ITER_SLOTS); -+ while (1) { -+ struct bkey_s_c k; -+ unsigned bytes, sectors, offset_into_extent; -+ enum btree_id data_btree = BTREE_ID_extents; -+ -+ /* -+ * read_extent -> io_time_reset may cause a transaction restart -+ * without returning an error, we need to check for that here: -+ */ -+ ret = bch2_trans_relock(trans); -+ if (ret) -+ break; -+ -+ bch2_btree_iter_set_pos(&iter, -+ POS(inum.inum, rbio->bio.bi_iter.bi_sector)); -+ -+ k = bch2_btree_iter_peek_slot(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ offset_into_extent = iter.pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ -+ ret = bch2_read_indirect_extent(trans, &data_btree, -+ &offset_into_extent, 
&sk); -+ if (ret) -+ break; -+ -+ k = bkey_i_to_s_c(sk.k); -+ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ if (readpages_iter) { -+ ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, -+ extent_partial_reads_expensive(k)); -+ if (ret) -+ break; -+ } -+ -+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ -+ if (rbio->bio.bi_iter.bi_size == bytes) -+ flags |= BCH_READ_LAST_FRAGMENT; -+ -+ bch2_bio_page_state_set(&rbio->bio, k); -+ -+ bch2_read_extent(trans, rbio, iter.pos, -+ data_btree, k, offset_into_extent, flags); -+ -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ break; -+ -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ bio_advance(&rbio->bio, bytes); -+ -+ ret = btree_trans_too_many_iters(trans); -+ if (ret) -+ break; -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ if (ret) { -+ bch_err_inum_offset_ratelimited(c, -+ iter.pos.inode, -+ iter.pos.offset << 9, -+ "read error %i from btree lookup", ret); -+ rbio->bio.bi_status = BLK_STS_IOERR; -+ bio_endio(&rbio->bio); -+ } -+ -+ bch2_bkey_buf_exit(&sk, c); -+} -+ -+void bch2_readahead(struct readahead_control *ractl) -+{ -+ struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_io_opts opts; -+ struct btree_trans trans; -+ struct folio *folio; -+ struct readpages_iter readpages_iter; -+ int ret; -+ -+ bch2_inode_opts_get(&opts, c, &inode->ei_inode); -+ -+ ret = readpages_iter_init(&readpages_iter, ractl); -+ BUG_ON(ret); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ bch2_pagecache_add_get(inode); -+ -+ while ((folio = readpage_iter_peek(&readpages_iter))) { -+ unsigned n = min_t(unsigned, -+ readpages_iter.folios.nr - -+ readpages_iter.idx, -+ BIO_MAX_VECS); -+ struct bch_read_bio *rbio = -+ rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, -+ GFP_KERNEL, &c->bio_read), -+ opts); -+ -+ readpage_iter_advance(&readpages_iter); -+ -+ rbio->bio.bi_iter.bi_sector = folio_sector(folio); -+ rbio->bio.bi_end_io = bch2_readpages_end_io; -+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); -+ -+ bchfs_read(&trans, rbio, inode_inum(inode), -+ &readpages_iter); -+ bch2_trans_unlock(&trans); -+ } -+ -+ bch2_pagecache_add_put(inode); -+ -+ bch2_trans_exit(&trans); -+ darray_exit(&readpages_iter.folios); -+} -+ -+static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, -+ subvol_inum inum, struct folio *folio) -+{ -+ struct btree_trans trans; -+ -+ bch2_folio_create(folio, __GFP_NOFAIL); -+ -+ rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; -+ rbio->bio.bi_iter.bi_sector = folio_sector(folio); -+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ bchfs_read(&trans, rbio, inum, NULL); -+ bch2_trans_exit(&trans); -+} -+ -+static void bch2_read_single_folio_end_io(struct bio *bio) -+{ -+ complete(bio->bi_private); -+} -+ -+int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_read_bio *rbio; -+ struct bch_io_opts opts; -+ int ret; -+ DECLARE_COMPLETION_ONSTACK(done); -+ -+ bch2_inode_opts_get(&opts, c, &inode->ei_inode); -+ -+ rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), -+ opts); -+ rbio->bio.bi_private = &done; -+ rbio->bio.bi_end_io = 
bch2_read_single_folio_end_io; -+ -+ __bchfs_readfolio(c, rbio, inode_inum(inode), folio); -+ wait_for_completion(&done); -+ -+ ret = blk_status_to_errno(rbio->bio.bi_status); -+ bio_put(&rbio->bio); -+ -+ if (ret < 0) -+ return ret; -+ -+ folio_mark_uptodate(folio); -+ return 0; -+} -+ -+int bch2_read_folio(struct file *file, struct folio *folio) -+{ -+ int ret; -+ -+ ret = bch2_read_single_folio(folio, folio->mapping); -+ folio_unlock(folio); -+ return bch2_err_class(ret); -+} -+ -+/* writepages: */ -+ -+struct bch_writepage_io { -+ struct bch_inode_info *inode; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ -+struct bch_writepage_state { -+ struct bch_writepage_io *io; -+ struct bch_io_opts opts; -+ struct bch_folio_sector *tmp; -+ unsigned tmp_sectors; -+}; -+ -+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, -+ struct bch_inode_info *inode) -+{ -+ struct bch_writepage_state ret = { 0 }; -+ -+ bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); -+ return ret; -+} -+ -+static void bch2_writepage_io_done(struct bch_write_op *op) -+{ -+ struct bch_writepage_io *io = -+ container_of(op, struct bch_writepage_io, op); -+ struct bch_fs *c = io->op.c; -+ struct bio *bio = &io->op.wbio.bio; -+ struct folio_iter fi; -+ unsigned i; -+ -+ if (io->op.error) { -+ set_bit(EI_INODE_ERROR, &io->inode->ei_flags); -+ -+ bio_for_each_folio_all(fi, bio) { -+ struct bch_folio *s; -+ -+ folio_set_error(fi.folio); -+ mapping_set_error(fi.folio->mapping, -EIO); -+ -+ s = __bch2_folio(fi.folio); -+ spin_lock(&s->lock); -+ for (i = 0; i < folio_sectors(fi.folio); i++) -+ s->s[i].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ } -+ -+ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { -+ bio_for_each_folio_all(fi, bio) { -+ struct bch_folio *s; -+ -+ s = __bch2_folio(fi.folio); -+ spin_lock(&s->lock); -+ for (i = 0; i < folio_sectors(fi.folio); i++) -+ s->s[i].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ } -+ -+ /* -+ * racing with fallocate can cause us to add fewer sectors than -+ * expected - but we shouldn't add more sectors than expected: -+ */ -+ WARN_ON_ONCE(io->op.i_sectors_delta > 0); -+ -+ /* -+ * (error (due to going RO) halfway through a page can screw that up -+ * slightly) -+ * XXX wtf? 
-+ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); -+ */ -+ -+ /* -+ * PageWriteback is effectively our ref on the inode - fixup i_blocks -+ * before calling end_page_writeback: -+ */ -+ bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); -+ -+ bio_for_each_folio_all(fi, bio) { -+ struct bch_folio *s = __bch2_folio(fi.folio); -+ -+ if (atomic_dec_and_test(&s->write_count)) -+ folio_end_writeback(fi.folio); -+ } -+ -+ bio_put(&io->op.wbio.bio); -+} -+ -+static void bch2_writepage_do_io(struct bch_writepage_state *w) -+{ -+ struct bch_writepage_io *io = w->io; -+ -+ w->io = NULL; -+ closure_call(&io->op.cl, bch2_write, NULL, NULL); -+} -+ -+/* -+ * Get a bch_writepage_io and add @page to it - appending to an existing one if -+ * possible, else allocating a new one: -+ */ -+static void bch2_writepage_io_alloc(struct bch_fs *c, -+ struct writeback_control *wbc, -+ struct bch_writepage_state *w, -+ struct bch_inode_info *inode, -+ u64 sector, -+ unsigned nr_replicas) -+{ -+ struct bch_write_op *op; -+ -+ w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, -+ REQ_OP_WRITE, -+ GFP_KERNEL, -+ &c->writepage_bioset), -+ struct bch_writepage_io, op.wbio.bio); -+ -+ w->io->inode = inode; -+ op = &w->io->op; -+ bch2_write_op_init(op, c, w->opts); -+ op->target = w->opts.foreground_target; -+ op->nr_replicas = nr_replicas; -+ op->res.nr_replicas = nr_replicas; -+ op->write_point = writepoint_hashed(inode->ei_last_dirtied); -+ op->subvol = inode->ei_subvol; -+ op->pos = POS(inode->v.i_ino, sector); -+ op->end_io = bch2_writepage_io_done; -+ op->devs_need_flush = &inode->ei_devs_need_flush; -+ op->wbio.bio.bi_iter.bi_sector = sector; -+ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); -+} -+ -+static int __bch2_writepage(struct folio *folio, -+ struct writeback_control *wbc, -+ void *data) -+{ -+ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_writepage_state *w = data; -+ struct bch_folio *s; -+ unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; -+ loff_t i_size = i_size_read(&inode->v); -+ int ret; -+ -+ EBUG_ON(!folio_test_uptodate(folio)); -+ -+ /* Is the folio fully inside i_size? */ -+ if (folio_end_pos(folio) <= i_size) -+ goto do_io; -+ -+ /* Is the folio fully outside i_size? (truncate in progress) */ -+ if (folio_pos(folio) >= i_size) { -+ folio_unlock(folio); -+ return 0; -+ } -+ -+ /* -+ * The folio straddles i_size. It must be zeroed out on each and every -+ * writepage invocation because it may be mmapped. "A file is mapped -+ * in multiples of the folio size. For a file that is not a multiple of -+ * the folio size, the remaining memory is zeroed when mapped, and -+ * writes to that region are not written out to the file." 
-+ */ -+ folio_zero_segment(folio, -+ i_size - folio_pos(folio), -+ folio_size(folio)); -+do_io: -+ f_sectors = folio_sectors(folio); -+ s = bch2_folio(folio); -+ -+ if (f_sectors > w->tmp_sectors) { -+ kfree(w->tmp); -+ w->tmp = kcalloc(f_sectors, sizeof(struct bch_folio_sector), __GFP_NOFAIL); -+ w->tmp_sectors = f_sectors; -+ } -+ -+ /* -+ * Things get really hairy with errors during writeback: -+ */ -+ ret = bch2_get_folio_disk_reservation(c, inode, folio, false); -+ BUG_ON(ret); -+ -+ /* Before unlocking the page, get copy of reservations: */ -+ spin_lock(&s->lock); -+ memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); -+ -+ for (i = 0; i < f_sectors; i++) { -+ if (s->s[i].state < SECTOR_dirty) -+ continue; -+ -+ nr_replicas_this_write = -+ min_t(unsigned, nr_replicas_this_write, -+ s->s[i].nr_replicas + -+ s->s[i].replicas_reserved); -+ } -+ -+ for (i = 0; i < f_sectors; i++) { -+ if (s->s[i].state < SECTOR_dirty) -+ continue; -+ -+ s->s[i].nr_replicas = w->opts.compression -+ ? 0 : nr_replicas_this_write; -+ -+ s->s[i].replicas_reserved = 0; -+ bch2_folio_sector_set(folio, s, i, SECTOR_allocated); -+ } -+ spin_unlock(&s->lock); -+ -+ BUG_ON(atomic_read(&s->write_count)); -+ atomic_set(&s->write_count, 1); -+ -+ BUG_ON(folio_test_writeback(folio)); -+ folio_start_writeback(folio); -+ -+ folio_unlock(folio); -+ -+ offset = 0; -+ while (1) { -+ unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; -+ u64 sector; -+ -+ while (offset < f_sectors && -+ w->tmp[offset].state < SECTOR_dirty) -+ offset++; -+ -+ if (offset == f_sectors) -+ break; -+ -+ while (offset + sectors < f_sectors && -+ w->tmp[offset + sectors].state >= SECTOR_dirty) { -+ reserved_sectors += w->tmp[offset + sectors].replicas_reserved; -+ dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; -+ sectors++; -+ } -+ BUG_ON(!sectors); -+ -+ sector = folio_sector(folio) + offset; -+ -+ if (w->io && -+ (w->io->op.res.nr_replicas != nr_replicas_this_write || -+ bio_full(&w->io->op.wbio.bio, sectors << 9) || -+ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= -+ (BIO_MAX_VECS * PAGE_SIZE) || -+ bio_end_sector(&w->io->op.wbio.bio) != sector)) -+ bch2_writepage_do_io(w); -+ -+ if (!w->io) -+ bch2_writepage_io_alloc(c, wbc, w, inode, sector, -+ nr_replicas_this_write); -+ -+ atomic_inc(&s->write_count); -+ -+ BUG_ON(inode != w->io->inode); -+ BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, -+ sectors << 9, offset << 9)); -+ -+ /* Check for writing past i_size: */ -+ WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > -+ round_up(i_size, block_bytes(c)) && -+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), -+ "writing past i_size: %llu > %llu (unrounded %llu)\n", -+ bio_end_sector(&w->io->op.wbio.bio) << 9, -+ round_up(i_size, block_bytes(c)), -+ i_size); -+ -+ w->io->op.res.sectors += reserved_sectors; -+ w->io->op.i_sectors_delta -= dirty_sectors; -+ w->io->op.new_i_size = i_size; -+ -+ offset += sectors; -+ } -+ -+ if (atomic_dec_and_test(&s->write_count)) -+ folio_end_writeback(folio); -+ -+ return 0; -+} -+ -+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) -+{ -+ struct bch_fs *c = mapping->host->i_sb->s_fs_info; -+ struct bch_writepage_state w = -+ bch_writepage_state_init(c, to_bch_ei(mapping->host)); -+ struct blk_plug plug; -+ int ret; -+ -+ blk_start_plug(&plug); -+ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); -+ if (w.io) -+ bch2_writepage_do_io(&w); -+ blk_finish_plug(&plug); -+ kfree(w.tmp); -+ return bch2_err_class(ret); -+} -+ 
-+/* buffered writes: */ -+ -+int bch2_write_begin(struct file *file, struct address_space *mapping, -+ loff_t pos, unsigned len, -+ struct page **pagep, void **fsdata) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_folio_reservation *res; -+ struct folio *folio; -+ unsigned offset; -+ int ret = -ENOMEM; -+ -+ res = kmalloc(sizeof(*res), GFP_KERNEL); -+ if (!res) -+ return -ENOMEM; -+ -+ bch2_folio_reservation_init(c, inode, res); -+ *fsdata = res; -+ -+ bch2_pagecache_add_get(inode); -+ -+ folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, -+ FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, -+ mapping_gfp_mask(mapping)); -+ if (IS_ERR_OR_NULL(folio)) -+ goto err_unlock; -+ -+ if (folio_test_uptodate(folio)) -+ goto out; -+ -+ offset = pos - folio_pos(folio); -+ len = min_t(size_t, len, folio_end_pos(folio) - pos); -+ -+ /* If we're writing entire folio, don't need to read it in first: */ -+ if (!offset && len == folio_size(folio)) -+ goto out; -+ -+ if (!offset && pos + len >= inode->v.i_size) { -+ folio_zero_segment(folio, len, folio_size(folio)); -+ flush_dcache_folio(folio); -+ goto out; -+ } -+ -+ if (folio_pos(folio) >= inode->v.i_size) { -+ folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); -+ flush_dcache_folio(folio); -+ goto out; -+ } -+readpage: -+ ret = bch2_read_single_folio(folio, mapping); -+ if (ret) -+ goto err; -+out: -+ ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); -+ if (ret) -+ goto err; -+ -+ ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); -+ if (ret) { -+ if (!folio_test_uptodate(folio)) { -+ /* -+ * If the folio hasn't been read in, we won't know if we -+ * actually need a reservation - we don't actually need -+ * to read here, we just need to check if the folio is -+ * fully backed by uncompressed data: -+ */ -+ goto readpage; -+ } -+ -+ goto err; -+ } -+ -+ *pagep = &folio->page; -+ return 0; -+err: -+ folio_unlock(folio); -+ folio_put(folio); -+ *pagep = NULL; -+err_unlock: -+ bch2_pagecache_add_put(inode); -+ kfree(res); -+ *fsdata = NULL; -+ return bch2_err_class(ret); -+} -+ -+int bch2_write_end(struct file *file, struct address_space *mapping, -+ loff_t pos, unsigned len, unsigned copied, -+ struct page *page, void *fsdata) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_folio_reservation *res = fsdata; -+ struct folio *folio = page_folio(page); -+ unsigned offset = pos - folio_pos(folio); -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ BUG_ON(offset + copied > folio_size(folio)); -+ -+ if (unlikely(copied < len && !folio_test_uptodate(folio))) { -+ /* -+ * The folio needs to be read in, but that would destroy -+ * our partial write - simplest thing is to just force -+ * userspace to redo the write: -+ */ -+ folio_zero_range(folio, 0, folio_size(folio)); -+ flush_dcache_folio(folio); -+ copied = 0; -+ } -+ -+ spin_lock(&inode->v.i_lock); -+ if (pos + copied > inode->v.i_size) -+ i_size_write(&inode->v, pos + copied); -+ spin_unlock(&inode->v.i_lock); -+ -+ if (copied) { -+ if (!folio_test_uptodate(folio)) -+ folio_mark_uptodate(folio); -+ -+ bch2_set_folio_dirty(c, inode, folio, res, offset, copied); -+ -+ inode->ei_last_dirtied = (unsigned long) current; -+ } -+ -+ folio_unlock(folio); -+ folio_put(folio); -+ bch2_pagecache_add_put(inode); -+ -+ bch2_folio_reservation_put(c, inode, res); -+ kfree(res); -+ -+ return copied; -+} -+ -+static noinline void 
folios_trunc(folios *folios, struct folio **fi) -+{ -+ while (folios->data + folios->nr > fi) { -+ struct folio *f = darray_pop(folios); -+ -+ folio_unlock(f); -+ folio_put(f); -+ } -+} -+ -+static int __bch2_buffered_write(struct bch_inode_info *inode, -+ struct address_space *mapping, -+ struct iov_iter *iter, -+ loff_t pos, unsigned len) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_folio_reservation res; -+ folios folios; -+ struct folio **fi, *f; -+ unsigned copied = 0, f_offset; -+ u64 end = pos + len, f_pos; -+ loff_t last_folio_pos = inode->v.i_size; -+ int ret = 0; -+ -+ BUG_ON(!len); -+ -+ bch2_folio_reservation_init(c, inode, &res); -+ darray_init(&folios); -+ -+ ret = bch2_filemap_get_contig_folios_d(mapping, pos, end, -+ FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, -+ mapping_gfp_mask(mapping), -+ &folios); -+ if (ret) -+ goto out; -+ -+ BUG_ON(!folios.nr); -+ -+ f = darray_first(folios); -+ if (pos != folio_pos(f) && !folio_test_uptodate(f)) { -+ ret = bch2_read_single_folio(f, mapping); -+ if (ret) -+ goto out; -+ } -+ -+ f = darray_last(folios); -+ end = min(end, folio_end_pos(f)); -+ last_folio_pos = folio_pos(f); -+ if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { -+ if (end >= inode->v.i_size) { -+ folio_zero_range(f, 0, folio_size(f)); -+ } else { -+ ret = bch2_read_single_folio(f, mapping); -+ if (ret) -+ goto out; -+ } -+ } -+ -+ ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); -+ if (ret) -+ goto out; -+ -+ f_pos = pos; -+ f_offset = pos - folio_pos(darray_first(folios)); -+ darray_for_each(folios, fi) { -+ struct folio *f = *fi; -+ u64 f_len = min(end, folio_end_pos(f)) - f_pos; -+ -+ /* -+ * XXX: per POSIX and fstests generic/275, on -ENOSPC we're -+ * supposed to write as much as we have disk space for. 
-+ * -+ * On failure here we should still write out a partial page if -+ * we aren't completely out of disk space - we don't do that -+ * yet: -+ */ -+ ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); -+ if (unlikely(ret)) { -+ folios_trunc(&folios, fi); -+ if (!folios.nr) -+ goto out; -+ -+ end = min(end, folio_end_pos(darray_last(folios))); -+ break; -+ } -+ -+ f_pos = folio_end_pos(f); -+ f_offset = 0; -+ } -+ -+ if (mapping_writably_mapped(mapping)) -+ darray_for_each(folios, fi) -+ flush_dcache_folio(*fi); -+ -+ f_pos = pos; -+ f_offset = pos - folio_pos(darray_first(folios)); -+ darray_for_each(folios, fi) { -+ struct folio *f = *fi; -+ u64 f_len = min(end, folio_end_pos(f)) - f_pos; -+ unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); -+ -+ if (!f_copied) { -+ folios_trunc(&folios, fi); -+ break; -+ } -+ -+ if (!folio_test_uptodate(f) && -+ f_copied != folio_size(f) && -+ pos + copied + f_copied < inode->v.i_size) { -+ iov_iter_revert(iter, f_copied); -+ folio_zero_range(f, 0, folio_size(f)); -+ folios_trunc(&folios, fi); -+ break; -+ } -+ -+ flush_dcache_folio(f); -+ copied += f_copied; -+ -+ if (f_copied != f_len) { -+ folios_trunc(&folios, fi + 1); -+ break; -+ } -+ -+ f_pos = folio_end_pos(f); -+ f_offset = 0; -+ } -+ -+ if (!copied) -+ goto out; -+ -+ end = pos + copied; -+ -+ spin_lock(&inode->v.i_lock); -+ if (end > inode->v.i_size) -+ i_size_write(&inode->v, end); -+ spin_unlock(&inode->v.i_lock); -+ -+ f_pos = pos; -+ f_offset = pos - folio_pos(darray_first(folios)); -+ darray_for_each(folios, fi) { -+ struct folio *f = *fi; -+ u64 f_len = min(end, folio_end_pos(f)) - f_pos; -+ -+ if (!folio_test_uptodate(f)) -+ folio_mark_uptodate(f); -+ -+ bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); -+ -+ f_pos = folio_end_pos(f); -+ f_offset = 0; -+ } -+ -+ inode->ei_last_dirtied = (unsigned long) current; -+out: -+ darray_for_each(folios, fi) { -+ folio_unlock(*fi); -+ folio_put(*fi); -+ } -+ -+ /* -+ * If the last folio added to the mapping starts beyond current EOF, we -+ * performed a short write but left around at least one post-EOF folio. -+ * Clean up the mapping before we return. -+ */ -+ if (last_folio_pos >= inode->v.i_size) -+ truncate_pagecache(&inode->v, inode->v.i_size); -+ -+ darray_exit(&folios); -+ bch2_folio_reservation_put(c, inode, &res); -+ -+ return copied ?: ret; -+} -+ -+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) -+{ -+ struct file *file = iocb->ki_filp; -+ struct address_space *mapping = file->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ loff_t pos = iocb->ki_pos; -+ ssize_t written = 0; -+ int ret = 0; -+ -+ bch2_pagecache_add_get(inode); -+ -+ do { -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ unsigned bytes = iov_iter_count(iter); -+again: -+ /* -+ * Bring in the user page that we will copy from _first_. -+ * Otherwise there's a nasty deadlock on copying from the -+ * same page as we're writing to, without it being marked -+ * up-to-date. -+ * -+ * Not only is this an optimisation, but it is also required -+ * to check that the address is actually valid, when atomic -+ * usercopies are used, below. 
-+ */ -+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { -+ bytes = min_t(unsigned long, iov_iter_count(iter), -+ PAGE_SIZE - offset); -+ -+ if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { -+ ret = -EFAULT; -+ break; -+ } -+ } -+ -+ if (unlikely(fatal_signal_pending(current))) { -+ ret = -EINTR; -+ break; -+ } -+ -+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); -+ if (unlikely(ret < 0)) -+ break; -+ -+ cond_resched(); -+ -+ if (unlikely(ret == 0)) { -+ /* -+ * If we were unable to copy any data at all, we must -+ * fall back to a single segment length write. -+ * -+ * If we didn't fallback here, we could livelock -+ * because not all segments in the iov can be copied at -+ * once without a pagefault. -+ */ -+ bytes = min_t(unsigned long, PAGE_SIZE - offset, -+ iov_iter_single_seg_count(iter)); -+ goto again; -+ } -+ pos += ret; -+ written += ret; -+ ret = 0; -+ -+ balance_dirty_pages_ratelimited(mapping); -+ } while (iov_iter_count(iter)); -+ -+ bch2_pagecache_add_put(inode); -+ -+ return written ? written : ret; -+} -+ -+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -+{ -+ struct file *file = iocb->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ ssize_t ret; -+ -+ if (iocb->ki_flags & IOCB_DIRECT) { -+ ret = bch2_direct_write(iocb, from); -+ goto out; -+ } -+ -+ inode_lock(&inode->v); -+ -+ ret = generic_write_checks(iocb, from); -+ if (ret <= 0) -+ goto unlock; -+ -+ ret = file_remove_privs(file); -+ if (ret) -+ goto unlock; -+ -+ ret = file_update_time(file); -+ if (ret) -+ goto unlock; -+ -+ ret = bch2_buffered_write(iocb, from); -+ if (likely(ret > 0)) -+ iocb->ki_pos += ret; -+unlock: -+ inode_unlock(&inode->v); -+ -+ if (ret > 0) -+ ret = generic_write_sync(iocb, ret); -+out: -+ return bch2_err_class(ret); -+} -+ -+void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) -+{ -+ bioset_exit(&c->writepage_bioset); -+} -+ -+int bch2_fs_fs_io_buffered_init(struct bch_fs *c) -+{ -+ if (bioset_init(&c->writepage_bioset, -+ 4, offsetof(struct bch_writepage_io, op.wbio.bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_writepage_bioset_init; -+ -+ return 0; -+} -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs-io-buffered.h b/fs/bcachefs/fs-io-buffered.h -new file mode 100644 -index 000000000..a6126ff79 ---- /dev/null -+++ b/fs/bcachefs/fs-io-buffered.h -@@ -0,0 +1,27 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_IO_BUFFERED_H -+#define _BCACHEFS_FS_IO_BUFFERED_H -+ -+#ifndef NO_BCACHEFS_FS -+ -+int bch2_read_single_folio(struct folio *, struct address_space *); -+int bch2_read_folio(struct file *, struct folio *); -+ -+int bch2_writepages(struct address_space *, struct writeback_control *); -+void bch2_readahead(struct readahead_control *); -+ -+int bch2_write_begin(struct file *, struct address_space *, loff_t, -+ unsigned, struct page **, void **); -+int bch2_write_end(struct file *, struct address_space *, loff_t, -+ unsigned, unsigned, struct page *, void *); -+ -+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); -+ -+void bch2_fs_fs_io_buffered_exit(struct bch_fs *); -+int bch2_fs_fs_io_buffered_init(struct bch_fs *); -+#else -+static inline void bch2_fs_fs_io_buffered_exit(struct bch_fs *c) {} -+static inline int bch2_fs_fs_io_buffered_init(struct bch_fs *c) { return 0; } -+#endif -+ -+#endif /* _BCACHEFS_FS_IO_BUFFERED_H */ -diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c -new file mode 100644 -index 000000000..2b29abd24 ---- /dev/null -+++ 
b/fs/bcachefs/fs-io-direct.c -@@ -0,0 +1,679 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "fs.h" -+#include "fs-io.h" -+#include "fs-io-direct.h" -+#include "fs-io-pagecache.h" -+#include "io.h" -+ -+#include -+#include -+#include -+ -+/* O_DIRECT reads */ -+ -+struct dio_read { -+ struct closure cl; -+ struct kiocb *req; -+ long ret; -+ bool should_dirty; -+ struct bch_read_bio rbio; -+}; -+ -+static void bio_check_or_release(struct bio *bio, bool check_dirty) -+{ -+ if (check_dirty) { -+ bio_check_pages_dirty(bio); -+ } else { -+ bio_release_pages(bio, false); -+ bio_put(bio); -+ } -+} -+ -+static void bch2_dio_read_complete(struct closure *cl) -+{ -+ struct dio_read *dio = container_of(cl, struct dio_read, cl); -+ -+ dio->req->ki_complete(dio->req, dio->ret); -+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty); -+} -+ -+static void bch2_direct_IO_read_endio(struct bio *bio) -+{ -+ struct dio_read *dio = bio->bi_private; -+ -+ if (bio->bi_status) -+ dio->ret = blk_status_to_errno(bio->bi_status); -+ -+ closure_put(&dio->cl); -+} -+ -+static void bch2_direct_IO_read_split_endio(struct bio *bio) -+{ -+ struct dio_read *dio = bio->bi_private; -+ bool should_dirty = dio->should_dirty; -+ -+ bch2_direct_IO_read_endio(bio); -+ bio_check_or_release(bio, should_dirty); -+} -+ -+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) -+{ -+ struct file *file = req->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_io_opts opts; -+ struct dio_read *dio; -+ struct bio *bio; -+ loff_t offset = req->ki_pos; -+ bool sync = is_sync_kiocb(req); -+ size_t shorten; -+ ssize_t ret; -+ -+ bch2_inode_opts_get(&opts, c, &inode->ei_inode); -+ -+ if ((offset|iter->count) & (block_bytes(c) - 1)) -+ return -EINVAL; -+ -+ ret = min_t(loff_t, iter->count, -+ max_t(loff_t, 0, i_size_read(&inode->v) - offset)); -+ -+ if (!ret) -+ return ret; -+ -+ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); -+ iter->count -= shorten; -+ -+ bio = bio_alloc_bioset(NULL, -+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), -+ REQ_OP_READ, -+ GFP_KERNEL, -+ &c->dio_read_bioset); -+ -+ bio->bi_end_io = bch2_direct_IO_read_endio; -+ -+ dio = container_of(bio, struct dio_read, rbio.bio); -+ closure_init(&dio->cl, NULL); -+ -+ /* -+ * this is a _really_ horrible hack just to avoid an atomic sub at the -+ * end: -+ */ -+ if (!sync) { -+ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); -+ atomic_set(&dio->cl.remaining, -+ CLOSURE_REMAINING_INITIALIZER - -+ CLOSURE_RUNNING + -+ CLOSURE_DESTRUCTOR); -+ } else { -+ atomic_set(&dio->cl.remaining, -+ CLOSURE_REMAINING_INITIALIZER + 1); -+ } -+ -+ dio->req = req; -+ dio->ret = ret; -+ /* -+ * This is one of the sketchier things I've encountered: we have to skip -+ * the dirtying of requests that are internal from the kernel (i.e. from -+ * loopback), because we'll deadlock on page_lock. 
-+ */ -+ dio->should_dirty = iter_is_iovec(iter); -+ -+ goto start; -+ while (iter->count) { -+ bio = bio_alloc_bioset(NULL, -+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), -+ REQ_OP_READ, -+ GFP_KERNEL, -+ &c->bio_read); -+ bio->bi_end_io = bch2_direct_IO_read_split_endio; -+start: -+ bio->bi_opf = REQ_OP_READ|REQ_SYNC; -+ bio->bi_iter.bi_sector = offset >> 9; -+ bio->bi_private = dio; -+ -+ ret = bio_iov_iter_get_pages(bio, iter); -+ if (ret < 0) { -+ /* XXX: fault inject this path */ -+ bio->bi_status = BLK_STS_RESOURCE; -+ bio_endio(bio); -+ break; -+ } -+ -+ offset += bio->bi_iter.bi_size; -+ -+ if (dio->should_dirty) -+ bio_set_pages_dirty(bio); -+ -+ if (iter->count) -+ closure_get(&dio->cl); -+ -+ bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); -+ } -+ -+ iter->count += shorten; -+ -+ if (sync) { -+ closure_sync(&dio->cl); -+ closure_debug_destroy(&dio->cl); -+ ret = dio->ret; -+ bio_check_or_release(&dio->rbio.bio, dio->should_dirty); -+ return ret; -+ } else { -+ return -EIOCBQUEUED; -+ } -+} -+ -+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) -+{ -+ struct file *file = iocb->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct address_space *mapping = file->f_mapping; -+ size_t count = iov_iter_count(iter); -+ ssize_t ret; -+ -+ if (!count) -+ return 0; /* skip atime */ -+ -+ if (iocb->ki_flags & IOCB_DIRECT) { -+ struct blk_plug plug; -+ -+ if (unlikely(mapping->nrpages)) { -+ ret = filemap_write_and_wait_range(mapping, -+ iocb->ki_pos, -+ iocb->ki_pos + count - 1); -+ if (ret < 0) -+ goto out; -+ } -+ -+ file_accessed(file); -+ -+ blk_start_plug(&plug); -+ ret = bch2_direct_IO_read(iocb, iter); -+ blk_finish_plug(&plug); -+ -+ if (ret >= 0) -+ iocb->ki_pos += ret; -+ } else { -+ bch2_pagecache_add_get(inode); -+ ret = generic_file_read_iter(iocb, iter); -+ bch2_pagecache_add_put(inode); -+ } -+out: -+ return bch2_err_class(ret); -+} -+ -+/* O_DIRECT writes */ -+ -+struct dio_write { -+ struct kiocb *req; -+ struct address_space *mapping; -+ struct bch_inode_info *inode; -+ struct mm_struct *mm; -+ unsigned loop:1, -+ extending:1, -+ sync:1, -+ flush:1, -+ free_iov:1; -+ struct quota_res quota_res; -+ u64 written; -+ -+ struct iov_iter iter; -+ struct iovec inline_vecs[2]; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ -+static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, -+ u64 offset, u64 size, -+ unsigned nr_replicas, bool compressed) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u64 end = offset + size; -+ u32 snapshot; -+ bool ret = true; -+ int err; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); -+ if (err) -+ goto err; -+ -+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, -+ SPOS(inum.inum, offset, snapshot), -+ BTREE_ITER_SLOTS, k, err) { -+ if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) -+ break; -+ -+ if (k.k->p.snapshot != snapshot || -+ nr_replicas > bch2_bkey_replicas(c, k) || -+ (!compressed && bch2_bkey_sectors_compressed(k))) { -+ ret = false; -+ break; -+ } -+ } -+ -+ offset = iter.pos.offset; -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(err, BCH_ERR_transaction_restart)) -+ goto retry; -+ bch2_trans_exit(&trans); -+ -+ return err ? 
false : ret; -+} -+ -+static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) -+{ -+ struct bch_fs *c = dio->op.c; -+ struct bch_inode_info *inode = dio->inode; -+ struct bio *bio = &dio->op.wbio.bio; -+ -+ return bch2_check_range_allocated(c, inode_inum(inode), -+ dio->op.pos.offset, bio_sectors(bio), -+ dio->op.opts.data_replicas, -+ dio->op.opts.compression != 0); -+} -+ -+static void bch2_dio_write_loop_async(struct bch_write_op *); -+static __always_inline long bch2_dio_write_done(struct dio_write *dio); -+ -+/* -+ * We're going to return -EIOCBQUEUED, but we haven't finished consuming the -+ * iov_iter yet, so we need to stash a copy of the iovec: it might be on the -+ * caller's stack, we're not guaranteed that it will live for the duration of -+ * the IO: -+ */ -+static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) -+{ -+ struct iovec *iov = dio->inline_vecs; -+ -+ /* -+ * iov_iter has a single embedded iovec - nothing to do: -+ */ -+ if (iter_is_ubuf(&dio->iter)) -+ return 0; -+ -+ /* -+ * We don't currently handle non-iovec iov_iters here - return an error, -+ * and we'll fall back to doing the IO synchronously: -+ */ -+ if (!iter_is_iovec(&dio->iter)) -+ return -1; -+ -+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { -+ iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), -+ GFP_KERNEL); -+ if (unlikely(!iov)) -+ return -ENOMEM; -+ -+ dio->free_iov = true; -+ } -+ -+ memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); -+ dio->iter.__iov = iov; -+ return 0; -+} -+ -+static void bch2_dio_write_flush_done(struct closure *cl) -+{ -+ struct dio_write *dio = container_of(cl, struct dio_write, op.cl); -+ struct bch_fs *c = dio->op.c; -+ -+ closure_debug_destroy(cl); -+ -+ dio->op.error = bch2_journal_error(&c->journal); -+ -+ bch2_dio_write_done(dio); -+} -+ -+static noinline void bch2_dio_write_flush(struct dio_write *dio) -+{ -+ struct bch_fs *c = dio->op.c; -+ struct bch_inode_unpacked inode; -+ int ret; -+ -+ dio->flush = 0; -+ -+ closure_init(&dio->op.cl, NULL); -+ -+ if (!dio->op.error) { -+ ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); -+ if (ret) { -+ dio->op.error = ret; -+ } else { -+ bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, -+ &dio->op.cl); -+ bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); -+ } -+ } -+ -+ if (dio->sync) { -+ closure_sync(&dio->op.cl); -+ closure_debug_destroy(&dio->op.cl); -+ } else { -+ continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); -+ } -+} -+ -+static __always_inline long bch2_dio_write_done(struct dio_write *dio) -+{ -+ struct kiocb *req = dio->req; -+ struct bch_inode_info *inode = dio->inode; -+ bool sync = dio->sync; -+ long ret; -+ -+ if (unlikely(dio->flush)) { -+ bch2_dio_write_flush(dio); -+ if (!sync) -+ return -EIOCBQUEUED; -+ } -+ -+ bch2_pagecache_block_put(inode); -+ -+ if (dio->free_iov) -+ kfree(dio->iter.__iov); -+ -+ ret = dio->op.error ?: ((long) dio->written << 9); -+ bio_put(&dio->op.wbio.bio); -+ -+ /* inode->i_dio_count is our ref on inode and thus bch_fs */ -+ inode_dio_end(&inode->v); -+ -+ if (ret < 0) -+ ret = bch2_err_class(ret); -+ -+ if (!sync) { -+ req->ki_complete(req, ret); -+ ret = -EIOCBQUEUED; -+ } -+ return ret; -+} -+ -+static __always_inline void bch2_dio_write_end(struct dio_write *dio) -+{ -+ struct bch_fs *c = dio->op.c; -+ struct kiocb *req = dio->req; -+ struct bch_inode_info *inode = dio->inode; -+ struct bio *bio = &dio->op.wbio.bio; -+ -+ req->ki_pos += (u64) dio->op.written << 9; -+ 
dio->written += dio->op.written; -+ -+ if (dio->extending) { -+ spin_lock(&inode->v.i_lock); -+ if (req->ki_pos > inode->v.i_size) -+ i_size_write(&inode->v, req->ki_pos); -+ spin_unlock(&inode->v.i_lock); -+ } -+ -+ if (dio->op.i_sectors_delta || dio->quota_res.sectors) { -+ mutex_lock(&inode->ei_quota_lock); -+ __bch2_i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); -+ __bch2_quota_reservation_put(c, inode, &dio->quota_res); -+ mutex_unlock(&inode->ei_quota_lock); -+ } -+ -+ bio_release_pages(bio, false); -+ -+ if (unlikely(dio->op.error)) -+ set_bit(EI_INODE_ERROR, &inode->ei_flags); -+} -+ -+static __always_inline long bch2_dio_write_loop(struct dio_write *dio) -+{ -+ struct bch_fs *c = dio->op.c; -+ struct kiocb *req = dio->req; -+ struct address_space *mapping = dio->mapping; -+ struct bch_inode_info *inode = dio->inode; -+ struct bch_io_opts opts; -+ struct bio *bio = &dio->op.wbio.bio; -+ unsigned unaligned, iter_count; -+ bool sync = dio->sync, dropped_locks; -+ long ret; -+ -+ bch2_inode_opts_get(&opts, c, &inode->ei_inode); -+ -+ while (1) { -+ iter_count = dio->iter.count; -+ -+ EBUG_ON(current->faults_disabled_mapping); -+ current->faults_disabled_mapping = mapping; -+ -+ ret = bio_iov_iter_get_pages(bio, &dio->iter); -+ -+ dropped_locks = fdm_dropped_locks(); -+ -+ current->faults_disabled_mapping = NULL; -+ -+ /* -+ * If the fault handler returned an error but also signalled -+ * that it dropped & retook ei_pagecache_lock, we just need to -+ * re-shoot down the page cache and retry: -+ */ -+ if (dropped_locks && ret) -+ ret = 0; -+ -+ if (unlikely(ret < 0)) -+ goto err; -+ -+ if (unlikely(dropped_locks)) { -+ ret = bch2_write_invalidate_inode_pages_range(mapping, -+ req->ki_pos, -+ req->ki_pos + iter_count - 1); -+ if (unlikely(ret)) -+ goto err; -+ -+ if (!bio->bi_iter.bi_size) -+ continue; -+ } -+ -+ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); -+ bio->bi_iter.bi_size -= unaligned; -+ iov_iter_revert(&dio->iter, unaligned); -+ -+ if (!bio->bi_iter.bi_size) { -+ /* -+ * bio_iov_iter_get_pages was only able to get < -+ * blocksize worth of pages: -+ */ -+ ret = -EFAULT; -+ goto err; -+ } -+ -+ bch2_write_op_init(&dio->op, c, opts); -+ dio->op.end_io = sync -+ ? 
NULL -+ : bch2_dio_write_loop_async; -+ dio->op.target = dio->op.opts.foreground_target; -+ dio->op.write_point = writepoint_hashed((unsigned long) current); -+ dio->op.nr_replicas = dio->op.opts.data_replicas; -+ dio->op.subvol = inode->ei_subvol; -+ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); -+ dio->op.devs_need_flush = &inode->ei_devs_need_flush; -+ -+ if (sync) -+ dio->op.flags |= BCH_WRITE_SYNC; -+ dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; -+ -+ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, -+ bio_sectors(bio), true); -+ if (unlikely(ret)) -+ goto err; -+ -+ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), -+ dio->op.opts.data_replicas, 0); -+ if (unlikely(ret) && -+ !bch2_dio_write_check_allocated(dio)) -+ goto err; -+ -+ task_io_account_write(bio->bi_iter.bi_size); -+ -+ if (unlikely(dio->iter.count) && -+ !dio->sync && -+ !dio->loop && -+ bch2_dio_write_copy_iov(dio)) -+ dio->sync = sync = true; -+ -+ dio->loop = true; -+ closure_call(&dio->op.cl, bch2_write, NULL, NULL); -+ -+ if (!sync) -+ return -EIOCBQUEUED; -+ -+ bch2_dio_write_end(dio); -+ -+ if (likely(!dio->iter.count) || dio->op.error) -+ break; -+ -+ bio_reset(bio, NULL, REQ_OP_WRITE); -+ } -+out: -+ return bch2_dio_write_done(dio); -+err: -+ dio->op.error = ret; -+ -+ bio_release_pages(bio, false); -+ -+ bch2_quota_reservation_put(c, inode, &dio->quota_res); -+ goto out; -+} -+ -+static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) -+{ -+ struct mm_struct *mm = dio->mm; -+ -+ bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); -+ -+ if (mm) -+ kthread_use_mm(mm); -+ bch2_dio_write_loop(dio); -+ if (mm) -+ kthread_unuse_mm(mm); -+} -+ -+static void bch2_dio_write_loop_async(struct bch_write_op *op) -+{ -+ struct dio_write *dio = container_of(op, struct dio_write, op); -+ -+ bch2_dio_write_end(dio); -+ -+ if (likely(!dio->iter.count) || dio->op.error) -+ bch2_dio_write_done(dio); -+ else -+ bch2_dio_write_continue(dio); -+} -+ -+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) -+{ -+ struct file *file = req->ki_filp; -+ struct address_space *mapping = file->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct dio_write *dio; -+ struct bio *bio; -+ bool locked = true, extending; -+ ssize_t ret; -+ -+ prefetch(&c->opts); -+ prefetch((void *) &c->opts + 64); -+ prefetch(&inode->ei_inode); -+ prefetch((void *) &inode->ei_inode + 64); -+ -+ inode_lock(&inode->v); -+ -+ ret = generic_write_checks(req, iter); -+ if (unlikely(ret <= 0)) -+ goto err; -+ -+ ret = file_remove_privs(file); -+ if (unlikely(ret)) -+ goto err; -+ -+ ret = file_update_time(file); -+ if (unlikely(ret)) -+ goto err; -+ -+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) -+ goto err; -+ -+ inode_dio_begin(&inode->v); -+ bch2_pagecache_block_get(inode); -+ -+ extending = req->ki_pos + iter->count > inode->v.i_size; -+ if (!extending) { -+ inode_unlock(&inode->v); -+ locked = false; -+ } -+ -+ bio = bio_alloc_bioset(NULL, -+ bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), -+ REQ_OP_WRITE, -+ GFP_KERNEL, -+ &c->dio_write_bioset); -+ dio = container_of(bio, struct dio_write, op.wbio.bio); -+ dio->req = req; -+ dio->mapping = mapping; -+ dio->inode = inode; -+ dio->mm = current->mm; -+ dio->loop = false; -+ dio->extending = extending; -+ dio->sync = is_sync_kiocb(req) || extending; -+ dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; -+ dio->free_iov = false; -+ 
dio->quota_res.sectors = 0; -+ dio->written = 0; -+ dio->iter = *iter; -+ dio->op.c = c; -+ -+ if (unlikely(mapping->nrpages)) { -+ ret = bch2_write_invalidate_inode_pages_range(mapping, -+ req->ki_pos, -+ req->ki_pos + iter->count - 1); -+ if (unlikely(ret)) -+ goto err_put_bio; -+ } -+ -+ ret = bch2_dio_write_loop(dio); -+err: -+ if (locked) -+ inode_unlock(&inode->v); -+ return ret; -+err_put_bio: -+ bch2_pagecache_block_put(inode); -+ bio_put(bio); -+ inode_dio_end(&inode->v); -+ goto err; -+} -+ -+void bch2_fs_fs_io_direct_exit(struct bch_fs *c) -+{ -+ bioset_exit(&c->dio_write_bioset); -+ bioset_exit(&c->dio_read_bioset); -+} -+ -+int bch2_fs_fs_io_direct_init(struct bch_fs *c) -+{ -+ if (bioset_init(&c->dio_read_bioset, -+ 4, offsetof(struct dio_read, rbio.bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_dio_read_bioset_init; -+ -+ if (bioset_init(&c->dio_write_bioset, -+ 4, offsetof(struct dio_write, op.wbio.bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_dio_write_bioset_init; -+ -+ return 0; -+} -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs-io-direct.h b/fs/bcachefs/fs-io-direct.h -new file mode 100644 -index 000000000..814621ec7 ---- /dev/null -+++ b/fs/bcachefs/fs-io-direct.h -@@ -0,0 +1,16 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_IO_DIRECT_H -+#define _BCACHEFS_FS_IO_DIRECT_H -+ -+#ifndef NO_BCACHEFS_FS -+ssize_t bch2_direct_write(struct kiocb *, struct iov_iter *); -+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); -+ -+void bch2_fs_fs_io_direct_exit(struct bch_fs *); -+int bch2_fs_fs_io_direct_init(struct bch_fs *); -+#else -+static inline void bch2_fs_fs_io_direct_exit(struct bch_fs *c) {} -+static inline int bch2_fs_fs_io_direct_init(struct bch_fs *c) { return 0; } -+#endif -+ -+#endif /* _BCACHEFS_FS_IO_DIRECT_H */ -diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c -new file mode 100644 -index 000000000..1e60eead2 ---- /dev/null -+++ b/fs/bcachefs/fs-io-pagecache.c -@@ -0,0 +1,788 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "btree_iter.h" -+#include "extents.h" -+#include "fs-io.h" -+#include "fs-io-pagecache.h" -+#include "subvolume.h" -+ -+#include -+#include -+ -+int bch2_filemap_get_contig_folios_d(struct address_space *mapping, -+ loff_t start, u64 end, -+ int fgp_flags, gfp_t gfp, -+ folios *folios) -+{ -+ struct folio *f; -+ u64 pos = start; -+ int ret = 0; -+ -+ while (pos < end) { -+ if ((u64) pos >= (u64) start + (1ULL << 20)) -+ fgp_flags &= ~FGP_CREAT; -+ -+ ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); -+ if (ret) -+ break; -+ -+ f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); -+ if (IS_ERR_OR_NULL(f)) -+ break; -+ -+ BUG_ON(folios->nr && folio_pos(f) != pos); -+ -+ pos = folio_end_pos(f); -+ darray_push(folios, f); -+ } -+ -+ if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) -+ ret = -ENOMEM; -+ -+ return folios->nr ? 
0 : ret; -+} -+ -+/* pagecache_block must be held */ -+int bch2_write_invalidate_inode_pages_range(struct address_space *mapping, -+ loff_t start, loff_t end) -+{ -+ int ret; -+ -+ /* -+ * XXX: the way this is currently implemented, we can spin if a process -+ * is continually redirtying a specific page -+ */ -+ do { -+ if (!mapping->nrpages) -+ return 0; -+ -+ ret = filemap_write_and_wait_range(mapping, start, end); -+ if (ret) -+ break; -+ -+ if (!mapping->nrpages) -+ return 0; -+ -+ ret = invalidate_inode_pages2_range(mapping, -+ start >> PAGE_SHIFT, -+ end >> PAGE_SHIFT); -+ } while (ret == -EBUSY); -+ -+ return ret; -+} -+ -+static const char * const bch2_folio_sector_states[] = { -+#define x(n) #n, -+ BCH_FOLIO_SECTOR_STATE() -+#undef x -+ NULL -+}; -+ -+static inline enum bch_folio_sector_state -+folio_sector_dirty(enum bch_folio_sector_state state) -+{ -+ switch (state) { -+ case SECTOR_unallocated: -+ return SECTOR_dirty; -+ case SECTOR_reserved: -+ return SECTOR_dirty_reserved; -+ default: -+ return state; -+ } -+} -+ -+static inline enum bch_folio_sector_state -+folio_sector_undirty(enum bch_folio_sector_state state) -+{ -+ switch (state) { -+ case SECTOR_dirty: -+ return SECTOR_unallocated; -+ case SECTOR_dirty_reserved: -+ return SECTOR_reserved; -+ default: -+ return state; -+ } -+} -+ -+static inline enum bch_folio_sector_state -+folio_sector_reserve(enum bch_folio_sector_state state) -+{ -+ switch (state) { -+ case SECTOR_unallocated: -+ return SECTOR_reserved; -+ case SECTOR_dirty: -+ return SECTOR_dirty_reserved; -+ default: -+ return state; -+ } -+} -+ -+/* for newly allocated folios: */ -+struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) -+{ -+ struct bch_folio *s; -+ -+ s = kzalloc(sizeof(*s) + -+ sizeof(struct bch_folio_sector) * -+ folio_sectors(folio), gfp); -+ if (!s) -+ return NULL; -+ -+ spin_lock_init(&s->lock); -+ folio_attach_private(folio, s); -+ return s; -+} -+ -+struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) -+{ -+ return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); -+} -+ -+static unsigned bkey_to_sector_state(struct bkey_s_c k) -+{ -+ if (bkey_extent_is_reservation(k)) -+ return SECTOR_reserved; -+ if (bkey_extent_is_allocation(k.k)) -+ return SECTOR_allocated; -+ return SECTOR_unallocated; -+} -+ -+static void __bch2_folio_set(struct folio *folio, -+ unsigned pg_offset, unsigned pg_len, -+ unsigned nr_ptrs, unsigned state) -+{ -+ struct bch_folio *s = bch2_folio(folio); -+ unsigned i, sectors = folio_sectors(folio); -+ -+ BUG_ON(pg_offset >= sectors); -+ BUG_ON(pg_offset + pg_len > sectors); -+ -+ spin_lock(&s->lock); -+ -+ for (i = pg_offset; i < pg_offset + pg_len; i++) { -+ s->s[i].nr_replicas = nr_ptrs; -+ bch2_folio_sector_set(folio, s, i, state); -+ } -+ -+ if (i == sectors) -+ s->uptodate = true; -+ -+ spin_unlock(&s->lock); -+} -+ -+/* -+ * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the -+ * extents btree: -+ */ -+int bch2_folio_set(struct bch_fs *c, subvol_inum inum, -+ struct folio **folios, unsigned nr_folios) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_folio *s; -+ u64 offset = folio_sector(folios[0]); -+ unsigned folio_idx; -+ u32 snapshot; -+ bool need_set = false; -+ int ret; -+ -+ for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { -+ s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); -+ if (!s) -+ return -ENOMEM; -+ -+ need_set |= !s->uptodate; -+ } -+ -+ if (!need_set) -+ return 0; -+ -+ folio_idx = 
0; -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, -+ SPOS(inum.inum, offset, snapshot), -+ BTREE_ITER_SLOTS, k, ret) { -+ unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); -+ unsigned state = bkey_to_sector_state(k); -+ -+ while (folio_idx < nr_folios) { -+ struct folio *folio = folios[folio_idx]; -+ u64 folio_start = folio_sector(folio); -+ u64 folio_end = folio_end_sector(folio); -+ unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - -+ folio_start; -+ unsigned folio_len = min(k.k->p.offset, folio_end) - -+ folio_offset - folio_start; -+ -+ BUG_ON(k.k->p.offset < folio_start); -+ BUG_ON(bkey_start_offset(k.k) > folio_end); -+ -+ if (!bch2_folio(folio)->uptodate) -+ __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); -+ -+ if (k.k->p.offset < folio_end) -+ break; -+ folio_idx++; -+ } -+ -+ if (folio_idx == nr_folios) -+ break; -+ } -+ -+ offset = iter.pos.offset; -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) -+{ -+ struct bvec_iter iter; -+ struct folio_vec fv; -+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v -+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); -+ unsigned state = bkey_to_sector_state(k); -+ -+ bio_for_each_folio(fv, bio, iter) -+ __bch2_folio_set(fv.fv_folio, -+ fv.fv_offset >> 9, -+ fv.fv_len >> 9, -+ nr_ptrs, state); -+} -+ -+void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode, -+ u64 start, u64 end) -+{ -+ pgoff_t index = start >> PAGE_SECTORS_SHIFT; -+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; -+ struct folio_batch fbatch; -+ unsigned i, j; -+ -+ if (end <= start) -+ return; -+ -+ folio_batch_init(&fbatch); -+ -+ while (filemap_get_folios(inode->v.i_mapping, -+ &index, end_index, &fbatch)) { -+ for (i = 0; i < folio_batch_count(&fbatch); i++) { -+ struct folio *folio = fbatch.folios[i]; -+ u64 folio_start = folio_sector(folio); -+ u64 folio_end = folio_end_sector(folio); -+ unsigned folio_offset = max(start, folio_start) - folio_start; -+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; -+ struct bch_folio *s; -+ -+ BUG_ON(end <= folio_start); -+ -+ folio_lock(folio); -+ s = bch2_folio(folio); -+ -+ if (s) { -+ spin_lock(&s->lock); -+ for (j = folio_offset; j < folio_offset + folio_len; j++) -+ s->s[j].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ -+ folio_unlock(folio); -+ } -+ folio_batch_release(&fbatch); -+ cond_resched(); -+ } -+} -+ -+void bch2_mark_pagecache_reserved(struct bch_inode_info *inode, -+ u64 start, u64 end) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ pgoff_t index = start >> PAGE_SECTORS_SHIFT; -+ pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; -+ struct folio_batch fbatch; -+ s64 i_sectors_delta = 0; -+ unsigned i, j; -+ -+ if (end <= start) -+ return; -+ -+ folio_batch_init(&fbatch); -+ -+ while (filemap_get_folios(inode->v.i_mapping, -+ &index, end_index, &fbatch)) { -+ for (i = 0; i < folio_batch_count(&fbatch); i++) { -+ struct folio *folio = fbatch.folios[i]; -+ u64 folio_start = folio_sector(folio); -+ u64 folio_end = folio_end_sector(folio); -+ unsigned folio_offset = max(start, folio_start) - folio_start; -+ unsigned folio_len = min(end, folio_end) - 
folio_offset - folio_start; -+ struct bch_folio *s; -+ -+ BUG_ON(end <= folio_start); -+ -+ folio_lock(folio); -+ s = bch2_folio(folio); -+ -+ if (s) { -+ spin_lock(&s->lock); -+ for (j = folio_offset; j < folio_offset + folio_len; j++) { -+ i_sectors_delta -= s->s[j].state == SECTOR_dirty; -+ bch2_folio_sector_set(folio, s, j, -+ folio_sector_reserve(s->s[j].state)); -+ } -+ spin_unlock(&s->lock); -+ } -+ -+ folio_unlock(folio); -+ } -+ folio_batch_release(&fbatch); -+ cond_resched(); -+ } -+ -+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); -+} -+ -+static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, -+ unsigned nr_replicas) -+{ -+ return max(0, (int) nr_replicas - -+ s->nr_replicas - -+ s->replicas_reserved); -+} -+ -+int bch2_get_folio_disk_reservation(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct folio *folio, bool check_enospc) -+{ -+ struct bch_folio *s = bch2_folio_create(folio, 0); -+ unsigned nr_replicas = inode_nr_replicas(c, inode); -+ struct disk_reservation disk_res = { 0 }; -+ unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ for (i = 0; i < sectors; i++) -+ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ if (!disk_res_sectors) -+ return 0; -+ -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ disk_res_sectors, 1, -+ !check_enospc -+ ? BCH_DISK_RESERVATION_NOFAIL -+ : 0); -+ if (unlikely(ret)) -+ return ret; -+ -+ for (i = 0; i < sectors; i++) -+ s->s[i].replicas_reserved += -+ sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ return 0; -+} -+ -+void bch2_folio_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_folio_reservation *res) -+{ -+ bch2_disk_reservation_put(c, &res->disk); -+ bch2_quota_reservation_put(c, inode, &res->quota); -+} -+ -+int bch2_folio_reservation_get(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct folio *folio, -+ struct bch2_folio_reservation *res, -+ unsigned offset, unsigned len) -+{ -+ struct bch_folio *s = bch2_folio_create(folio, 0); -+ unsigned i, disk_sectors = 0, quota_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ BUG_ON(!s->uptodate); -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ disk_sectors += sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ quota_sectors += s->s[i].state == SECTOR_unallocated; -+ } -+ -+ if (disk_sectors) { -+ ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); -+ if (unlikely(ret)) -+ return ret; -+ } -+ -+ if (quota_sectors) { -+ ret = bch2_quota_reservation_add(c, inode, &res->quota, -+ quota_sectors, true); -+ if (unlikely(ret)) { -+ struct disk_reservation tmp = { -+ .sectors = disk_sectors -+ }; -+ -+ bch2_disk_reservation_put(c, &tmp); -+ res->disk.sectors -= disk_sectors; -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+ -+static void bch2_clear_folio_bits(struct folio *folio) -+{ -+ struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_folio *s = bch2_folio(folio); -+ struct disk_reservation disk_res = { 0 }; -+ int i, sectors = folio_sectors(folio), dirty_sectors = 0; -+ -+ if (!s) -+ return; -+ -+ EBUG_ON(!folio_test_locked(folio)); -+ EBUG_ON(folio_test_writeback(folio)); -+ -+ for (i = 0; i < sectors; i++) { -+ disk_res.sectors += s->s[i].replicas_reserved; -+ s->s[i].replicas_reserved = 0; -+ -+ dirty_sectors -= s->s[i].state == SECTOR_dirty; -+ 
bch2_folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); -+ } -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ bch2_i_sectors_acct(c, inode, NULL, dirty_sectors); -+ -+ bch2_folio_release(folio); -+} -+ -+void bch2_set_folio_dirty(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct folio *folio, -+ struct bch2_folio_reservation *res, -+ unsigned offset, unsigned len) -+{ -+ struct bch_folio *s = bch2_folio(folio); -+ unsigned i, dirty_sectors = 0; -+ -+ WARN_ON((u64) folio_pos(folio) + offset + len > -+ round_up((u64) i_size_read(&inode->v), block_bytes(c))); -+ -+ BUG_ON(!s->uptodate); -+ -+ spin_lock(&s->lock); -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ unsigned sectors = sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ -+ /* -+ * This can happen if we race with the error path in -+ * bch2_writepage_io_done(): -+ */ -+ sectors = min_t(unsigned, sectors, res->disk.sectors); -+ -+ s->s[i].replicas_reserved += sectors; -+ res->disk.sectors -= sectors; -+ -+ dirty_sectors += s->s[i].state == SECTOR_unallocated; -+ -+ bch2_folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); -+ } -+ -+ spin_unlock(&s->lock); -+ -+ bch2_i_sectors_acct(c, inode, &res->quota, dirty_sectors); -+ -+ if (!folio_test_dirty(folio)) -+ filemap_dirty_folio(inode->v.i_mapping, folio); -+} -+ -+vm_fault_t bch2_page_fault(struct vm_fault *vmf) -+{ -+ struct file *file = vmf->vma->vm_file; -+ struct address_space *mapping = file->f_mapping; -+ struct address_space *fdm = faults_disabled_mapping(); -+ struct bch_inode_info *inode = file_bch_inode(file); -+ vm_fault_t ret; -+ -+ if (fdm == mapping) -+ return VM_FAULT_SIGBUS; -+ -+ /* Lock ordering: */ -+ if (fdm > mapping) { -+ struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); -+ -+ if (bch2_pagecache_add_tryget(inode)) -+ goto got_lock; -+ -+ bch2_pagecache_block_put(fdm_host); -+ -+ bch2_pagecache_add_get(inode); -+ bch2_pagecache_add_put(inode); -+ -+ bch2_pagecache_block_get(fdm_host); -+ -+ /* Signal that lock has been dropped: */ -+ set_fdm_dropped_locks(); -+ return VM_FAULT_SIGBUS; -+ } -+ -+ bch2_pagecache_add_get(inode); -+got_lock: -+ ret = filemap_fault(vmf); -+ bch2_pagecache_add_put(inode); -+ -+ return ret; -+} -+ -+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -+{ -+ struct folio *folio = page_folio(vmf->page); -+ struct file *file = vmf->vma->vm_file; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct address_space *mapping = file->f_mapping; -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_folio_reservation res; -+ unsigned len; -+ loff_t isize; -+ vm_fault_t ret; -+ -+ bch2_folio_reservation_init(c, inode, &res); -+ -+ sb_start_pagefault(inode->v.i_sb); -+ file_update_time(file); -+ -+ /* -+ * Not strictly necessary, but helps avoid dio writes livelocking in -+ * bch2_write_invalidate_inode_pages_range() - can drop this if/when we get -+ * a bch2_write_invalidate_inode_pages_range() that works without dropping -+ * page lock before invalidating page -+ */ -+ bch2_pagecache_add_get(inode); -+ -+ folio_lock(folio); -+ isize = i_size_read(&inode->v); -+ -+ if (folio->mapping != mapping || folio_pos(folio) >= isize) { -+ folio_unlock(folio); -+ ret = VM_FAULT_NOPAGE; -+ goto out; -+ } -+ -+ len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); -+ -+ if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: -+ bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { -+ 
folio_unlock(folio); -+ ret = VM_FAULT_SIGBUS; -+ goto out; -+ } -+ -+ bch2_set_folio_dirty(c, inode, folio, &res, 0, len); -+ bch2_folio_reservation_put(c, inode, &res); -+ -+ folio_wait_stable(folio); -+ ret = VM_FAULT_LOCKED; -+out: -+ bch2_pagecache_add_put(inode); -+ sb_end_pagefault(inode->v.i_sb); -+ -+ return ret; -+} -+ -+void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) -+{ -+ if (offset || length < folio_size(folio)) -+ return; -+ -+ bch2_clear_folio_bits(folio); -+} -+ -+bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) -+{ -+ if (folio_test_dirty(folio) || folio_test_writeback(folio)) -+ return false; -+ -+ bch2_clear_folio_bits(folio); -+ return true; -+} -+ -+/* fseek: */ -+ -+static int folio_data_offset(struct folio *folio, loff_t pos, -+ unsigned min_replicas) -+{ -+ struct bch_folio *s = bch2_folio(folio); -+ unsigned i, sectors = folio_sectors(folio); -+ -+ if (s) -+ for (i = folio_pos_to_s(folio, pos); i < sectors; i++) -+ if (s->s[i].state >= SECTOR_dirty && -+ s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) -+ return i << SECTOR_SHIFT; -+ -+ return -1; -+} -+ -+loff_t bch2_seek_pagecache_data(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset, -+ unsigned min_replicas, -+ bool nonblock) -+{ -+ struct folio_batch fbatch; -+ pgoff_t start_index = start_offset >> PAGE_SHIFT; -+ pgoff_t end_index = end_offset >> PAGE_SHIFT; -+ pgoff_t index = start_index; -+ unsigned i; -+ loff_t ret; -+ int offset; -+ -+ folio_batch_init(&fbatch); -+ -+ while (filemap_get_folios(vinode->i_mapping, -+ &index, end_index, &fbatch)) { -+ for (i = 0; i < folio_batch_count(&fbatch); i++) { -+ struct folio *folio = fbatch.folios[i]; -+ -+ if (!nonblock) { -+ folio_lock(folio); -+ } else if (!folio_trylock(folio)) { -+ folio_batch_release(&fbatch); -+ return -EAGAIN; -+ } -+ -+ offset = folio_data_offset(folio, -+ max(folio_pos(folio), start_offset), -+ min_replicas); -+ if (offset >= 0) { -+ ret = clamp(folio_pos(folio) + offset, -+ start_offset, end_offset); -+ folio_unlock(folio); -+ folio_batch_release(&fbatch); -+ return ret; -+ } -+ folio_unlock(folio); -+ } -+ folio_batch_release(&fbatch); -+ cond_resched(); -+ } -+ -+ return end_offset; -+} -+ -+/* -+ * Search for a hole in a folio. -+ * -+ * The filemap layer returns -ENOENT if no folio exists, so reuse the same error -+ * code to indicate a pagecache hole exists at the returned offset. Otherwise -+ * return 0 if the folio is filled with data, or an error code. This function -+ * can return -EAGAIN if nonblock is specified. -+ */ -+static int folio_hole_offset(struct address_space *mapping, loff_t *offset, -+ unsigned min_replicas, bool nonblock) -+{ -+ struct folio *folio; -+ struct bch_folio *s; -+ unsigned i, sectors; -+ int ret = -ENOENT; -+ -+ folio = __filemap_get_folio(mapping, *offset >> PAGE_SHIFT, -+ FGP_LOCK|(nonblock ? 
FGP_NOWAIT : 0), 0); -+ if (IS_ERR(folio)) -+ return PTR_ERR(folio); -+ -+ s = bch2_folio(folio); -+ if (!s) -+ goto unlock; -+ -+ sectors = folio_sectors(folio); -+ for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) -+ if (s->s[i].state < SECTOR_dirty || -+ s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { -+ *offset = max(*offset, -+ folio_pos(folio) + (i << SECTOR_SHIFT)); -+ goto unlock; -+ } -+ -+ *offset = folio_end_pos(folio); -+ ret = 0; -+unlock: -+ folio_unlock(folio); -+ folio_put(folio); -+ return ret; -+} -+ -+loff_t bch2_seek_pagecache_hole(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset, -+ unsigned min_replicas, -+ bool nonblock) -+{ -+ struct address_space *mapping = vinode->i_mapping; -+ loff_t offset = start_offset; -+ loff_t ret = 0; -+ -+ while (!ret && offset < end_offset) -+ ret = folio_hole_offset(mapping, &offset, min_replicas, nonblock); -+ -+ if (ret && ret != -ENOENT) -+ return ret; -+ return min(offset, end_offset); -+} -+ -+int bch2_clamp_data_hole(struct inode *inode, -+ u64 *hole_start, -+ u64 *hole_end, -+ unsigned min_replicas, -+ bool nonblock) -+{ -+ loff_t ret; -+ -+ ret = bch2_seek_pagecache_hole(inode, -+ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; -+ if (ret < 0) -+ return ret; -+ -+ *hole_start = ret; -+ -+ if (*hole_start == *hole_end) -+ return 0; -+ -+ ret = bch2_seek_pagecache_data(inode, -+ *hole_start << 9, *hole_end << 9, min_replicas, nonblock) >> 9; -+ if (ret < 0) -+ return ret; -+ -+ *hole_end = ret; -+ return 0; -+} -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h -new file mode 100644 -index 000000000..a2222ad58 ---- /dev/null -+++ b/fs/bcachefs/fs-io-pagecache.h -@@ -0,0 +1,176 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_IO_PAGECACHE_H -+#define _BCACHEFS_FS_IO_PAGECACHE_H -+ -+#include -+ -+typedef DARRAY(struct folio *) folios; -+ -+int bch2_filemap_get_contig_folios_d(struct address_space *, loff_t, -+ u64, int, gfp_t, folios *); -+int bch2_write_invalidate_inode_pages_range(struct address_space *, loff_t, loff_t); -+ -+/* -+ * Use u64 for the end pos and sector helpers because if the folio covers the -+ * max supported range of the mapping, the start offset of the next folio -+ * overflows loff_t. This breaks much of the range based processing in the -+ * buffered write path. -+ */ -+static inline u64 folio_end_pos(struct folio *folio) -+{ -+ return folio_pos(folio) + folio_size(folio); -+} -+ -+static inline size_t folio_sectors(struct folio *folio) -+{ -+ return PAGE_SECTORS << folio_order(folio); -+} -+ -+static inline loff_t folio_sector(struct folio *folio) -+{ -+ return folio_pos(folio) >> 9; -+} -+ -+static inline u64 folio_end_sector(struct folio *folio) -+{ -+ return folio_end_pos(folio) >> 9; -+} -+ -+#define BCH_FOLIO_SECTOR_STATE() \ -+ x(unallocated) \ -+ x(reserved) \ -+ x(dirty) \ -+ x(dirty_reserved) \ -+ x(allocated) -+ -+enum bch_folio_sector_state { -+#define x(n) SECTOR_##n, -+ BCH_FOLIO_SECTOR_STATE() -+#undef x -+}; -+ -+struct bch_folio_sector { -+ /* Uncompressed, fully allocated replicas (or on disk reservation): */ -+ unsigned nr_replicas:4; -+ -+ /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ -+ unsigned replicas_reserved:4; -+ -+ /* i_sectors: */ -+ enum bch_folio_sector_state state:8; -+}; -+ -+struct bch_folio { -+ spinlock_t lock; -+ atomic_t write_count; -+ /* -+ * Is the sector state up to date with the btree? 
-+ * (Not the data itself) -+ */ -+ bool uptodate; -+ struct bch_folio_sector s[]; -+}; -+ -+/* Helper for when we need to add debug instrumentation: */ -+static inline void bch2_folio_sector_set(struct folio *folio, -+ struct bch_folio *s, -+ unsigned i, unsigned n) -+{ -+ s->s[i].state = n; -+} -+ -+/* file offset (to folio offset) to bch_folio_sector index */ -+static inline int folio_pos_to_s(struct folio *folio, loff_t pos) -+{ -+ u64 f_offset = pos - folio_pos(folio); -+ -+ BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); -+ return f_offset >> SECTOR_SHIFT; -+} -+ -+/* for newly allocated folios: */ -+static inline void __bch2_folio_release(struct folio *folio) -+{ -+ kfree(folio_detach_private(folio)); -+} -+ -+static inline void bch2_folio_release(struct folio *folio) -+{ -+ EBUG_ON(!folio_test_locked(folio)); -+ __bch2_folio_release(folio); -+} -+ -+static inline struct bch_folio *__bch2_folio(struct folio *folio) -+{ -+ return folio_has_private(folio) -+ ? (struct bch_folio *) folio_get_private(folio) -+ : NULL; -+} -+ -+static inline struct bch_folio *bch2_folio(struct folio *folio) -+{ -+ EBUG_ON(!folio_test_locked(folio)); -+ -+ return __bch2_folio(folio); -+} -+ -+struct bch_folio *__bch2_folio_create(struct folio *, gfp_t); -+struct bch_folio *bch2_folio_create(struct folio *, gfp_t); -+ -+struct bch2_folio_reservation { -+ struct disk_reservation disk; -+ struct quota_res quota; -+}; -+ -+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -+{ -+ /* XXX: this should not be open coded */ -+ return inode->ei_inode.bi_data_replicas -+ ? inode->ei_inode.bi_data_replicas - 1 -+ : c->opts.data_replicas; -+} -+ -+static inline void bch2_folio_reservation_init(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_folio_reservation *res) -+{ -+ memset(res, 0, sizeof(*res)); -+ -+ res->disk.nr_replicas = inode_nr_replicas(c, inode); -+} -+ -+int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned); -+void bch2_bio_page_state_set(struct bio *, struct bkey_s_c); -+ -+void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64); -+void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64); -+ -+int bch2_get_folio_disk_reservation(struct bch_fs *, -+ struct bch_inode_info *, -+ struct folio *, bool); -+ -+void bch2_folio_reservation_put(struct bch_fs *, -+ struct bch_inode_info *, -+ struct bch2_folio_reservation *); -+int bch2_folio_reservation_get(struct bch_fs *, -+ struct bch_inode_info *, -+ struct folio *, -+ struct bch2_folio_reservation *, -+ unsigned, unsigned); -+ -+void bch2_set_folio_dirty(struct bch_fs *, -+ struct bch_inode_info *, -+ struct folio *, -+ struct bch2_folio_reservation *, -+ unsigned, unsigned); -+ -+vm_fault_t bch2_page_fault(struct vm_fault *); -+vm_fault_t bch2_page_mkwrite(struct vm_fault *); -+void bch2_invalidate_folio(struct folio *, size_t, size_t); -+bool bch2_release_folio(struct folio *, gfp_t); -+ -+loff_t bch2_seek_pagecache_data(struct inode *, loff_t, loff_t, unsigned, bool); -+loff_t bch2_seek_pagecache_hole(struct inode *, loff_t, loff_t, unsigned, bool); -+int bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned, bool); -+ -+#endif /* _BCACHEFS_FS_IO_PAGECACHE_H */ -diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c -new file mode 100644 -index 000000000..ceab12fb8 ---- /dev/null -+++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,1250 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include 
"alloc_foreground.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "clock.h" -+#include "error.h" -+#include "extents.h" -+#include "extent_update.h" -+#include "fs.h" -+#include "fs-io.h" -+#include "fs-io-buffered.h" -+#include "fs-io-pagecache.h" -+#include "fsck.h" -+#include "inode.h" -+#include "journal.h" -+#include "io.h" -+#include "keylist.h" -+#include "quota.h" -+#include "reflink.h" -+#include "trace.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+struct nocow_flush { -+ struct closure *cl; -+ struct bch_dev *ca; -+ struct bio bio; -+}; -+ -+static void nocow_flush_endio(struct bio *_bio) -+{ -+ -+ struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); -+ -+ closure_put(bio->cl); -+ percpu_ref_put(&bio->ca->io_ref); -+ bio_put(&bio->bio); -+} -+ -+void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct closure *cl) -+{ -+ struct nocow_flush *bio; -+ struct bch_dev *ca; -+ struct bch_devs_mask devs; -+ unsigned dev; -+ -+ dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); -+ if (dev == BCH_SB_MEMBERS_MAX) -+ return; -+ -+ devs = inode->ei_devs_need_flush; -+ memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); -+ -+ for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { -+ rcu_read_lock(); -+ ca = rcu_dereference(c->devs[dev]); -+ if (ca && !percpu_ref_tryget(&ca->io_ref)) -+ ca = NULL; -+ rcu_read_unlock(); -+ -+ if (!ca) -+ continue; -+ -+ bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, -+ REQ_OP_FLUSH, -+ GFP_KERNEL, -+ &c->nocow_flush_bioset), -+ struct nocow_flush, bio); -+ bio->cl = cl; -+ bio->ca = ca; -+ bio->bio.bi_end_io = nocow_flush_endio; -+ closure_bio_submit(&bio->bio, cl); -+ } -+} -+ -+static int bch2_inode_flush_nocow_writes(struct bch_fs *c, -+ struct bch_inode_info *inode) -+{ -+ struct closure cl; -+ -+ closure_init_stack(&cl); -+ bch2_inode_flush_nocow_writes_async(c, inode, &cl); -+ closure_sync(&cl); -+ -+ return 0; -+} -+ -+/* i_size updates: */ -+ -+struct inode_new_size { -+ loff_t new_size; -+ u64 now; -+ unsigned fields; -+}; -+ -+static int inode_set_size(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct inode_new_size *s = p; -+ -+ bi->bi_size = s->new_size; -+ if (s->fields & ATTR_ATIME) -+ bi->bi_atime = s->now; -+ if (s->fields & ATTR_MTIME) -+ bi->bi_mtime = s->now; -+ if (s->fields & ATTR_CTIME) -+ bi->bi_ctime = s->now; -+ -+ return 0; -+} -+ -+int __must_check bch2_write_inode_size(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ loff_t new_size, unsigned fields) -+{ -+ struct inode_new_size s = { -+ .new_size = new_size, -+ .now = bch2_current_time(c), -+ .fields = fields, -+ }; -+ -+ return bch2_write_inode(c, inode, inode_set_size, &s, fields); -+} -+ -+void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, -+ struct quota_res *quota_res, s64 sectors) -+{ -+ bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, -+ "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", -+ inode->v.i_ino, (u64) inode->v.i_blocks, sectors, -+ inode->ei_inode.bi_sectors); -+ inode->v.i_blocks += sectors; -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ if (quota_res && -+ !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && -+ sectors > 0) { -+ BUG_ON(sectors > quota_res->sectors); -+ BUG_ON(sectors > inode->ei_quota_reserved); -+ -+ quota_res->sectors 
-= sectors; -+ inode->ei_quota_reserved -= sectors; -+ } else { -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); -+ } -+#endif -+} -+ -+/* fsync: */ -+ -+/* -+ * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an -+ * insert trigger: look up the btree inode instead -+ */ -+static int bch2_flush_inode(struct bch_fs *c, -+ struct bch_inode_info *inode) -+{ -+ struct bch_inode_unpacked u; -+ int ret; -+ -+ if (c->opts.journal_flush_disabled) -+ return 0; -+ -+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); -+ if (ret) -+ return ret; -+ -+ return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: -+ bch2_inode_flush_nocow_writes(c, inode); -+} -+ -+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ int ret, ret2, ret3; -+ -+ ret = file_write_and_wait_range(file, start, end); -+ ret2 = sync_inode_metadata(&inode->v, 1); -+ ret3 = bch2_flush_inode(c, inode); -+ -+ return bch2_err_class(ret ?: ret2 ?: ret3); -+} -+ -+/* truncate: */ -+ -+static inline int range_has_data(struct bch_fs *c, u32 subvol, -+ struct bpos start, -+ struct bpos end) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); -+ if (ret) -+ goto err; -+ -+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret) -+ if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { -+ ret = 1; -+ break; -+ } -+ start = iter.pos; -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int __bch2_truncate_folio(struct bch_inode_info *inode, -+ pgoff_t index, loff_t start, loff_t end) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bch_folio *s; -+ unsigned start_offset = start & (PAGE_SIZE - 1); -+ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; -+ unsigned i; -+ struct folio *folio; -+ s64 i_sectors_delta = 0; -+ int ret = 0; -+ u64 end_pos; -+ -+ folio = filemap_lock_folio(mapping, index); -+ if (IS_ERR_OR_NULL(folio)) { -+ /* -+ * XXX: we're doing two index lookups when we end up reading the -+ * folio -+ */ -+ ret = range_has_data(c, inode->ei_subvol, -+ POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT)), -+ POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); -+ if (ret <= 0) -+ return ret; -+ -+ folio = __filemap_get_folio(mapping, index, -+ FGP_LOCK|FGP_CREAT, GFP_KERNEL); -+ if (IS_ERR_OR_NULL(folio)) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ } -+ -+ BUG_ON(start >= folio_end_pos(folio)); -+ BUG_ON(end <= folio_pos(folio)); -+ -+ start_offset = max(start, folio_pos(folio)) - folio_pos(folio); -+ end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); -+ -+ /* Folio boundary? 
Nothing to do */ -+ if (start_offset == 0 && -+ end_offset == folio_size(folio)) { -+ ret = 0; -+ goto unlock; -+ } -+ -+ s = bch2_folio_create(folio, 0); -+ if (!s) { -+ ret = -ENOMEM; -+ goto unlock; -+ } -+ -+ if (!folio_test_uptodate(folio)) { -+ ret = bch2_read_single_folio(folio, mapping); -+ if (ret) -+ goto unlock; -+ } -+ -+ ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); -+ if (ret) -+ goto unlock; -+ -+ for (i = round_up(start_offset, block_bytes(c)) >> 9; -+ i < round_down(end_offset, block_bytes(c)) >> 9; -+ i++) { -+ s->s[i].nr_replicas = 0; -+ -+ i_sectors_delta -= s->s[i].state == SECTOR_dirty; -+ bch2_folio_sector_set(folio, s, i, SECTOR_unallocated); -+ } -+ -+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ -+ /* -+ * Caller needs to know whether this folio will be written out by -+ * writeback - doing an i_size update if necessary - or whether it will -+ * be responsible for the i_size update. -+ * -+ * Note that we shouldn't ever see a folio beyond EOF, but check and -+ * warn if so. This has been observed by failure to clean up folios -+ * after a short write and there's still a chance reclaim will fix -+ * things up. -+ */ -+ WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); -+ end_pos = folio_end_pos(folio); -+ if (inode->v.i_size > folio_pos(folio)) -+ end_pos = min_t(u64, inode->v.i_size, end_pos); -+ ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; -+ -+ folio_zero_segment(folio, start_offset, end_offset); -+ -+ /* -+ * Bit of a hack - we don't want truncate to fail due to -ENOSPC. -+ * -+ * XXX: because we aren't currently tracking whether the folio has actual -+ * data in it (vs. just 0s, or only partially written) this wrong. ick. -+ */ -+ BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); -+ -+ /* -+ * This removes any writeable userspace mappings; we need to force -+ * .page_mkwrite to be called again before any mmapped writes, to -+ * redirty the full page: -+ */ -+ folio_mkclean(folio); -+ filemap_dirty_folio(mapping, folio); -+unlock: -+ folio_unlock(folio); -+ folio_put(folio); -+out: -+ return ret; -+} -+ -+static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) -+{ -+ return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, -+ from, ANYSINT_MAX(loff_t)); -+} -+ -+static int bch2_truncate_folios(struct bch_inode_info *inode, -+ loff_t start, loff_t end) -+{ -+ int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, -+ start, end); -+ -+ if (ret >= 0 && -+ start >> PAGE_SHIFT != end >> PAGE_SHIFT) -+ ret = __bch2_truncate_folio(inode, -+ (end - 1) >> PAGE_SHIFT, -+ start, end); -+ return ret; -+} -+ -+static int bch2_extend(struct mnt_idmap *idmap, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *inode_u, -+ struct iattr *iattr) -+{ -+ struct address_space *mapping = inode->v.i_mapping; -+ int ret; -+ -+ /* -+ * sync appends: -+ * -+ * this has to be done _before_ extending i_size: -+ */ -+ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); -+ if (ret) -+ return ret; -+ -+ truncate_setsize(&inode->v, iattr->ia_size); -+ -+ return bch2_setattr_nonsize(idmap, inode, iattr); -+} -+ -+static int bch2_truncate_finish_fn(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; -+ return 0; -+} -+ -+static int bch2_truncate_start_fn(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, void *p) -+{ -+ u64 *new_i_size = 
p; -+ -+ bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; -+ bi->bi_size = *new_i_size; -+ return 0; -+} -+ -+int bch2_truncate(struct mnt_idmap *idmap, -+ struct bch_inode_info *inode, struct iattr *iattr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bch_inode_unpacked inode_u; -+ u64 new_i_size = iattr->ia_size; -+ s64 i_sectors_delta = 0; -+ int ret = 0; -+ -+ /* -+ * If the truncate call with change the size of the file, the -+ * cmtimes should be updated. If the size will not change, we -+ * do not need to update the cmtimes. -+ */ -+ if (iattr->ia_size != inode->v.i_size) { -+ if (!(iattr->ia_valid & ATTR_MTIME)) -+ ktime_get_coarse_real_ts64(&iattr->ia_mtime); -+ if (!(iattr->ia_valid & ATTR_CTIME)) -+ ktime_get_coarse_real_ts64(&iattr->ia_ctime); -+ iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; -+ } -+ -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(inode); -+ -+ ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); -+ if (ret) -+ goto err; -+ -+ /* -+ * check this before next assertion; on filesystem error our normal -+ * invariants are a bit broken (truncate has to truncate the page cache -+ * before the inode). -+ */ -+ ret = bch2_journal_error(&c->journal); -+ if (ret) -+ goto err; -+ -+ WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && -+ inode->v.i_size < inode_u.bi_size, -+ "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", -+ (u64) inode->v.i_size, inode_u.bi_size); -+ -+ if (iattr->ia_size > inode->v.i_size) { -+ ret = bch2_extend(idmap, inode, &inode_u, iattr); -+ goto err; -+ } -+ -+ iattr->ia_valid &= ~ATTR_SIZE; -+ -+ ret = bch2_truncate_folio(inode, iattr->ia_size); -+ if (unlikely(ret < 0)) -+ goto err; -+ -+ /* -+ * When extending, we're going to write the new i_size to disk -+ * immediately so we need to flush anything above the current on disk -+ * i_size first: -+ * -+ * Also, when extending we need to flush the page that i_size currently -+ * straddles - if it's mapped to userspace, we need to ensure that -+ * userspace has to redirty it and call .mkwrite -> set_page_dirty -+ * again to allocate the part of the page that was extended. 
-+ */ -+ if (iattr->ia_size > inode_u.bi_size) -+ ret = filemap_write_and_wait_range(mapping, -+ inode_u.bi_size, -+ iattr->ia_size - 1); -+ else if (iattr->ia_size & (PAGE_SIZE - 1)) -+ ret = filemap_write_and_wait_range(mapping, -+ round_down(iattr->ia_size, PAGE_SIZE), -+ iattr->ia_size - 1); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, -+ &new_i_size, 0); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ truncate_setsize(&inode->v, iattr->ia_size); -+ -+ ret = bch2_fpunch(c, inode_inum(inode), -+ round_up(iattr->ia_size, block_bytes(c)) >> 9, -+ U64_MAX, &i_sectors_delta); -+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ -+ bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && -+ !bch2_journal_error(&c->journal), c, -+ "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", -+ inode->v.i_ino, (u64) inode->v.i_blocks, -+ inode->ei_inode.bi_sectors); -+ if (unlikely(ret)) -+ goto err; -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ ret = bch2_setattr_nonsize(idmap, inode, iattr); -+err: -+ bch2_pagecache_block_put(inode); -+ return bch2_err_class(ret); -+} -+ -+/* fallocate: */ -+ -+static int inode_update_times_fn(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); -+ return 0; -+} -+ -+static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ u64 end = offset + len; -+ u64 block_start = round_up(offset, block_bytes(c)); -+ u64 block_end = round_down(end, block_bytes(c)); -+ bool truncated_last_page; -+ int ret = 0; -+ -+ ret = bch2_truncate_folios(inode, offset, end); -+ if (unlikely(ret < 0)) -+ goto err; -+ -+ truncated_last_page = ret; -+ -+ truncate_pagecache_range(&inode->v, offset, end - 1); -+ -+ if (block_start < block_end) { -+ s64 i_sectors_delta = 0; -+ -+ ret = bch2_fpunch(c, inode_inum(inode), -+ block_start >> 9, block_end >> 9, -+ &i_sectors_delta); -+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ if (end >= inode->v.i_size && !truncated_last_page) { -+ ret = bch2_write_inode_size(c, inode, inode->v.i_size, -+ ATTR_MTIME|ATTR_CTIME); -+ } else { -+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, -+ ATTR_MTIME|ATTR_CTIME); -+ } -+ mutex_unlock(&inode->ei_update_lock); -+err: -+ return ret; -+} -+ -+static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, -+ loff_t offset, loff_t len, -+ bool insert) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bkey_buf copy; -+ struct btree_trans trans; -+ struct btree_iter src, dst, del; -+ loff_t shift, new_size; -+ u64 src_start; -+ int ret = 0; -+ -+ if ((offset | len) & (block_bytes(c) - 1)) -+ return -EINVAL; -+ -+ if (insert) { -+ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) -+ return -EFBIG; -+ -+ if (offset >= inode->v.i_size) -+ return -EINVAL; -+ -+ src_start = U64_MAX; -+ shift = len; -+ } else { -+ if (offset + len >= inode->v.i_size) -+ return -EINVAL; -+ -+ src_start = offset + len; -+ shift = -len; -+ } -+ -+ new_size = inode->v.i_size + shift; -+ -+ ret = 
bch2_write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); -+ if (ret) -+ return ret; -+ -+ if (insert) { -+ i_size_write(&inode->v, new_size); -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, new_size, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ } else { -+ s64 i_sectors_delta = 0; -+ -+ ret = bch2_fpunch(c, inode_inum(inode), -+ offset >> 9, (offset + len) >> 9, -+ &i_sectors_delta); -+ bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ -+ if (ret) -+ return ret; -+ } -+ -+ bch2_bkey_buf_init(©); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); -+ bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, -+ POS(inode->v.i_ino, src_start >> 9), -+ BTREE_ITER_INTENT); -+ bch2_trans_copy_iter(&dst, &src); -+ bch2_trans_copy_iter(&del, &src); -+ -+ while (ret == 0 || -+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) { -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(c, 0); -+ struct bkey_i delete; -+ struct bkey_s_c k; -+ struct bpos next_pos; -+ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); -+ struct bpos atomic_end; -+ unsigned trigger_flags = 0; -+ u32 snapshot; -+ -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, -+ inode->ei_subvol, &snapshot); -+ if (ret) -+ continue; -+ -+ bch2_btree_iter_set_snapshot(&src, snapshot); -+ bch2_btree_iter_set_snapshot(&dst, snapshot); -+ bch2_btree_iter_set_snapshot(&del, snapshot); -+ -+ bch2_trans_begin(&trans); -+ -+ k = insert -+ ? bch2_btree_iter_peek_prev(&src) -+ : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX)); -+ if ((ret = bkey_err(k))) -+ continue; -+ -+ if (!k.k || k.k->p.inode != inode->v.i_ino) -+ break; -+ -+ if (insert && -+ bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9))) -+ break; -+reassemble: -+ bch2_bkey_buf_reassemble(©, c, k); -+ -+ if (insert && -+ bkey_lt(bkey_start_pos(k.k), move_pos)) -+ bch2_cut_front(move_pos, copy.k); -+ -+ copy.k->k.p.offset += shift >> 9; -+ bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); -+ -+ ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); -+ if (ret) -+ continue; -+ -+ if (!bkey_eq(atomic_end, copy.k->k.p)) { -+ if (insert) { -+ move_pos = atomic_end; -+ move_pos.offset -= shift >> 9; -+ goto reassemble; -+ } else { -+ bch2_cut_back(atomic_end, copy.k); -+ } -+ } -+ -+ bkey_init(&delete.k); -+ delete.k.p = copy.k->k.p; -+ delete.k.size = copy.k->k.size; -+ delete.k.p.offset -= shift >> 9; -+ bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); -+ -+ next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; -+ -+ if (copy.k->k.size != k.k->size) { -+ /* We might end up splitting compressed extents: */ -+ unsigned nr_ptrs = -+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); -+ -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ copy.k->k.size, nr_ptrs, -+ BCH_DISK_RESERVATION_NOFAIL); -+ BUG_ON(ret); -+ } -+ -+ ret = bch2_btree_iter_traverse(&del) ?: -+ bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: -+ bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: -+ bch2_trans_commit(&trans, &disk_res, NULL, -+ BTREE_INSERT_NOFAIL); -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ if (!ret) -+ bch2_btree_iter_set_pos(&src, next_pos); -+ } -+ bch2_trans_iter_exit(&trans, &del); -+ bch2_trans_iter_exit(&trans, &dst); -+ bch2_trans_iter_exit(&trans, &src); -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(©, c); -+ -+ if (ret) -+ return ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ if (!insert) { -+ i_size_write(&inode->v, new_size); -+ ret = bch2_write_inode_size(c, inode, new_size, -+ ATTR_MTIME|ATTR_CTIME); -+ } else { -+ /* We need an inode update to update bi_journal_seq for fsync: */ -+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, -+ ATTR_MTIME|ATTR_CTIME); -+ } -+ mutex_unlock(&inode->ei_update_lock); -+ return ret; -+} -+ -+static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, -+ u64 start_sector, u64 end_sector) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bpos end_pos = POS(inode->v.i_ino, end_sector); -+ struct bch_io_opts opts; -+ int ret = 0; -+ -+ bch2_inode_opts_get(&opts, c, &inode->ei_inode); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ POS(inode->v.i_ino, start_sector), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ while (!ret && bkey_lt(iter.pos, end_pos)) { -+ s64 i_sectors_delta = 0; -+ struct quota_res quota_res = { 0 }; -+ struct bkey_s_c k; -+ unsigned sectors; -+ bool is_allocation; -+ u64 hole_start, hole_end; -+ u32 snapshot; -+ -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, -+ inode->ei_subvol, &snapshot); -+ if (ret) -+ goto bkey_err; -+ -+ bch2_btree_iter_set_snapshot(&iter, snapshot); -+ -+ k = bch2_btree_iter_peek_slot(&iter); -+ if ((ret = bkey_err(k))) -+ goto bkey_err; -+ -+ hole_start = iter.pos.offset; -+ hole_end = bpos_min(k.k->p, end_pos).offset; -+ is_allocation = bkey_extent_is_allocation(k.k); -+ -+ /* already reserved */ -+ if (bkey_extent_is_reservation(k) && -+ bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { -+ bch2_btree_iter_advance(&iter); -+ continue; -+ } -+ -+ if (bkey_extent_is_data(k.k) && -+ !(mode & FALLOC_FL_ZERO_RANGE)) { -+ bch2_btree_iter_advance(&iter); -+ continue; -+ } -+ -+ if (!(mode & FALLOC_FL_ZERO_RANGE)) { -+ /* -+ * Lock ordering - can't be holding btree locks while -+ * blocking on a folio lock: -+ */ -+ if (bch2_clamp_data_hole(&inode->v, -+ &hole_start, -+ &hole_end, -+ opts.data_replicas, true)) -+ ret = drop_locks_do(&trans, -+ (bch2_clamp_data_hole(&inode->v, -+ &hole_start, -+ &hole_end, -+ opts.data_replicas, false), 0)); -+ bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); -+ -+ if (ret) -+ goto bkey_err; -+ -+ if (hole_start == hole_end) -+ continue; -+ } -+ -+ sectors = hole_end - hole_start; -+ -+ if (!is_allocation) { -+ ret = bch2_quota_reservation_add(c, inode, -+ "a_res, sectors, true); -+ if (unlikely(ret)) -+ goto bkey_err; 
-+ } -+ -+ ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter, -+ sectors, opts, &i_sectors_delta, -+ writepoint_hashed((unsigned long) current)); -+ if (ret) -+ goto bkey_err; -+ -+ bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); -+ -+ drop_locks_do(&trans, -+ (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); -+bkey_err: -+ bch2_quota_reservation_put(c, inode, "a_res); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ ret = 0; -+ } -+ -+ if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { -+ struct quota_res quota_res = { 0 }; -+ s64 i_sectors_delta = 0; -+ -+ bch2_fpunch_at(&trans, &iter, inode_inum(inode), -+ end_sector, &i_sectors_delta); -+ bch2_i_sectors_acct(c, inode, "a_res, i_sectors_delta); -+ bch2_quota_reservation_put(c, inode, "a_res); -+ } -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static long bchfs_fallocate(struct bch_inode_info *inode, int mode, -+ loff_t offset, loff_t len) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ u64 end = offset + len; -+ u64 block_start = round_down(offset, block_bytes(c)); -+ u64 block_end = round_up(end, block_bytes(c)); -+ bool truncated_last_page = false; -+ int ret, ret2 = 0; -+ -+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { -+ ret = inode_newsize_ok(&inode->v, end); -+ if (ret) -+ return ret; -+ } -+ -+ if (mode & FALLOC_FL_ZERO_RANGE) { -+ ret = bch2_truncate_folios(inode, offset, end); -+ if (unlikely(ret < 0)) -+ return ret; -+ -+ truncated_last_page = ret; -+ -+ truncate_pagecache_range(&inode->v, offset, end - 1); -+ -+ block_start = round_up(offset, block_bytes(c)); -+ block_end = round_down(end, block_bytes(c)); -+ } -+ -+ ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); -+ -+ /* -+ * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, -+ * so that the VFS cache i_size is consistent with the btree i_size: -+ */ -+ if (ret && -+ !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) -+ return ret; -+ -+ if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) -+ end = inode->v.i_size; -+ -+ if (end >= inode->v.i_size && -+ (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || -+ !(mode & FALLOC_FL_KEEP_SIZE))) { -+ spin_lock(&inode->v.i_lock); -+ i_size_write(&inode->v, end); -+ spin_unlock(&inode->v.i_lock); -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret2 = bch2_write_inode_size(c, inode, end, 0); -+ mutex_unlock(&inode->ei_update_lock); -+ } -+ -+ return ret ?: ret2; -+} -+ -+long bch2_fallocate_dispatch(struct file *file, int mode, -+ loff_t offset, loff_t len) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ long ret; -+ -+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) -+ return -EROFS; -+ -+ inode_lock(&inode->v); -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(inode); -+ -+ ret = file_modified(file); -+ if (ret) -+ goto err; -+ -+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) -+ ret = bchfs_fallocate(inode, mode, offset, len); -+ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) -+ ret = bchfs_fpunch(inode, offset, len); -+ else if (mode == FALLOC_FL_INSERT_RANGE) -+ ret = bchfs_fcollapse_finsert(inode, offset, len, true); -+ else if (mode == FALLOC_FL_COLLAPSE_RANGE) -+ ret = bchfs_fcollapse_finsert(inode, offset, len, false); -+ else -+ ret = -EOPNOTSUPP; -+err: -+ bch2_pagecache_block_put(inode); -+ 
inode_unlock(&inode->v); -+ bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); -+ -+ return bch2_err_class(ret); -+} -+ -+/* -+ * Take a quota reservation for unallocated blocks in a given file range -+ * Does not check pagecache -+ */ -+static int quota_reserve_range(struct bch_inode_info *inode, -+ struct quota_res *res, -+ u64 start, u64 end) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u32 snapshot; -+ u64 sectors = end - start; -+ u64 pos = start; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ SPOS(inode->v.i_ino, pos, snapshot), 0); -+ -+ while (!(ret = btree_trans_too_many_iters(&trans)) && -+ (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && -+ !(ret = bkey_err(k))) { -+ if (bkey_extent_is_allocation(k.k)) { -+ u64 s = min(end, k.k->p.offset) - -+ max(start, bkey_start_offset(k.k)); -+ BUG_ON(s > sectors); -+ sectors -= s; -+ } -+ bch2_btree_iter_advance(&iter); -+ } -+ pos = iter.pos.offset; -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ return ret; -+ -+ return bch2_quota_reservation_add(c, inode, res, sectors, true); -+} -+ -+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, -+ struct file *file_dst, loff_t pos_dst, -+ loff_t len, unsigned remap_flags) -+{ -+ struct bch_inode_info *src = file_bch_inode(file_src); -+ struct bch_inode_info *dst = file_bch_inode(file_dst); -+ struct bch_fs *c = src->v.i_sb->s_fs_info; -+ struct quota_res quota_res = { 0 }; -+ s64 i_sectors_delta = 0; -+ u64 aligned_len; -+ loff_t ret = 0; -+ -+ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) -+ return -EINVAL; -+ -+ if (remap_flags & REMAP_FILE_DEDUP) -+ return -EOPNOTSUPP; -+ -+ if ((pos_src & (block_bytes(c) - 1)) || -+ (pos_dst & (block_bytes(c) - 1))) -+ return -EINVAL; -+ -+ if (src == dst && -+ abs(pos_src - pos_dst) < len) -+ return -EINVAL; -+ -+ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); -+ -+ inode_dio_wait(&src->v); -+ inode_dio_wait(&dst->v); -+ -+ ret = generic_remap_file_range_prep(file_src, pos_src, -+ file_dst, pos_dst, -+ &len, remap_flags); -+ if (ret < 0 || len == 0) -+ goto err; -+ -+ aligned_len = round_up((u64) len, block_bytes(c)); -+ -+ ret = bch2_write_invalidate_inode_pages_range(dst->v.i_mapping, -+ pos_dst, pos_dst + len - 1); -+ if (ret) -+ goto err; -+ -+ ret = quota_reserve_range(dst, "a_res, pos_dst >> 9, -+ (pos_dst + aligned_len) >> 9); -+ if (ret) -+ goto err; -+ -+ file_update_time(file_dst); -+ -+ bch2_mark_pagecache_unallocated(src, pos_src >> 9, -+ (pos_src + aligned_len) >> 9); -+ -+ ret = bch2_remap_range(c, -+ inode_inum(dst), pos_dst >> 9, -+ inode_inum(src), pos_src >> 9, -+ aligned_len >> 9, -+ pos_dst + len, &i_sectors_delta); -+ if (ret < 0) -+ goto err; -+ -+ /* -+ * due to alignment, we might have remapped slightly more than requsted -+ */ -+ ret = min((u64) ret << 9, (u64) len); -+ -+ bch2_i_sectors_acct(c, dst, "a_res, i_sectors_delta); -+ -+ spin_lock(&dst->v.i_lock); -+ if (pos_dst + ret > dst->v.i_size) -+ i_size_write(&dst->v, pos_dst + ret); -+ spin_unlock(&dst->v.i_lock); -+ -+ if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || -+ IS_SYNC(file_inode(file_dst))) -+ 
ret = bch2_flush_inode(c, dst); -+err: -+ bch2_quota_reservation_put(c, dst, &quota_res); -+ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); -+ -+ return bch2_err_class(ret); -+} -+ -+/* fseek: */ -+ -+static loff_t bch2_seek_data(struct file *file, u64 offset) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ subvol_inum inum = inode_inum(inode); -+ u64 isize, next_data = MAX_LFS_FILESIZE; -+ u32 snapshot; -+ int ret; -+ -+ isize = i_size_read(&inode->v); -+ if (offset >= isize) -+ return -ENXIO; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, -+ SPOS(inode->v.i_ino, offset >> 9, snapshot), -+ POS(inode->v.i_ino, U64_MAX), -+ 0, k, ret) { -+ if (bkey_extent_is_data(k.k)) { -+ next_data = max(offset, bkey_start_offset(k.k) << 9); -+ break; -+ } else if (k.k->p.offset >> 9 > isize) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ if (ret) -+ return ret; -+ -+ if (next_data > offset) -+ next_data = bch2_seek_pagecache_data(&inode->v, -+ offset, next_data, 0, false); -+ -+ if (next_data >= isize) -+ return -ENXIO; -+ -+ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); -+} -+ -+static loff_t bch2_seek_hole(struct file *file, u64 offset) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ subvol_inum inum = inode_inum(inode); -+ u64 isize, next_hole = MAX_LFS_FILESIZE; -+ u32 snapshot; -+ int ret; -+ -+ isize = i_size_read(&inode->v); -+ if (offset >= isize) -+ return -ENXIO; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, -+ SPOS(inode->v.i_ino, offset >> 9, snapshot), -+ BTREE_ITER_SLOTS, k, ret) { -+ if (k.k->p.inode != inode->v.i_ino) { -+ next_hole = bch2_seek_pagecache_hole(&inode->v, -+ offset, MAX_LFS_FILESIZE, 0, false); -+ break; -+ } else if (!bkey_extent_is_data(k.k)) { -+ next_hole = bch2_seek_pagecache_hole(&inode->v, -+ max(offset, bkey_start_offset(k.k) << 9), -+ k.k->p.offset << 9, 0, false); -+ -+ if (next_hole < k.k->p.offset << 9) -+ break; -+ } else { -+ offset = max(offset, bkey_start_offset(k.k) << 9); -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ if (ret) -+ return ret; -+ -+ if (next_hole > isize) -+ next_hole = isize; -+ -+ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); -+} -+ -+loff_t bch2_llseek(struct file *file, loff_t offset, int whence) -+{ -+ loff_t ret; -+ -+ switch (whence) { -+ case SEEK_SET: -+ case SEEK_CUR: -+ case SEEK_END: -+ ret = generic_file_llseek(file, offset, whence); -+ break; -+ case SEEK_DATA: -+ ret = bch2_seek_data(file, offset); -+ break; -+ case SEEK_HOLE: -+ ret = bch2_seek_hole(file, offset); -+ break; -+ default: -+ ret = -EINVAL; -+ break; -+ } -+ -+ return bch2_err_class(ret); -+} -+ -+void bch2_fs_fsio_exit(struct 
bch_fs *c) -+{ -+ bioset_exit(&c->nocow_flush_bioset); -+} -+ -+int bch2_fs_fsio_init(struct bch_fs *c) -+{ -+ if (bioset_init(&c->nocow_flush_bioset, -+ 1, offsetof(struct nocow_flush, bio), 0)) -+ return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; -+ -+ return 0; -+} -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h -new file mode 100644 -index 000000000..bb5b709fa ---- /dev/null -+++ b/fs/bcachefs/fs-io.h -@@ -0,0 +1,184 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_IO_H -+#define _BCACHEFS_FS_IO_H -+ -+#ifndef NO_BCACHEFS_FS -+ -+#include "buckets.h" -+#include "fs.h" -+#include "io_types.h" -+#include "quota.h" -+ -+#include -+ -+struct folio_vec { -+ struct folio *fv_folio; -+ size_t fv_offset; -+ size_t fv_len; -+}; -+ -+static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) -+{ -+ -+ struct folio *folio = page_folio(bv.bv_page); -+ size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + -+ bv.bv_offset; -+ size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); -+ -+ return (struct folio_vec) { -+ .fv_folio = folio, -+ .fv_offset = offset, -+ .fv_len = len, -+ }; -+} -+ -+static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, -+ struct bvec_iter iter) -+{ -+ return biovec_to_foliovec(bio_iter_iovec(bio, iter)); -+} -+ -+#define __bio_for_each_folio(bvl, bio, iter, start) \ -+ for (iter = (start); \ -+ (iter).bi_size && \ -+ ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ -+ bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) -+ -+/** -+ * bio_for_each_folio - iterate over folios within a bio -+ * -+ * Like other non-_all versions, this iterates over what bio->bi_iter currently -+ * points to. This version is for drivers, where the bio may have previously -+ * been split or cloned. -+ */ -+#define bio_for_each_folio(bvl, bio, iter) \ -+ __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) -+ -+struct quota_res { -+ u64 sectors; -+}; -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+static inline void __bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+ BUG_ON(res->sectors > inode->ei_quota_reserved); -+ -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -+ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); -+ inode->ei_quota_reserved -= res->sectors; -+ res->sectors = 0; -+} -+ -+static inline void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+ if (res->sectors) { -+ mutex_lock(&inode->ei_quota_lock); -+ __bch2_quota_reservation_put(c, inode, res); -+ mutex_unlock(&inode->ei_quota_lock); -+ } -+} -+ -+static inline int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ u64 sectors, -+ bool check_enospc) -+{ -+ int ret; -+ -+ if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) -+ return 0; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, -+ check_enospc ? 
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); -+ if (likely(!ret)) { -+ inode->ei_quota_reserved += sectors; -+ res->sectors += sectors; -+ } -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ return ret; -+} -+ -+#else -+ -+static inline void __bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) {} -+ -+static inline void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) {} -+ -+static inline int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ unsigned sectors, -+ bool check_enospc) -+{ -+ return 0; -+} -+ -+#endif -+ -+void __bch2_i_sectors_acct(struct bch_fs *, struct bch_inode_info *, -+ struct quota_res *, s64); -+ -+static inline void bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, -+ struct quota_res *quota_res, s64 sectors) -+{ -+ if (sectors) { -+ mutex_lock(&inode->ei_quota_lock); -+ __bch2_i_sectors_acct(c, inode, quota_res, sectors); -+ mutex_unlock(&inode->ei_quota_lock); -+ } -+} -+ -+static inline struct address_space *faults_disabled_mapping(void) -+{ -+ return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); -+} -+ -+static inline void set_fdm_dropped_locks(void) -+{ -+ current->faults_disabled_mapping = -+ (void *) (((unsigned long) current->faults_disabled_mapping)|1); -+} -+ -+static inline bool fdm_dropped_locks(void) -+{ -+ return ((unsigned long) current->faults_disabled_mapping) & 1; -+} -+ -+void bch2_inode_flush_nocow_writes_async(struct bch_fs *, -+ struct bch_inode_info *, struct closure *); -+ -+int __must_check bch2_write_inode_size(struct bch_fs *, -+ struct bch_inode_info *, -+ loff_t, unsigned); -+ -+int bch2_fsync(struct file *, loff_t, loff_t, int); -+ -+int bch2_truncate(struct mnt_idmap *, -+ struct bch_inode_info *, struct iattr *); -+long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -+ -+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, -+ loff_t, loff_t, unsigned); -+ -+loff_t bch2_llseek(struct file *, loff_t, int); -+ -+void bch2_fs_fsio_exit(struct bch_fs *); -+int bch2_fs_fsio_init(struct bch_fs *); -+#else -+static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} -+static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } -+#endif -+ -+#endif /* _BCACHEFS_FS_IO_H */ -diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c -new file mode 100644 -index 000000000..141bcced0 ---- /dev/null -+++ b/fs/bcachefs/fs-ioctl.c -@@ -0,0 +1,559 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "chardev.h" -+#include "dirent.h" -+#include "fs.h" -+#include "fs-common.h" -+#include "fs-ioctl.h" -+#include "quota.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) -+#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ -+#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ -+#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ -+ -+struct flags_set { -+ unsigned mask; -+ unsigned flags; -+ -+ unsigned projid; -+ -+ bool set_projinherit; -+ bool projinherit; -+}; -+ -+static int bch2_inode_flags_set(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ /* -+ * We're relying on btree locking here for exclusion with other ioctl -+ * calls - use the flags 
in the btree (@bi), not inode->i_flags: -+ */ -+ struct flags_set *s = p; -+ unsigned newflags = s->flags; -+ unsigned oldflags = bi->bi_flags & s->mask; -+ -+ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && -+ !capable(CAP_LINUX_IMMUTABLE)) -+ return -EPERM; -+ -+ if (!S_ISREG(bi->bi_mode) && -+ !S_ISDIR(bi->bi_mode) && -+ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) -+ return -EINVAL; -+ -+ if (s->set_projinherit) { -+ bi->bi_fields_set &= ~(1 << Inode_opt_project); -+ bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); -+ } -+ -+ bi->bi_flags &= ~s->mask; -+ bi->bi_flags |= newflags; -+ -+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); -+ return 0; -+} -+ -+static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) -+{ -+ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); -+ -+ return put_user(flags, arg); -+} -+ -+static int bch2_ioc_setflags(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *inode, -+ void __user *arg) -+{ -+ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; -+ unsigned uflags; -+ int ret; -+ -+ if (get_user(uflags, (int __user *) arg)) -+ return -EFAULT; -+ -+ s.flags = map_flags_rev(bch_flags_to_uflags, uflags); -+ if (uflags) -+ return -EOPNOTSUPP; -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ return ret; -+ -+ inode_lock(&inode->v); -+ if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { -+ ret = -EACCES; -+ goto setflags_out; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, -+ ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ -+setflags_out: -+ inode_unlock(&inode->v); -+ mnt_drop_write_file(file); -+ return ret; -+} -+ -+static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, -+ struct fsxattr __user *arg) -+{ -+ struct fsxattr fa = { 0 }; -+ -+ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); -+ -+ if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) -+ fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; -+ -+ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; -+ -+ return copy_to_user(arg, &fa, sizeof(fa)); -+} -+ -+static int fssetxattr_inode_update_fn(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct flags_set *s = p; -+ -+ if (s->projid != bi->bi_project) { -+ bi->bi_fields_set |= 1U << Inode_opt_project; -+ bi->bi_project = s->projid; -+ } -+ -+ return bch2_inode_flags_set(trans, inode, bi, p); -+} -+ -+static int bch2_ioc_fssetxattr(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *inode, -+ struct fsxattr __user *arg) -+{ -+ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; -+ struct fsxattr fa; -+ int ret; -+ -+ if (copy_from_user(&fa, arg, sizeof(fa))) -+ return -EFAULT; -+ -+ s.set_projinherit = true; -+ s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; -+ fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; -+ -+ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); -+ if (fa.fsx_xflags) -+ return -EOPNOTSUPP; -+ -+ if (fa.fsx_projid >= U32_MAX) -+ return -EINVAL; -+ -+ /* -+ * inode fields accessible via the xattr interface are stored with a +1 -+ * bias, so that 0 means unset: -+ */ -+ s.projid = fa.fsx_projid + 1; -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ return ret; -+ -+ inode_lock(&inode->v); -+ if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { -+ ret = -EACCES; -+ 
goto err; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_set_projid(c, inode, fa.fsx_projid); -+ if (ret) -+ goto err_unlock; -+ -+ ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, -+ ATTR_CTIME); -+err_unlock: -+ mutex_unlock(&inode->ei_update_lock); -+err: -+ inode_unlock(&inode->v); -+ mnt_drop_write_file(file); -+ return ret; -+} -+ -+static int bch2_reinherit_attrs_fn(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_inode_info *dir = p; -+ -+ return !bch2_reinherit_attrs(bi, &dir->ei_inode); -+} -+ -+static int bch2_ioc_reinherit_attrs(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *src, -+ const char __user *name) -+{ -+ struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); -+ struct bch_inode_info *dst; -+ struct inode *vinode = NULL; -+ char *kname = NULL; -+ struct qstr qstr; -+ int ret = 0; -+ subvol_inum inum; -+ -+ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); -+ if (!kname) -+ return -ENOMEM; -+ -+ ret = strncpy_from_user(kname, name, BCH_NAME_MAX); -+ if (unlikely(ret < 0)) -+ goto err1; -+ -+ qstr.len = ret; -+ qstr.name = kname; -+ -+ ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); -+ if (ret) -+ goto err1; -+ -+ vinode = bch2_vfs_inode_get(c, inum); -+ ret = PTR_ERR_OR_ZERO(vinode); -+ if (ret) -+ goto err1; -+ -+ dst = to_bch_ei(vinode); -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ goto err2; -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); -+ -+ if (inode_attr_changing(src, dst, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, dst, -+ src->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err3; -+ } -+ -+ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); -+err3: -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); -+ -+ /* return true if we did work */ -+ if (ret >= 0) -+ ret = !ret; -+ -+ mnt_drop_write_file(file); -+err2: -+ iput(vinode); -+err1: -+ kfree(kname); -+ -+ return ret; -+} -+ -+static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) -+{ -+ u32 flags; -+ int ret = 0; -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ if (get_user(flags, arg)) -+ return -EFAULT; -+ -+ bch_notice(c, "shutdown by ioctl type %u", flags); -+ -+ down_write(&c->vfs_sb->s_umount); -+ -+ switch (flags) { -+ case FSOP_GOING_FLAGS_DEFAULT: -+ ret = freeze_bdev(c->vfs_sb->s_bdev); -+ if (ret) -+ goto err; -+ -+ bch2_journal_flush(&c->journal); -+ c->vfs_sb->s_flags |= SB_RDONLY; -+ bch2_fs_emergency_read_only(c); -+ thaw_bdev(c->vfs_sb->s_bdev); -+ break; -+ -+ case FSOP_GOING_FLAGS_LOGFLUSH: -+ bch2_journal_flush(&c->journal); -+ fallthrough; -+ -+ case FSOP_GOING_FLAGS_NOLOGFLUSH: -+ c->vfs_sb->s_flags |= SB_RDONLY; -+ bch2_fs_emergency_read_only(c); -+ break; -+ default: -+ ret = -EINVAL; -+ break; -+ } -+err: -+ up_write(&c->vfs_sb->s_umount); -+ return ret; -+} -+ -+static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, -+ struct bch_ioctl_subvolume arg) -+{ -+ struct inode *dir; -+ struct bch_inode_info *inode; -+ struct user_namespace *s_user_ns; -+ struct dentry *dst_dentry; -+ struct path src_path, dst_path; -+ int how = LOOKUP_FOLLOW; -+ int error; -+ subvol_inum snapshot_src = { 0 }; -+ unsigned lookup_flags = 0; -+ unsigned create_flags = BCH_CREATE_SUBVOL; -+ -+ if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| -+ BCH_SUBVOL_SNAPSHOT_RO)) -+ return -EINVAL; -+ -+ if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && -+ (arg.src_ptr 
|| -+ (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) -+ return -EINVAL; -+ -+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) -+ create_flags |= BCH_CREATE_SNAPSHOT; -+ -+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) -+ create_flags |= BCH_CREATE_SNAPSHOT_RO; -+ -+ /* why do we need this lock? */ -+ down_read(&c->vfs_sb->s_umount); -+ -+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) -+ sync_inodes_sb(c->vfs_sb); -+retry: -+ if (arg.src_ptr) { -+ error = user_path_at(arg.dirfd, -+ (const char __user *)(unsigned long)arg.src_ptr, -+ how, &src_path); -+ if (error) -+ goto err1; -+ -+ if (src_path.dentry->d_sb->s_fs_info != c) { -+ path_put(&src_path); -+ error = -EXDEV; -+ goto err1; -+ } -+ -+ snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); -+ } -+ -+ dst_dentry = user_path_create(arg.dirfd, -+ (const char __user *)(unsigned long)arg.dst_ptr, -+ &dst_path, lookup_flags); -+ error = PTR_ERR_OR_ZERO(dst_dentry); -+ if (error) -+ goto err2; -+ -+ if (dst_dentry->d_sb->s_fs_info != c) { -+ error = -EXDEV; -+ goto err3; -+ } -+ -+ if (dst_dentry->d_inode) { -+ error = -EEXIST; -+ goto err3; -+ } -+ -+ dir = dst_path.dentry->d_inode; -+ if (IS_DEADDIR(dir)) { -+ error = -BCH_ERR_ENOENT_directory_dead; -+ goto err3; -+ } -+ -+ s_user_ns = dir->i_sb->s_user_ns; -+ if (!kuid_has_mapping(s_user_ns, current_fsuid()) || -+ !kgid_has_mapping(s_user_ns, current_fsgid())) { -+ error = -EOVERFLOW; -+ goto err3; -+ } -+ -+ error = inode_permission(file_mnt_idmap(filp), -+ dir, MAY_WRITE | MAY_EXEC); -+ if (error) -+ goto err3; -+ -+ if (!IS_POSIXACL(dir)) -+ arg.mode &= ~current_umask(); -+ -+ error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); -+ if (error) -+ goto err3; -+ -+ if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && -+ !arg.src_ptr) -+ snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; -+ -+ inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), -+ dst_dentry, arg.mode|S_IFDIR, -+ 0, snapshot_src, create_flags); -+ error = PTR_ERR_OR_ZERO(inode); -+ if (error) -+ goto err3; -+ -+ d_instantiate(dst_dentry, &inode->v); -+ fsnotify_mkdir(dir, dst_dentry); -+err3: -+ done_path_create(&dst_path, dst_dentry); -+err2: -+ if (arg.src_ptr) -+ path_put(&src_path); -+ -+ if (retry_estale(error, lookup_flags)) { -+ lookup_flags |= LOOKUP_REVAL; -+ goto retry; -+ } -+err1: -+ up_read(&c->vfs_sb->s_umount); -+ -+ return error; -+} -+ -+static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, -+ struct bch_ioctl_subvolume arg) -+{ -+ struct path path; -+ struct inode *dir; -+ int ret = 0; -+ -+ if (arg.flags) -+ return -EINVAL; -+ -+ ret = user_path_at(arg.dirfd, -+ (const char __user *)(unsigned long)arg.dst_ptr, -+ LOOKUP_FOLLOW, &path); -+ if (ret) -+ return ret; -+ -+ if (path.dentry->d_sb->s_fs_info != c) { -+ ret = -EXDEV; -+ goto err; -+ } -+ -+ dir = path.dentry->d_parent->d_inode; -+ -+ ret = __bch2_unlink(dir, path.dentry, true); -+ if (ret) -+ goto err; -+ -+ fsnotify_rmdir(dir, path.dentry); -+ d_delete(path.dentry); -+err: -+ path_put(&path); -+ return ret; -+} -+ -+long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ long ret; -+ -+ switch (cmd) { -+ case FS_IOC_GETFLAGS: -+ ret = bch2_ioc_getflags(inode, (int __user *) arg); -+ break; -+ -+ case FS_IOC_SETFLAGS: -+ ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg); -+ break; -+ -+ case FS_IOC_FSGETXATTR: -+ ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg); -+ 
break; -+ -+ case FS_IOC_FSSETXATTR: -+ ret = bch2_ioc_fssetxattr(c, file, inode, -+ (void __user *) arg); -+ break; -+ -+ case BCHFS_IOC_REINHERIT_ATTRS: -+ ret = bch2_ioc_reinherit_attrs(c, file, inode, -+ (void __user *) arg); -+ break; -+ -+ case FS_IOC_GETVERSION: -+ ret = -ENOTTY; -+ break; -+ -+ case FS_IOC_SETVERSION: -+ ret = -ENOTTY; -+ break; -+ -+ case FS_IOC_GOINGDOWN: -+ ret = bch2_ioc_goingdown(c, (u32 __user *) arg); -+ break; -+ -+ case BCH_IOCTL_SUBVOLUME_CREATE: { -+ struct bch_ioctl_subvolume i; -+ -+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) -+ ? -EFAULT -+ : bch2_ioctl_subvolume_create(c, file, i); -+ break; -+ } -+ -+ case BCH_IOCTL_SUBVOLUME_DESTROY: { -+ struct bch_ioctl_subvolume i; -+ -+ ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) -+ ? -EFAULT -+ : bch2_ioctl_subvolume_destroy(c, file, i); -+ break; -+ } -+ -+ default: -+ ret = bch2_fs_ioctl(c, cmd, (void __user *) arg); -+ break; -+ } -+ -+ return bch2_err_class(ret); -+} -+ -+#ifdef CONFIG_COMPAT -+long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) -+{ -+ /* These are just misnamed, they actually get/put from/to user an int */ -+ switch (cmd) { -+ case FS_IOC_GETFLAGS: -+ cmd = FS_IOC_GETFLAGS; -+ break; -+ case FS_IOC32_SETFLAGS: -+ cmd = FS_IOC_SETFLAGS; -+ break; -+ default: -+ return -ENOIOCTLCMD; -+ } -+ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -+} -+#endif -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h -new file mode 100644 -index 000000000..f201980ef ---- /dev/null -+++ b/fs/bcachefs/fs-ioctl.h -@@ -0,0 +1,81 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_IOCTL_H -+#define _BCACHEFS_FS_IOCTL_H -+ -+/* Inode flags: */ -+ -+/* bcachefs inode flags -> vfs inode flags: */ -+static const unsigned bch_flags_to_vfs[] = { -+ [__BCH_INODE_SYNC] = S_SYNC, -+ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, -+ [__BCH_INODE_APPEND] = S_APPEND, -+ [__BCH_INODE_NOATIME] = S_NOATIME, -+}; -+ -+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -+static const unsigned bch_flags_to_uflags[] = { -+ [__BCH_INODE_SYNC] = FS_SYNC_FL, -+ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, -+ [__BCH_INODE_APPEND] = FS_APPEND_FL, -+ [__BCH_INODE_NODUMP] = FS_NODUMP_FL, -+ [__BCH_INODE_NOATIME] = FS_NOATIME_FL, -+}; -+ -+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -+static const unsigned bch_flags_to_xflags[] = { -+ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, -+ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, -+ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, -+ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, -+ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, -+ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; -+}; -+ -+#define set_flags(_map, _in, _out) \ -+do { \ -+ unsigned _i; \ -+ \ -+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ -+ if ((_in) & (1 << _i)) \ -+ (_out) |= _map[_i]; \ -+ else \ -+ (_out) &= ~_map[_i]; \ -+} while (0) -+ -+#define map_flags(_map, _in) \ -+({ \ -+ unsigned _out = 0; \ -+ \ -+ set_flags(_map, _in, _out); \ -+ _out; \ -+}) -+ -+#define map_flags_rev(_map, _in) \ -+({ \ -+ unsigned _i, _out = 0; \ -+ \ -+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ -+ if ((_in) & _map[_i]) { \ -+ (_out) |= 1 << _i; \ -+ (_in) &= ~_map[_i]; \ -+ } \ -+ (_out); \ -+}) -+ -+#define map_defined(_map) \ -+({ \ -+ unsigned _in = ~0; \ -+ \ -+ map_flags_rev(_map, _in); \ -+}) -+ -+/* Set VFS inode flags from bcachefs inode: */ -+static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) -+{ -+ 
set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -+} -+ -+long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); -+long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); -+ -+#endif /* _BCACHEFS_FS_IOCTL_H */ -diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c -new file mode 100644 -index 000000000..80dcda43e ---- /dev/null -+++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1961 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "acl.h" -+#include "bkey_buf.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "chardev.h" -+#include "dirent.h" -+#include "errcode.h" -+#include "extents.h" -+#include "fs.h" -+#include "fs-common.h" -+#include "fs-io.h" -+#include "fs-ioctl.h" -+#include "fs-io-buffered.h" -+#include "fs-io-direct.h" -+#include "fs-io-pagecache.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "quota.h" -+#include "snapshot.h" -+#include "super.h" -+#include "xattr.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct kmem_cache *bch2_inode_cache; -+ -+static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, -+ struct bch_inode_info *, -+ struct bch_inode_unpacked *, -+ struct bch_subvolume *); -+ -+void bch2_inode_update_after_write(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ unsigned fields) -+{ -+ struct bch_fs *c = trans->c; -+ -+ BUG_ON(bi->bi_inum != inode->v.i_ino); -+ -+ bch2_assert_pos_locked(trans, BTREE_ID_inodes, -+ POS(0, bi->bi_inum), -+ c->opts.inodes_use_key_cache); -+ -+ set_nlink(&inode->v, bch2_inode_nlink_get(bi)); -+ i_uid_write(&inode->v, bi->bi_uid); -+ i_gid_write(&inode->v, bi->bi_gid); -+ inode->v.i_mode = bi->bi_mode; -+ -+ if (fields & ATTR_ATIME) -+ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); -+ if (fields & ATTR_MTIME) -+ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); -+ if (fields & ATTR_CTIME) -+ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); -+ -+ inode->ei_inode = *bi; -+ -+ bch2_inode_flags_to_vfs(inode); -+} -+ -+int __must_check bch2_write_inode(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ inode_set_fn set, -+ void *p, unsigned fields) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter = { NULL }; -+ struct bch_inode_unpacked inode_u; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 512); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), -+ BTREE_ITER_INTENT) ?: -+ (set ? set(&trans, inode, &inode_u, p) : 0) ?: -+ bch2_inode_write(&trans, &iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); -+ -+ /* -+ * the btree node lock protects inode->ei_inode, not ei_update_lock; -+ * this is important for inode updates via bchfs_write_index_update -+ */ -+ if (!ret) -+ bch2_inode_update_after_write(&trans, inode, &inode_u, fields); -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, -+ "inode %u:%llu not found when updating", -+ inode_inum(inode).subvol, -+ inode_inum(inode).inum); -+ -+ bch2_trans_exit(&trans); -+ return ret < 0 ? 
ret : 0; -+} -+ -+int bch2_fs_quota_transfer(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_qid new_qid, -+ unsigned qtypes, -+ enum quota_acct_mode mode) -+{ -+ unsigned i; -+ int ret; -+ -+ qtypes &= enabled_qtypes(c); -+ -+ for (i = 0; i < QTYP_NR; i++) -+ if (new_qid.q[i] == inode->ei_qid.q[i]) -+ qtypes &= ~(1U << i); -+ -+ if (!qtypes) -+ return 0; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ -+ ret = bch2_quota_transfer(c, qtypes, new_qid, -+ inode->ei_qid, -+ inode->v.i_blocks + -+ inode->ei_quota_reserved, -+ mode); -+ if (!ret) -+ for (i = 0; i < QTYP_NR; i++) -+ if (qtypes & (1 << i)) -+ inode->ei_qid.q[i] = new_qid.q[i]; -+ -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ return ret; -+} -+ -+static int bch2_iget5_test(struct inode *vinode, void *p) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ subvol_inum *inum = p; -+ -+ return inode->ei_subvol == inum->subvol && -+ inode->ei_inode.bi_inum == inum->inum; -+} -+ -+static int bch2_iget5_set(struct inode *vinode, void *p) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ subvol_inum *inum = p; -+ -+ inode->v.i_ino = inum->inum; -+ inode->ei_subvol = inum->subvol; -+ inode->ei_inode.bi_inum = inum->inum; -+ return 0; -+} -+ -+static unsigned bch2_inode_hash(subvol_inum inum) -+{ -+ return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); -+} -+ -+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) -+{ -+ struct bch_inode_unpacked inode_u; -+ struct bch_inode_info *inode; -+ struct btree_trans trans; -+ struct bch_subvolume subvol; -+ int ret; -+ -+ inode = to_bch_ei(iget5_locked(c->vfs_sb, -+ bch2_inode_hash(inum), -+ bch2_iget5_test, -+ bch2_iget5_set, -+ &inum)); -+ if (unlikely(!inode)) -+ return ERR_PTR(-ENOMEM); -+ if (!(inode->v.i_state & I_NEW)) -+ return &inode->v; -+ -+ bch2_trans_init(&trans, c, 8, 0); -+ ret = lockrestart_do(&trans, -+ bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: -+ bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); -+ -+ if (!ret) -+ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); -+ bch2_trans_exit(&trans); -+ -+ if (ret) { -+ iget_failed(&inode->v); -+ return ERR_PTR(bch2_err_class(ret)); -+ } -+ -+ mutex_lock(&c->vfs_inodes_lock); -+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); -+ mutex_unlock(&c->vfs_inodes_lock); -+ -+ unlock_new_inode(&inode->v); -+ -+ return &inode->v; -+} -+ -+struct bch_inode_info * -+__bch2_create(struct mnt_idmap *idmap, -+ struct bch_inode_info *dir, struct dentry *dentry, -+ umode_t mode, dev_t rdev, subvol_inum snapshot_src, -+ unsigned flags) -+{ -+ struct bch_fs *c = dir->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct bch_inode_unpacked dir_u; -+ struct bch_inode_info *inode, *old; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *default_acl = NULL, *acl = NULL; -+ subvol_inum inum; -+ struct bch_subvolume subvol; -+ u64 journal_seq = 0; -+ int ret; -+ -+ /* -+ * preallocate acls + vfs inode before btree transaction, so that -+ * nothing can fail after the transaction succeeds: -+ */ -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); -+ if (ret) -+ return ERR_PTR(ret); -+#endif -+ inode = to_bch_ei(new_inode(c->vfs_sb)); -+ if (unlikely(!inode)) { -+ inode = ERR_PTR(-ENOMEM); -+ goto err; -+ } -+ -+ bch2_inode_init_early(c, &inode_u); -+ -+ if (!(flags & BCH_CREATE_TMPFILE)) -+ mutex_lock(&dir->ei_update_lock); -+ -+ bch2_trans_init(&trans, c, 8, -+ 2048 + (!(flags & 
BCH_CREATE_TMPFILE) -+ ? dentry->d_name.len : 0)); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_create_trans(&trans, -+ inode_inum(dir), &dir_u, &inode_u, -+ !(flags & BCH_CREATE_TMPFILE) -+ ? &dentry->d_name : NULL, -+ from_kuid(i_user_ns(&dir->v), current_fsuid()), -+ from_kgid(i_user_ns(&dir->v), current_fsgid()), -+ mode, rdev, -+ default_acl, acl, snapshot_src, flags) ?: -+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (unlikely(ret)) -+ goto err_before_quota; -+ -+ inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; -+ inum.inum = inode_u.bi_inum; -+ -+ ret = bch2_subvolume_get(&trans, inum.subvol, true, -+ BTREE_ITER_WITH_UPDATES, &subvol) ?: -+ bch2_trans_commit(&trans, NULL, &journal_seq, 0); -+ if (unlikely(ret)) { -+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, -+ KEY_TYPE_QUOTA_WARN); -+err_before_quota: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ goto err_trans; -+ } -+ -+ if (!(flags & BCH_CREATE_TMPFILE)) { -+ bch2_inode_update_after_write(&trans, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&dir->ei_update_lock); -+ } -+ -+ bch2_iget5_set(&inode->v, &inum); -+ bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); -+ -+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); -+ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); -+ -+ /* -+ * we must insert the new inode into the inode cache before calling -+ * bch2_trans_exit() and dropping locks, else we could race with another -+ * thread pulling the inode in and modifying it: -+ */ -+ -+ inode->v.i_state |= I_CREATING; -+ -+ old = to_bch_ei(inode_insert5(&inode->v, -+ bch2_inode_hash(inum), -+ bch2_iget5_test, -+ bch2_iget5_set, -+ &inum)); -+ BUG_ON(!old); -+ -+ if (unlikely(old != inode)) { -+ /* -+ * We raced, another process pulled the new inode into cache -+ * before us: -+ */ -+ make_bad_inode(&inode->v); -+ iput(&inode->v); -+ -+ inode = old; -+ } else { -+ mutex_lock(&c->vfs_inodes_lock); -+ list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); -+ mutex_unlock(&c->vfs_inodes_lock); -+ /* -+ * we really don't want insert_inode_locked2() to be setting -+ * I_NEW... 
-+ */ -+ unlock_new_inode(&inode->v); -+ } -+ -+ bch2_trans_exit(&trans); -+err: -+ posix_acl_release(default_acl); -+ posix_acl_release(acl); -+ return inode; -+err_trans: -+ if (!(flags & BCH_CREATE_TMPFILE)) -+ mutex_unlock(&dir->ei_update_lock); -+ -+ bch2_trans_exit(&trans); -+ make_bad_inode(&inode->v); -+ iput(&inode->v); -+ inode = ERR_PTR(ret); -+ goto err; -+} -+ -+/* methods */ -+ -+static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, -+ unsigned int flags) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); -+ struct inode *vinode = NULL; -+ subvol_inum inum = { .subvol = 1 }; -+ int ret; -+ -+ ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, -+ &dentry->d_name, &inum); -+ -+ if (!ret) -+ vinode = bch2_vfs_inode_get(c, inum); -+ -+ return d_splice_alias(vinode, dentry); -+} -+ -+static int bch2_mknod(struct mnt_idmap *idmap, -+ struct inode *vdir, struct dentry *dentry, -+ umode_t mode, dev_t rdev) -+{ -+ struct bch_inode_info *inode = -+ __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, -+ (subvol_inum) { 0 }, 0); -+ -+ if (IS_ERR(inode)) -+ return bch2_err_class(PTR_ERR(inode)); -+ -+ d_instantiate(dentry, &inode->v); -+ return 0; -+} -+ -+static int bch2_create(struct mnt_idmap *idmap, -+ struct inode *vdir, struct dentry *dentry, -+ umode_t mode, bool excl) -+{ -+ return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); -+} -+ -+static int __bch2_link(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_inode_info *dir, -+ struct dentry *dentry) -+{ -+ struct btree_trans trans; -+ struct bch_inode_unpacked dir_u, inode_u; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ bch2_trans_init(&trans, c, 4, 1024); -+ -+ ret = commit_do(&trans, NULL, NULL, 0, -+ bch2_link_trans(&trans, -+ inode_inum(dir), &dir_u, -+ inode_inum(inode), &inode_u, -+ &dentry->d_name)); -+ -+ if (likely(!ret)) { -+ bch2_inode_update_after_write(&trans, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); -+ } -+ -+ bch2_trans_exit(&trans); -+ mutex_unlock(&inode->ei_update_lock); -+ return ret; -+} -+ -+static int bch2_link(struct dentry *old_dentry, struct inode *vdir, -+ struct dentry *dentry) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); -+ int ret; -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ -+ ret = __bch2_link(c, inode, dir, dentry); -+ if (unlikely(ret)) -+ return ret; -+ -+ ihold(&inode->v); -+ d_instantiate(dentry, &inode->v); -+ return 0; -+} -+ -+int __bch2_unlink(struct inode *vdir, struct dentry *dentry, -+ bool deleting_snapshot) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ struct bch_inode_unpacked dir_u, inode_u; -+ struct btree_trans trans; -+ int ret; -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); -+ bch2_trans_init(&trans, c, 4, 1024); -+ -+ ret = commit_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL, -+ bch2_unlink_trans(&trans, -+ inode_inum(dir), &dir_u, -+ &inode_u, &dentry->d_name, -+ deleting_snapshot)); -+ if (unlikely(ret)) -+ goto err; -+ -+ bch2_inode_update_after_write(&trans, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ bch2_inode_update_after_write(&trans, inode, &inode_u, -+ ATTR_MTIME); -+ -+ if 
(inode_u.bi_subvol) { -+ /* -+ * Subvolume deletion is asynchronous, but we still want to tell -+ * the VFS that it's been deleted here: -+ */ -+ set_nlink(&inode->v, 0); -+ } -+err: -+ bch2_trans_exit(&trans); -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); -+ -+ return ret; -+} -+ -+static int bch2_unlink(struct inode *vdir, struct dentry *dentry) -+{ -+ return __bch2_unlink(vdir, dentry, false); -+} -+ -+static int bch2_symlink(struct mnt_idmap *idmap, -+ struct inode *vdir, struct dentry *dentry, -+ const char *symname) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir), *inode; -+ int ret; -+ -+ inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, -+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); -+ if (IS_ERR(inode)) -+ return bch2_err_class(PTR_ERR(inode)); -+ -+ inode_lock(&inode->v); -+ ret = page_symlink(&inode->v, symname, strlen(symname) + 1); -+ inode_unlock(&inode->v); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); -+ if (unlikely(ret)) -+ goto err; -+ -+ ret = __bch2_link(c, inode, dir, dentry); -+ if (unlikely(ret)) -+ goto err; -+ -+ d_instantiate(dentry, &inode->v); -+ return 0; -+err: -+ iput(&inode->v); -+ return ret; -+} -+ -+static int bch2_mkdir(struct mnt_idmap *idmap, -+ struct inode *vdir, struct dentry *dentry, umode_t mode) -+{ -+ return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0); -+} -+ -+static int bch2_rename2(struct mnt_idmap *idmap, -+ struct inode *src_vdir, struct dentry *src_dentry, -+ struct inode *dst_vdir, struct dentry *dst_dentry, -+ unsigned flags) -+{ -+ struct bch_fs *c = src_vdir->i_sb->s_fs_info; -+ struct bch_inode_info *src_dir = to_bch_ei(src_vdir); -+ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); -+ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); -+ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); -+ struct bch_inode_unpacked dst_dir_u, src_dir_u; -+ struct bch_inode_unpacked src_inode_u, dst_inode_u; -+ struct btree_trans trans; -+ enum bch_rename_mode mode = flags & RENAME_EXCHANGE -+ ? BCH_RENAME_EXCHANGE -+ : dst_dentry->d_inode -+ ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; -+ int ret; -+ -+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) -+ return -EINVAL; -+ -+ if (mode == BCH_RENAME_OVERWRITE) { -+ ret = filemap_write_and_wait_range(src_inode->v.i_mapping, -+ 0, LLONG_MAX); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_trans_init(&trans, c, 8, 2048); -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, -+ src_dir, -+ dst_dir, -+ src_inode, -+ dst_inode); -+ -+ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, src_inode, -+ dst_dir->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_EXCHANGE && -+ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, dst_inode, -+ src_dir->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ } -+ -+ ret = commit_do(&trans, NULL, NULL, 0, -+ bch2_rename_trans(&trans, -+ inode_inum(src_dir), &src_dir_u, -+ inode_inum(dst_dir), &dst_dir_u, -+ &src_inode_u, -+ &dst_inode_u, -+ &src_dentry->d_name, -+ &dst_dentry->d_name, -+ mode)); -+ if (unlikely(ret)) -+ goto err; -+ -+ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); -+ BUG_ON(dst_inode && -+ dst_inode->v.i_ino != dst_inode_u.bi_inum); -+ -+ bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ -+ if (src_dir != dst_dir) -+ bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ -+ bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, -+ ATTR_CTIME); -+ -+ if (dst_inode) -+ bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, -+ ATTR_CTIME); -+err: -+ bch2_trans_exit(&trans); -+ -+ bch2_fs_quota_transfer(c, src_inode, -+ bch_qid(&src_inode->ei_inode), -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_NOCHECK); -+ if (dst_inode) -+ bch2_fs_quota_transfer(c, dst_inode, -+ bch_qid(&dst_inode->ei_inode), -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_NOCHECK); -+ -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, -+ src_dir, -+ dst_dir, -+ src_inode, -+ dst_inode); -+ -+ return ret; -+} -+ -+static void bch2_setattr_copy(struct mnt_idmap *idmap, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ struct iattr *attr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ unsigned int ia_valid = attr->ia_valid; -+ -+ if (ia_valid & ATTR_UID) -+ bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid); -+ if (ia_valid & ATTR_GID) -+ bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); -+ -+ if (ia_valid & ATTR_SIZE) -+ bi->bi_size = attr->ia_size; -+ -+ if (ia_valid & ATTR_ATIME) -+ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); -+ if (ia_valid & ATTR_MTIME) -+ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); -+ if (ia_valid & ATTR_CTIME) -+ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); -+ -+ if (ia_valid & ATTR_MODE) { -+ umode_t mode = attr->ia_mode; -+ kgid_t gid = ia_valid & ATTR_GID -+ ? 
attr->ia_gid -+ : inode->v.i_gid; -+ -+ if (!in_group_p(gid) && -+ !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID)) -+ mode &= ~S_ISGID; -+ bi->bi_mode = mode; -+ } -+} -+ -+int bch2_setattr_nonsize(struct mnt_idmap *idmap, -+ struct bch_inode_info *inode, -+ struct iattr *attr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_qid qid; -+ struct btree_trans trans; -+ struct btree_iter inode_iter = { NULL }; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *acl = NULL; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ -+ qid = inode->ei_qid; -+ -+ if (attr->ia_valid & ATTR_UID) -+ qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid); -+ -+ if (attr->ia_valid & ATTR_GID) -+ qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid); -+ -+ ret = bch2_fs_quota_transfer(c, inode, qid, ~0, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ kfree(acl); -+ acl = NULL; -+ -+ ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), -+ BTREE_ITER_INTENT); -+ if (ret) -+ goto btree_err; -+ -+ bch2_setattr_copy(idmap, inode, &inode_u, attr); -+ -+ if (attr->ia_valid & ATTR_MODE) { -+ ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, -+ inode_u.bi_mode, &acl); -+ if (ret) -+ goto btree_err; -+ } -+ -+ ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+btree_err: -+ bch2_trans_iter_exit(&trans, &inode_iter); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ if (unlikely(ret)) -+ goto err_trans; -+ -+ bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); -+ -+ if (acl) -+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); -+err_trans: -+ bch2_trans_exit(&trans); -+err: -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return bch2_err_class(ret); -+} -+ -+static int bch2_getattr(struct mnt_idmap *idmap, -+ const struct path *path, struct kstat *stat, -+ u32 request_mask, unsigned query_flags) -+{ -+ struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ stat->dev = inode->v.i_sb->s_dev; -+ stat->ino = inode->v.i_ino; -+ stat->mode = inode->v.i_mode; -+ stat->nlink = inode->v.i_nlink; -+ stat->uid = inode->v.i_uid; -+ stat->gid = inode->v.i_gid; -+ stat->rdev = inode->v.i_rdev; -+ stat->size = i_size_read(&inode->v); -+ stat->atime = inode->v.i_atime; -+ stat->mtime = inode->v.i_mtime; -+ stat->ctime = inode->v.i_ctime; -+ stat->blksize = block_bytes(c); -+ stat->blocks = inode->v.i_blocks; -+ -+ if (request_mask & STATX_BTIME) { -+ stat->result_mask |= STATX_BTIME; -+ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); -+ } -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) -+ stat->attributes |= STATX_ATTR_IMMUTABLE; -+ stat->attributes_mask |= STATX_ATTR_IMMUTABLE; -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) -+ stat->attributes |= STATX_ATTR_APPEND; -+ stat->attributes_mask |= STATX_ATTR_APPEND; -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) -+ stat->attributes |= STATX_ATTR_NODUMP; -+ stat->attributes_mask |= STATX_ATTR_NODUMP; -+ -+ return 0; -+} -+ -+static int bch2_setattr(struct mnt_idmap *idmap, -+ struct dentry *dentry, struct iattr *iattr) -+{ -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ int ret; -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ -+ ret = setattr_prepare(idmap, dentry, iattr); -+ if 
(ret) -+ return ret; -+ -+ return iattr->ia_valid & ATTR_SIZE -+ ? bch2_truncate(idmap, inode, iattr) -+ : bch2_setattr_nonsize(idmap, inode, iattr); -+} -+ -+static int bch2_tmpfile(struct mnt_idmap *idmap, -+ struct inode *vdir, struct file *file, umode_t mode) -+{ -+ struct bch_inode_info *inode = -+ __bch2_create(idmap, to_bch_ei(vdir), -+ file->f_path.dentry, mode, 0, -+ (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); -+ -+ if (IS_ERR(inode)) -+ return bch2_err_class(PTR_ERR(inode)); -+ -+ d_mark_tmpfile(file, &inode->v); -+ d_instantiate(file->f_path.dentry, &inode->v); -+ return finish_open_simple(file, 0); -+} -+ -+static int bch2_fill_extent(struct bch_fs *c, -+ struct fiemap_extent_info *info, -+ struct bkey_s_c k, unsigned flags) -+{ -+ if (bkey_extent_is_direct_data(k.k)) { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ int ret; -+ -+ if (k.k->type == KEY_TYPE_reflink_v) -+ flags |= FIEMAP_EXTENT_SHARED; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ int flags2 = 0; -+ u64 offset = p.ptr.offset; -+ -+ if (p.ptr.unwritten) -+ flags2 |= FIEMAP_EXTENT_UNWRITTEN; -+ -+ if (p.crc.compression_type) -+ flags2 |= FIEMAP_EXTENT_ENCODED; -+ else -+ offset += p.crc.offset; -+ -+ if ((offset & (block_sectors(c) - 1)) || -+ (k.k->size & (block_sectors(c) - 1))) -+ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; -+ -+ ret = fiemap_fill_next_extent(info, -+ bkey_start_offset(k.k) << 9, -+ offset << 9, -+ k.k->size << 9, flags|flags2); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+ } else if (bkey_extent_is_inline_data(k.k)) { -+ return fiemap_fill_next_extent(info, -+ bkey_start_offset(k.k) << 9, -+ 0, k.k->size << 9, -+ flags| -+ FIEMAP_EXTENT_DATA_INLINE); -+ } else if (k.k->type == KEY_TYPE_reservation) { -+ return fiemap_fill_next_extent(info, -+ bkey_start_offset(k.k) << 9, -+ 0, k.k->size << 9, -+ flags| -+ FIEMAP_EXTENT_DELALLOC| -+ FIEMAP_EXTENT_UNWRITTEN); -+ } else { -+ BUG(); -+ } -+} -+ -+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, -+ u64 start, u64 len) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *ei = to_bch_ei(vinode); -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_buf cur, prev; -+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); -+ unsigned offset_into_extent, sectors; -+ bool have_extent = false; -+ u32 snapshot; -+ int ret = 0; -+ -+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); -+ if (ret) -+ return ret; -+ -+ if (start + len < start) -+ return -EINVAL; -+ -+ start >>= 9; -+ -+ bch2_bkey_buf_init(&cur); -+ bch2_bkey_buf_init(&prev); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ SPOS(ei->v.i_ino, start, snapshot), 0); -+ -+ while (!(ret = btree_trans_too_many_iters(&trans)) && -+ (k = bch2_btree_iter_peek_upto(&iter, end)).k && -+ !(ret = bkey_err(k))) { -+ enum btree_id data_btree = BTREE_ID_extents; -+ -+ if (!bkey_extent_is_data(k.k) && -+ k.k->type != KEY_TYPE_reservation) { -+ bch2_btree_iter_advance(&iter); -+ continue; -+ } -+ -+ offset_into_extent = iter.pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ bch2_bkey_buf_reassemble(&cur, c, k); -+ -+ ret = bch2_read_indirect_extent(&trans, &data_btree, -+ &offset_into_extent, &cur); 
-+ if (ret) -+ break; -+ -+ k = bkey_i_to_s_c(cur.k); -+ bch2_bkey_buf_realloc(&prev, c, k.k->u64s); -+ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ bch2_cut_front(POS(k.k->p.inode, -+ bkey_start_offset(k.k) + -+ offset_into_extent), -+ cur.k); -+ bch2_key_resize(&cur.k->k, sectors); -+ cur.k->k.p = iter.pos; -+ cur.k->k.p.offset += cur.k->k.size; -+ -+ if (have_extent) { -+ bch2_trans_unlock(&trans); -+ ret = bch2_fill_extent(c, info, -+ bkey_i_to_s_c(prev.k), 0); -+ if (ret) -+ break; -+ } -+ -+ bkey_copy(prev.k, cur.k); -+ have_extent = true; -+ -+ bch2_btree_iter_set_pos(&iter, -+ POS(iter.pos.inode, iter.pos.offset + sectors)); -+ } -+ start = iter.pos.offset; -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ if (!ret && have_extent) { -+ bch2_trans_unlock(&trans); -+ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), -+ FIEMAP_EXTENT_LAST); -+ } -+ -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&cur, c); -+ bch2_bkey_buf_exit(&prev, c); -+ return ret < 0 ? ret : 0; -+} -+ -+static const struct vm_operations_struct bch_vm_ops = { -+ .fault = bch2_page_fault, -+ .map_pages = filemap_map_pages, -+ .page_mkwrite = bch2_page_mkwrite, -+}; -+ -+static int bch2_mmap(struct file *file, struct vm_area_struct *vma) -+{ -+ file_accessed(file); -+ -+ vma->vm_ops = &bch_vm_ops; -+ return 0; -+} -+ -+/* Directories: */ -+ -+static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) -+{ -+ return generic_file_llseek_size(file, offset, whence, -+ S64_MAX, S64_MAX); -+} -+ -+static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ int ret; -+ -+ if (!dir_emit_dots(file, ctx)) -+ return 0; -+ -+ ret = bch2_readdir(c, inode_inum(inode), ctx); -+ if (ret) -+ bch_err_fn(c, ret); -+ -+ return bch2_err_class(ret); -+} -+ -+static const struct file_operations bch_file_operations = { -+ .llseek = bch2_llseek, -+ .read_iter = bch2_read_iter, -+ .write_iter = bch2_write_iter, -+ .mmap = bch2_mmap, -+ .open = generic_file_open, -+ .fsync = bch2_fsync, -+ .splice_read = filemap_splice_read, -+ .splice_write = iter_file_splice_write, -+ .fallocate = bch2_fallocate_dispatch, -+ .unlocked_ioctl = bch2_fs_file_ioctl, -+#ifdef CONFIG_COMPAT -+ .compat_ioctl = bch2_compat_fs_ioctl, -+#endif -+ .remap_file_range = bch2_remap_file_range, -+}; -+ -+static const struct inode_operations bch_file_inode_operations = { -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .fiemap = bch2_fiemap, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct inode_operations bch_dir_inode_operations = { -+ .lookup = bch2_lookup, -+ .create = bch2_create, -+ .link = bch2_link, -+ .unlink = bch2_unlink, -+ .symlink = bch2_symlink, -+ .mkdir = bch2_mkdir, -+ .rmdir = bch2_unlink, -+ .mknod = bch2_mknod, -+ .rename = bch2_rename2, -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .tmpfile = bch2_tmpfile, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct file_operations bch_dir_file_operations = { -+ .llseek = bch2_dir_llseek, -+ .read = generic_read_dir, -+ .iterate_shared = bch2_vfs_readdir, -+ .fsync = bch2_fsync, -+ .unlocked_ioctl = bch2_fs_file_ioctl, -+#ifdef 
CONFIG_COMPAT -+ .compat_ioctl = bch2_compat_fs_ioctl, -+#endif -+}; -+ -+static const struct inode_operations bch_symlink_inode_operations = { -+ .get_link = page_get_link, -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct inode_operations bch_special_inode_operations = { -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct address_space_operations bch_address_space_operations = { -+ .read_folio = bch2_read_folio, -+ .writepages = bch2_writepages, -+ .readahead = bch2_readahead, -+ .dirty_folio = filemap_dirty_folio, -+ .write_begin = bch2_write_begin, -+ .write_end = bch2_write_end, -+ .invalidate_folio = bch2_invalidate_folio, -+ .release_folio = bch2_release_folio, -+ .direct_IO = noop_direct_IO, -+#ifdef CONFIG_MIGRATION -+ .migrate_folio = filemap_migrate_folio, -+#endif -+ .error_remove_page = generic_error_remove_page, -+}; -+ -+struct bcachefs_fid { -+ u64 inum; -+ u32 subvol; -+ u32 gen; -+} __packed; -+ -+struct bcachefs_fid_with_parent { -+ struct bcachefs_fid fid; -+ struct bcachefs_fid dir; -+} __packed; -+ -+static int bcachefs_fid_valid(int fh_len, int fh_type) -+{ -+ switch (fh_type) { -+ case FILEID_BCACHEFS_WITHOUT_PARENT: -+ return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); -+ case FILEID_BCACHEFS_WITH_PARENT: -+ return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); -+ default: -+ return false; -+ } -+} -+ -+static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) -+{ -+ return (struct bcachefs_fid) { -+ .inum = inode->ei_inode.bi_inum, -+ .subvol = inode->ei_subvol, -+ .gen = inode->ei_inode.bi_generation, -+ }; -+} -+ -+static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, -+ struct inode *vdir) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ -+ if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)) -+ return FILEID_INVALID; -+ -+ if (!S_ISDIR(inode->v.i_mode) && dir) { -+ struct bcachefs_fid_with_parent *fid = (void *) fh; -+ -+ fid->fid = bch2_inode_to_fid(inode); -+ fid->dir = bch2_inode_to_fid(dir); -+ -+ *len = sizeof(*fid) / sizeof(u32); -+ return FILEID_BCACHEFS_WITH_PARENT; -+ } else { -+ struct bcachefs_fid *fid = (void *) fh; -+ -+ *fid = bch2_inode_to_fid(inode); -+ -+ *len = sizeof(*fid) / sizeof(u32); -+ return FILEID_BCACHEFS_WITHOUT_PARENT; -+ } -+} -+ -+static struct inode *bch2_nfs_get_inode(struct super_block *sb, -+ struct bcachefs_fid fid) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { -+ .subvol = fid.subvol, -+ .inum = fid.inum, -+ }); -+ if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { -+ iput(vinode); -+ vinode = ERR_PTR(-ESTALE); -+ } -+ return vinode; -+} -+ -+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, -+ int fh_len, int fh_type) -+{ -+ struct bcachefs_fid *fid = (void *) _fid; -+ -+ if (!bcachefs_fid_valid(fh_len, fh_type)) -+ return NULL; -+ -+ return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); -+} -+ -+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, -+ int fh_len, int fh_type) -+{ -+ struct bcachefs_fid_with_parent *fid = (void *) _fid; -+ -+ 
if (!bcachefs_fid_valid(fh_len, fh_type) || -+ fh_type != FILEID_BCACHEFS_WITH_PARENT) -+ return NULL; -+ -+ return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); -+} -+ -+static struct dentry *bch2_get_parent(struct dentry *child) -+{ -+ struct bch_inode_info *inode = to_bch_ei(child->d_inode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ subvol_inum parent_inum = { -+ .subvol = inode->ei_inode.bi_parent_subvol ?: -+ inode->ei_subvol, -+ .inum = inode->ei_inode.bi_dir, -+ }; -+ -+ if (!parent_inum.inum) -+ return NULL; -+ -+ return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); -+} -+ -+static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) -+{ -+ struct bch_inode_info *inode = to_bch_ei(child->d_inode); -+ struct bch_inode_info *dir = to_bch_ei(parent->d_inode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter iter1; -+ struct btree_iter iter2; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent d; -+ struct bch_inode_unpacked inode_u; -+ subvol_inum target; -+ u32 snapshot; -+ struct qstr dirent_name; -+ unsigned name_len = 0; -+ int ret; -+ -+ if (!S_ISDIR(dir->v.i_mode)) -+ return -EINVAL; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, -+ POS(dir->ei_inode.bi_inum, 0), 0); -+ bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, -+ POS(dir->ei_inode.bi_inum, 0), 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ bch2_btree_iter_set_snapshot(&iter1, snapshot); -+ bch2_btree_iter_set_snapshot(&iter2, snapshot); -+ -+ ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); -+ if (ret) -+ goto err; -+ -+ if (inode_u.bi_dir == dir->ei_inode.bi_inum) { -+ bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); -+ -+ k = bch2_btree_iter_peek_slot(&iter1); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (k.k->type != KEY_TYPE_dirent) { -+ ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; -+ goto err; -+ } -+ -+ d = bkey_s_c_to_dirent(k); -+ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); -+ if (ret > 0) -+ ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; -+ if (ret) -+ goto err; -+ -+ if (target.subvol == inode->ei_subvol && -+ target.inum == inode->ei_inode.bi_inum) -+ goto found; -+ } else { -+ /* -+ * File with multiple hardlinks and our backref is to the wrong -+ * directory - linear search: -+ */ -+ for_each_btree_key_continue_norestart(iter2, 0, k, ret) { -+ if (k.k->p.inode > dir->ei_inode.bi_inum) -+ break; -+ -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ d = bkey_s_c_to_dirent(k); -+ ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); -+ if (ret < 0) -+ break; -+ if (ret) -+ continue; -+ -+ if (target.subvol == inode->ei_subvol && -+ target.inum == inode->ei_inode.bi_inum) -+ goto found; -+ } -+ } -+ -+ ret = -ENOENT; -+ goto err; -+found: -+ dirent_name = bch2_dirent_get_name(d); -+ -+ name_len = min_t(unsigned, dirent_name.len, NAME_MAX); -+ memcpy(name, dirent_name.name, name_len); -+ name[name_len] = '\0'; -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_iter_exit(&trans, &iter1); -+ bch2_trans_iter_exit(&trans, &iter2); -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static const struct export_operations bch_export_ops = { -+ .encode_fh = bch2_encode_fh, -+ .fh_to_dentry = bch2_fh_to_dentry, -+ 
.fh_to_parent = bch2_fh_to_parent, -+ .get_parent = bch2_get_parent, -+ .get_name = bch2_get_name, -+}; -+ -+static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ struct bch_subvolume *subvol) -+{ -+ bch2_inode_update_after_write(trans, inode, bi, ~0); -+ -+ if (BCH_SUBVOLUME_SNAP(subvol)) -+ set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); -+ else -+ clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); -+ -+ inode->v.i_blocks = bi->bi_sectors; -+ inode->v.i_ino = bi->bi_inum; -+ inode->v.i_rdev = bi->bi_dev; -+ inode->v.i_generation = bi->bi_generation; -+ inode->v.i_size = bi->bi_size; -+ -+ inode->ei_flags = 0; -+ inode->ei_quota_reserved = 0; -+ inode->ei_qid = bch_qid(bi); -+ inode->ei_subvol = inum.subvol; -+ -+ inode->v.i_mapping->a_ops = &bch_address_space_operations; -+ -+ switch (inode->v.i_mode & S_IFMT) { -+ case S_IFREG: -+ inode->v.i_op = &bch_file_inode_operations; -+ inode->v.i_fop = &bch_file_operations; -+ break; -+ case S_IFDIR: -+ inode->v.i_op = &bch_dir_inode_operations; -+ inode->v.i_fop = &bch_dir_file_operations; -+ break; -+ case S_IFLNK: -+ inode_nohighmem(&inode->v); -+ inode->v.i_op = &bch_symlink_inode_operations; -+ break; -+ default: -+ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); -+ inode->v.i_op = &bch_special_inode_operations; -+ break; -+ } -+ -+ mapping_set_large_folios(inode->v.i_mapping); -+} -+ -+static struct inode *bch2_alloc_inode(struct super_block *sb) -+{ -+ struct bch_inode_info *inode; -+ -+ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); -+ if (!inode) -+ return NULL; -+ -+ inode_init_once(&inode->v); -+ mutex_init(&inode->ei_update_lock); -+ two_state_lock_init(&inode->ei_pagecache_lock); -+ INIT_LIST_HEAD(&inode->ei_vfs_inode_list); -+ mutex_init(&inode->ei_quota_lock); -+ -+ return &inode->v; -+} -+ -+static void bch2_i_callback(struct rcu_head *head) -+{ -+ struct inode *vinode = container_of(head, struct inode, i_rcu); -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ -+ kmem_cache_free(bch2_inode_cache, inode); -+} -+ -+static void bch2_destroy_inode(struct inode *vinode) -+{ -+ call_rcu(&vinode->i_rcu, bch2_i_callback); -+} -+ -+static int inode_update_times_fn(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); -+ bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); -+ bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); -+ -+ return 0; -+} -+ -+static int bch2_vfs_write_inode(struct inode *vinode, -+ struct writeback_control *wbc) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, -+ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return bch2_err_class(ret); -+} -+ -+static void bch2_evict_inode(struct inode *vinode) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ -+ truncate_inode_pages_final(&inode->v.i_data); -+ -+ clear_inode(&inode->v); -+ -+ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); -+ -+ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), -+ KEY_TYPE_QUOTA_WARN); -+ 
bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, -+ KEY_TYPE_QUOTA_WARN); -+ bch2_inode_rm(c, inode_inum(inode)); -+ } -+ -+ mutex_lock(&c->vfs_inodes_lock); -+ list_del_init(&inode->ei_vfs_inode_list); -+ mutex_unlock(&c->vfs_inodes_lock); -+} -+ -+void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) -+{ -+ struct bch_inode_info *inode, **i; -+ DARRAY(struct bch_inode_info *) grabbed; -+ bool clean_pass = false, this_pass_clean; -+ -+ /* -+ * Initially, we scan for inodes without I_DONTCACHE, then mark them to -+ * be pruned with d_mark_dontcache(). -+ * -+ * Once we've had a clean pass where we didn't find any inodes without -+ * I_DONTCACHE, we wait for them to be freed: -+ */ -+ -+ darray_init(&grabbed); -+ darray_make_room(&grabbed, 1024); -+again: -+ cond_resched(); -+ this_pass_clean = true; -+ -+ mutex_lock(&c->vfs_inodes_lock); -+ list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { -+ if (!snapshot_list_has_id(s, inode->ei_subvol)) -+ continue; -+ -+ if (!(inode->v.i_state & I_DONTCACHE) && -+ !(inode->v.i_state & I_FREEING) && -+ igrab(&inode->v)) { -+ this_pass_clean = false; -+ -+ if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { -+ iput(&inode->v); -+ break; -+ } -+ } else if (clean_pass && this_pass_clean) { -+ wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); -+ DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); -+ -+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); -+ mutex_unlock(&c->vfs_inodes_lock); -+ -+ schedule(); -+ finish_wait(wq, &wait.wq_entry); -+ goto again; -+ } -+ } -+ mutex_unlock(&c->vfs_inodes_lock); -+ -+ darray_for_each(grabbed, i) { -+ inode = *i; -+ d_mark_dontcache(&inode->v); -+ d_prune_aliases(&inode->v); -+ iput(&inode->v); -+ } -+ grabbed.nr = 0; -+ -+ if (!clean_pass || !this_pass_clean) { -+ clean_pass = this_pass_clean; -+ goto again; -+ } -+ -+ darray_exit(&grabbed); -+} -+ -+static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) -+{ -+ struct super_block *sb = dentry->d_sb; -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); -+ unsigned shift = sb->s_blocksize_bits - 9; -+ /* -+ * this assumes inodes take up 64 bytes, which is a decent average -+ * number: -+ */ -+ u64 avail_inodes = ((usage.capacity - usage.used) << 3); -+ u64 fsid; -+ -+ buf->f_type = BCACHEFS_STATFS_MAGIC; -+ buf->f_bsize = sb->s_blocksize; -+ buf->f_blocks = usage.capacity >> shift; -+ buf->f_bfree = usage.free >> shift; -+ buf->f_bavail = avail_factor(usage.free) >> shift; -+ -+ buf->f_files = usage.nr_inodes + avail_inodes; -+ buf->f_ffree = avail_inodes; -+ -+ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ -+ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); -+ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; -+ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; -+ buf->f_namelen = BCH_NAME_MAX; -+ -+ return 0; -+} -+ -+static int bch2_sync_fs(struct super_block *sb, int wait) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ int ret; -+ -+ if (c->opts.journal_flush_disabled) -+ return 0; -+ -+ if (!wait) { -+ bch2_journal_flush_async(&c->journal, NULL); -+ return 0; -+ } -+ -+ ret = bch2_journal_flush(&c->journal); -+ return bch2_err_class(ret); -+} -+ -+static struct bch_fs *bch2_path_to_fs(const char *path) -+{ -+ struct bch_fs *c; -+ dev_t dev; -+ int ret; -+ -+ ret = lookup_bdev(path, &dev); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ c = bch2_dev_to_fs(dev); -+ if (c) -+ closure_put(&c->cl); -+ return c ?: ERR_PTR(-ENOENT); -+} -+ 
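A minimal standalone sketch of the free-inode estimate in the bch2_statfs() hunk above, assuming 512-byte sectors and the 64-bytes-per-inode average its comment mentions; the helper name and sample numbers are illustrative only, not taken from the patch:

#include <stdint.h>
#include <stdio.h>

/* capacity and used are in 512-byte sectors; at roughly 64 bytes per inode,
 * each free sector holds 512 / 64 = 8 inodes, hence the shift by 3 that
 * bch2_statfs() applies to (capacity - used). */
static uint64_t estimate_avail_inodes(uint64_t capacity, uint64_t used)
{
	return (capacity - used) << 3;
}

int main(void)
{
	uint64_t capacity = 4 * 1024 * 1024;	/* 2 GiB expressed in 512-byte sectors */
	uint64_t used     = 2 * 1024 * 1024;	/* half of it in use */

	printf("estimated free inodes: %llu\n",
	       (unsigned long long)estimate_avail_inodes(capacity, used));
	return 0;
}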
-+static char **split_devs(const char *_dev_name, unsigned *nr) -+{ -+ char *dev_name = NULL, **devs = NULL, *s; -+ size_t i, nr_devs = 0; -+ -+ dev_name = kstrdup(_dev_name, GFP_KERNEL); -+ if (!dev_name) -+ return NULL; -+ -+ for (s = dev_name; s; s = strchr(s + 1, ':')) -+ nr_devs++; -+ -+ devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); -+ if (!devs) { -+ kfree(dev_name); -+ return NULL; -+ } -+ -+ for (i = 0, s = dev_name; -+ s; -+ (s = strchr(s, ':')) && (*s++ = '\0')) -+ devs[i++] = s; -+ -+ *nr = nr_devs; -+ return devs; -+} -+ -+static int bch2_remount(struct super_block *sb, int *flags, char *data) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_opts opts = bch2_opts_empty(); -+ int ret; -+ -+ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); -+ -+ ret = bch2_parse_mount_opts(c, &opts, data); -+ if (ret) -+ goto err; -+ -+ if (opts.read_only != c->opts.read_only) { -+ down_write(&c->state_lock); -+ -+ if (opts.read_only) { -+ bch2_fs_read_only(c); -+ -+ sb->s_flags |= SB_RDONLY; -+ } else { -+ ret = bch2_fs_read_write(c); -+ if (ret) { -+ bch_err(c, "error going rw: %i", ret); -+ up_write(&c->state_lock); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ sb->s_flags &= ~SB_RDONLY; -+ } -+ -+ c->opts.read_only = opts.read_only; -+ -+ up_write(&c->state_lock); -+ } -+ -+ if (opts.errors >= 0) -+ c->opts.errors = opts.errors; -+err: -+ return bch2_err_class(ret); -+} -+ -+static int bch2_show_devname(struct seq_file *seq, struct dentry *root) -+{ -+ struct bch_fs *c = root->d_sb->s_fs_info; -+ struct bch_dev *ca; -+ unsigned i; -+ bool first = true; -+ -+ for_each_online_member(ca, c, i) { -+ if (!first) -+ seq_putc(seq, ':'); -+ first = false; -+ seq_puts(seq, "/dev/"); -+ seq_puts(seq, ca->name); -+ } -+ -+ return 0; -+} -+ -+static int bch2_show_options(struct seq_file *seq, struct dentry *root) -+{ -+ struct bch_fs *c = root->d_sb->s_fs_info; -+ enum bch_opt_id i; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ const struct bch_option *opt = &bch2_opt_table[i]; -+ u64 v = bch2_opt_get_by_id(&c->opts, i); -+ -+ if (!(opt->flags & OPT_MOUNT)) -+ continue; -+ -+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) -+ continue; -+ -+ printbuf_reset(&buf); -+ bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, -+ OPT_SHOW_MOUNT_STYLE); -+ seq_putc(seq, ','); -+ seq_puts(seq, buf.buf); -+ } -+ -+ if (buf.allocation_failure) -+ ret = -ENOMEM; -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static void bch2_put_super(struct super_block *sb) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ __bch2_fs_stop(c); -+} -+ -+static const struct super_operations bch_super_operations = { -+ .alloc_inode = bch2_alloc_inode, -+ .destroy_inode = bch2_destroy_inode, -+ .write_inode = bch2_vfs_write_inode, -+ .evict_inode = bch2_evict_inode, -+ .sync_fs = bch2_sync_fs, -+ .statfs = bch2_statfs, -+ .show_devname = bch2_show_devname, -+ .show_options = bch2_show_options, -+ .remount_fs = bch2_remount, -+ .put_super = bch2_put_super, -+#if 0 -+ .freeze_fs = bch2_freeze, -+ .unfreeze_fs = bch2_unfreeze, -+#endif -+}; -+ -+static int bch2_set_super(struct super_block *s, void *data) -+{ -+ s->s_fs_info = data; -+ return 0; -+} -+ -+static int bch2_noset_super(struct super_block *s, void *data) -+{ -+ return -EBUSY; -+} -+ -+static int bch2_test_super(struct super_block *s, void *data) -+{ -+ struct bch_fs *c = s->s_fs_info; -+ struct bch_fs **devs = data; -+ unsigned i; -+ -+ if (!c) -+ return false; -+ -+ for (i = 0; devs[i]; i++) -+ if (c != 
devs[i]) -+ return false; -+ return true; -+} -+ -+static struct dentry *bch2_mount(struct file_system_type *fs_type, -+ int flags, const char *dev_name, void *data) -+{ -+ struct bch_fs *c; -+ struct bch_dev *ca; -+ struct super_block *sb; -+ struct inode *vinode; -+ struct bch_opts opts = bch2_opts_empty(); -+ char **devs; -+ struct bch_fs **devs_to_fs = NULL; -+ unsigned i, nr_devs; -+ int ret; -+ -+ opt_set(opts, read_only, (flags & SB_RDONLY) != 0); -+ -+ ret = bch2_parse_mount_opts(NULL, &opts, data); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ if (!dev_name || strlen(dev_name) == 0) -+ return ERR_PTR(-EINVAL); -+ -+ devs = split_devs(dev_name, &nr_devs); -+ if (!devs) -+ return ERR_PTR(-ENOMEM); -+ -+ devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); -+ if (!devs_to_fs) { -+ sb = ERR_PTR(-ENOMEM); -+ goto got_sb; -+ } -+ -+ for (i = 0; i < nr_devs; i++) -+ devs_to_fs[i] = bch2_path_to_fs(devs[i]); -+ -+ sb = sget(fs_type, bch2_test_super, bch2_noset_super, -+ flags|SB_NOSEC, devs_to_fs); -+ if (!IS_ERR(sb)) -+ goto got_sb; -+ -+ c = bch2_fs_open(devs, nr_devs, opts); -+ if (IS_ERR(c)) { -+ sb = ERR_CAST(c); -+ goto got_sb; -+ } -+ -+ /* Some options can't be parsed until after the fs is started: */ -+ ret = bch2_parse_mount_opts(c, &opts, data); -+ if (ret) { -+ bch2_fs_stop(c); -+ sb = ERR_PTR(ret); -+ goto got_sb; -+ } -+ -+ bch2_opts_apply(&c->opts, opts); -+ -+ sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); -+ if (IS_ERR(sb)) -+ bch2_fs_stop(c); -+got_sb: -+ kfree(devs_to_fs); -+ kfree(devs[0]); -+ kfree(devs); -+ -+ if (IS_ERR(sb)) { -+ ret = PTR_ERR(sb); -+ ret = bch2_err_class(ret); -+ return ERR_PTR(ret); -+ } -+ -+ c = sb->s_fs_info; -+ -+ if (sb->s_root) { -+ if ((flags ^ sb->s_flags) & SB_RDONLY) { -+ ret = -EBUSY; -+ goto err_put_super; -+ } -+ goto out; -+ } -+ -+ sb->s_blocksize = block_bytes(c); -+ sb->s_blocksize_bits = ilog2(block_bytes(c)); -+ sb->s_maxbytes = MAX_LFS_FILESIZE; -+ sb->s_op = &bch_super_operations; -+ sb->s_export_op = &bch_export_ops; -+#ifdef CONFIG_BCACHEFS_QUOTA -+ sb->s_qcop = &bch2_quotactl_operations; -+ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; -+#endif -+ sb->s_xattr = bch2_xattr_handlers; -+ sb->s_magic = BCACHEFS_STATFS_MAGIC; -+ sb->s_time_gran = c->sb.nsec_per_time_unit; -+ sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; -+ sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); -+ c->vfs_sb = sb; -+ strscpy(sb->s_id, c->name, sizeof(sb->s_id)); -+ -+ ret = super_setup_bdi(sb); -+ if (ret) -+ goto err_put_super; -+ -+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; -+ -+ for_each_online_member(ca, c, i) { -+ struct block_device *bdev = ca->disk_sb.bdev; -+ -+ /* XXX: create an anonymous device for multi device filesystems */ -+ sb->s_bdev = bdev; -+ sb->s_dev = bdev->bd_dev; -+ percpu_ref_put(&ca->io_ref); -+ break; -+ } -+ -+ c->dev = sb->s_dev; -+ -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ if (c->opts.acl) -+ sb->s_flags |= SB_POSIXACL; -+#endif -+ -+ sb->s_shrink.seeks = 0; -+ -+ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); -+ ret = PTR_ERR_OR_ZERO(vinode); -+ if (ret) { -+ bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret)); -+ goto err_put_super; -+ } -+ -+ sb->s_root = d_make_root(vinode); -+ if (!sb->s_root) { -+ bch_err(c, "error mounting: error allocating root dentry"); -+ ret = -ENOMEM; -+ goto err_put_super; -+ } -+ -+ sb->s_flags |= SB_ACTIVE; -+out: -+ return dget(sb->s_root); -+ -+err_put_super: -+ sb->s_fs_info = NULL; -+ 
c->vfs_sb = NULL; -+ deactivate_locked_super(sb); -+ bch2_fs_stop(c); -+ return ERR_PTR(bch2_err_class(ret)); -+} -+ -+static void bch2_kill_sb(struct super_block *sb) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ if (c) -+ c->vfs_sb = NULL; -+ generic_shutdown_super(sb); -+ if (c) -+ bch2_fs_free(c); -+} -+ -+static struct file_system_type bcache_fs_type = { -+ .owner = THIS_MODULE, -+ .name = "bcachefs", -+ .mount = bch2_mount, -+ .kill_sb = bch2_kill_sb, -+ .fs_flags = FS_REQUIRES_DEV, -+}; -+ -+MODULE_ALIAS_FS("bcachefs"); -+ -+void bch2_vfs_exit(void) -+{ -+ unregister_filesystem(&bcache_fs_type); -+ kmem_cache_destroy(bch2_inode_cache); -+} -+ -+int __init bch2_vfs_init(void) -+{ -+ int ret = -ENOMEM; -+ -+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); -+ if (!bch2_inode_cache) -+ goto err; -+ -+ ret = register_filesystem(&bcache_fs_type); -+ if (ret) -+ goto err; -+ -+ return 0; -+err: -+ bch2_vfs_exit(); -+ return ret; -+} -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h -new file mode 100644 -index 000000000..10e11119d ---- /dev/null -+++ b/fs/bcachefs/fs.h -@@ -0,0 +1,209 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_H -+#define _BCACHEFS_FS_H -+ -+#include "inode.h" -+#include "opts.h" -+#include "str_hash.h" -+#include "quota_types.h" -+#include "two_state_shared_lock.h" -+ -+#include -+#include -+ -+struct bch_inode_info { -+ struct inode v; -+ struct list_head ei_vfs_inode_list; -+ unsigned long ei_flags; -+ -+ struct mutex ei_update_lock; -+ u64 ei_quota_reserved; -+ unsigned long ei_last_dirtied; -+ two_state_lock_t ei_pagecache_lock; -+ -+ struct mutex ei_quota_lock; -+ struct bch_qid ei_qid; -+ -+ u32 ei_subvol; -+ -+ /* -+ * When we've been doing nocow writes we'll need to issue flushes to the -+ * underlying block devices -+ * -+ * XXX: a device may have had a flush issued by some other codepath. It -+ * would be better to keep for each device a sequence number that's -+ * incremented when we isusue a cache flush, and track here the sequence -+ * number that needs flushing. -+ */ -+ struct bch_devs_mask ei_devs_need_flush; -+ -+ /* copy of inode in btree: */ -+ struct bch_inode_unpacked ei_inode; -+}; -+ -+#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0) -+#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0) -+#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0) -+ -+#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1) -+#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1) -+ -+static inline subvol_inum inode_inum(struct bch_inode_info *inode) -+{ -+ return (subvol_inum) { -+ .subvol = inode->ei_subvol, -+ .inum = inode->ei_inode.bi_inum, -+ }; -+} -+ -+/* -+ * Set if we've gotten a btree error for this inode, and thus the vfs inode and -+ * btree inode may be inconsistent: -+ */ -+#define EI_INODE_ERROR 0 -+ -+/* -+ * Set in the inode is in a snapshot subvolume - we don't do quota accounting in -+ * those: -+ */ -+#define EI_INODE_SNAPSHOT 1 -+ -+#define to_bch_ei(_inode) \ -+ container_of_or_null(_inode, struct bch_inode_info, v) -+ -+static inline int ptrcmp(void *l, void *r) -+{ -+ return cmp_int(l, r); -+} -+ -+enum bch_inode_lock_op { -+ INODE_LOCK = (1U << 0), -+ INODE_PAGECACHE_BLOCK = (1U << 1), -+ INODE_UPDATE_LOCK = (1U << 2), -+}; -+ -+#define bch2_lock_inodes(_locks, ...) 
\ -+do { \ -+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ -+ unsigned i; \ -+ \ -+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ -+ \ -+ for (i = 1; i < ARRAY_SIZE(a); i++) \ -+ if (a[i] != a[i - 1]) { \ -+ if ((_locks) & INODE_LOCK) \ -+ down_write_nested(&a[i]->v.i_rwsem, i); \ -+ if ((_locks) & INODE_PAGECACHE_BLOCK) \ -+ bch2_pagecache_block_get(a[i]);\ -+ if ((_locks) & INODE_UPDATE_LOCK) \ -+ mutex_lock_nested(&a[i]->ei_update_lock, i);\ -+ } \ -+} while (0) -+ -+#define bch2_unlock_inodes(_locks, ...) \ -+do { \ -+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ -+ unsigned i; \ -+ \ -+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ -+ \ -+ for (i = 1; i < ARRAY_SIZE(a); i++) \ -+ if (a[i] != a[i - 1]) { \ -+ if ((_locks) & INODE_LOCK) \ -+ up_write(&a[i]->v.i_rwsem); \ -+ if ((_locks) & INODE_PAGECACHE_BLOCK) \ -+ bch2_pagecache_block_put(a[i]);\ -+ if ((_locks) & INODE_UPDATE_LOCK) \ -+ mutex_unlock(&a[i]->ei_update_lock); \ -+ } \ -+} while (0) -+ -+static inline struct bch_inode_info *file_bch_inode(struct file *file) -+{ -+ return to_bch_ei(file_inode(file)); -+} -+ -+static inline bool inode_attr_changing(struct bch_inode_info *dir, -+ struct bch_inode_info *inode, -+ enum inode_opt_id id) -+{ -+ return !(inode->ei_inode.bi_fields_set & (1 << id)) && -+ bch2_inode_opt_get(&dir->ei_inode, id) != -+ bch2_inode_opt_get(&inode->ei_inode, id); -+} -+ -+static inline bool inode_attrs_changing(struct bch_inode_info *dir, -+ struct bch_inode_info *inode) -+{ -+ unsigned id; -+ -+ for (id = 0; id < Inode_opt_nr; id++) -+ if (inode_attr_changing(dir, inode, id)) -+ return true; -+ -+ return false; -+} -+ -+struct bch_inode_unpacked; -+ -+#ifndef NO_BCACHEFS_FS -+ -+struct bch_inode_info * -+__bch2_create(struct mnt_idmap *, struct bch_inode_info *, -+ struct dentry *, umode_t, dev_t, subvol_inum, unsigned); -+ -+int bch2_fs_quota_transfer(struct bch_fs *, -+ struct bch_inode_info *, -+ struct bch_qid, -+ unsigned, -+ enum quota_acct_mode); -+ -+static inline int bch2_set_projid(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ u32 projid) -+{ -+ struct bch_qid qid = inode->ei_qid; -+ -+ qid.q[QTYP_PRJ] = projid; -+ -+ return bch2_fs_quota_transfer(c, inode, qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+} -+ -+struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); -+ -+/* returns 0 if we want to do the update, or error is passed up */ -+typedef int (*inode_set_fn)(struct btree_trans *, -+ struct bch_inode_info *, -+ struct bch_inode_unpacked *, void *); -+ -+void bch2_inode_update_after_write(struct btree_trans *, -+ struct bch_inode_info *, -+ struct bch_inode_unpacked *, -+ unsigned); -+int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, -+ inode_set_fn, void *, unsigned); -+ -+int bch2_setattr_nonsize(struct mnt_idmap *, -+ struct bch_inode_info *, -+ struct iattr *); -+int __bch2_unlink(struct inode *, struct dentry *, bool); -+ -+void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); -+ -+void bch2_vfs_exit(void); -+int bch2_vfs_init(void); -+ -+#else -+ -+#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) do {} while (0) -+ -+static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, -+ snapshot_id_list *s) {} -+static inline void bch2_vfs_exit(void) {} -+static inline int bch2_vfs_init(void) { return 0; } -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+#endif /* _BCACHEFS_FS_H */ -diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c -new file mode 100644 -index 000000000..238caeeaf 
---- /dev/null -+++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,2483 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_buf.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "darray.h" -+#include "dirent.h" -+#include "error.h" -+#include "fs-common.h" -+#include "fsck.h" -+#include "inode.h" -+#include "keylist.h" -+#include "recovery.h" -+#include "snapshot.h" -+#include "super.h" -+#include "xattr.h" -+ -+#include -+#include /* struct qstr */ -+ -+#define QSTR(n) { { { .len = strlen(n) } }, .name = n } -+ -+/* -+ * XXX: this is handling transaction restarts without returning -+ * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: -+ */ -+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, -+ u32 snapshot) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u64 sectors = 0; -+ int ret; -+ -+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents, -+ SPOS(inum, 0, snapshot), -+ POS(inum, U64_MAX), -+ 0, k, ret) -+ if (bkey_extent_is_allocation(k.k)) -+ sectors += k.k->size; -+ -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret ?: sectors; -+} -+ -+static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, -+ u32 snapshot) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent d; -+ u64 subdirs = 0; -+ int ret; -+ -+ for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, -+ SPOS(inum, 0, snapshot), -+ POS(inum, U64_MAX), -+ 0, k, ret) { -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ d = bkey_s_c_to_dirent(k); -+ if (d.v->d_type == DT_DIR) -+ subdirs++; -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret ?: subdirs; -+} -+ -+static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, -+ u32 *subvol) -+{ -+ struct bch_snapshot s; -+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, -+ POS(0, snapshot), 0, -+ snapshot, &s); -+ if (!ret) -+ *subvol = le32_to_cpu(s.subvol); -+ else if (bch2_err_matches(ret, ENOENT)) -+ bch_err(trans->c, "snapshot %u not fonud", snapshot); -+ return ret; -+ -+} -+ -+static int __subvol_lookup(struct btree_trans *trans, u32 subvol, -+ u32 *snapshot, u64 *inum) -+{ -+ struct bch_subvolume s; -+ int ret; -+ -+ ret = bch2_subvolume_get(trans, subvol, false, 0, &s); -+ -+ *snapshot = le32_to_cpu(s.snapshot); -+ *inum = le64_to_cpu(s.inode); -+ return ret; -+} -+ -+static int subvol_lookup(struct btree_trans *trans, u32 subvol, -+ u32 *snapshot, u64 *inum) -+{ -+ return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); -+} -+ -+static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, -+ struct bch_inode_unpacked *inode) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, -+ POS(0, inode_nr), -+ BTREE_ITER_ALL_SNAPSHOTS); -+ k = bch2_btree_iter_peek(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { -+ ret = -BCH_ERR_ENOENT_inode; -+ goto err; -+ } -+ -+ ret = bch2_inode_unpack(k, inode); -+err: -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(trans->c, "error fetching inode %llu: %s", -+ inode_nr, bch2_err_str(ret)); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, -+ struct bch_inode_unpacked *inode, -+ u32 *snapshot) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ k = bch2_bkey_get_iter(trans, &iter, 
BTREE_ID_inodes, -+ SPOS(0, inode_nr, *snapshot), 0); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ ret = bkey_is_inode(k.k) -+ ? bch2_inode_unpack(k, inode) -+ : -BCH_ERR_ENOENT_inode; -+ if (!ret) -+ *snapshot = iter.pos.snapshot; -+err: -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(trans->c, "error fetching inode %llu:%u: %s", -+ inode_nr, *snapshot, bch2_err_str(ret)); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int lookup_inode(struct btree_trans *trans, u64 inode_nr, -+ struct bch_inode_unpacked *inode, -+ u32 *snapshot) -+{ -+ return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); -+} -+ -+static int __lookup_dirent(struct btree_trans *trans, -+ struct bch_hash_info hash_info, -+ subvol_inum dir, struct qstr *name, -+ u64 *target, unsigned *type) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c_dirent d; -+ int ret; -+ -+ ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, -+ &hash_info, dir, name, 0); -+ if (ret) -+ return ret; -+ -+ d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); -+ *target = le64_to_cpu(d.v->d_inum); -+ *type = d.v->d_type; -+ bch2_trans_iter_exit(trans, &iter); -+ return 0; -+} -+ -+static int __write_inode(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode, -+ u32 snapshot) -+{ -+ struct bkey_inode_buf *inode_p = -+ bch2_trans_kmalloc(trans, sizeof(*inode_p)); -+ -+ if (IS_ERR(inode_p)) -+ return PTR_ERR(inode_p); -+ -+ bch2_inode_pack(inode_p, inode); -+ inode_p->inode.k.p.snapshot = snapshot; -+ -+ return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, -+ &inode_p->inode.k_i, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+} -+ -+static int write_inode(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode, -+ u32 snapshot) -+{ -+ int ret = commit_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ __write_inode(trans, inode, snapshot)); -+ if (ret) -+ bch_err(trans->c, "error in fsck: error updating inode: %s", -+ bch2_err_str(ret)); -+ return ret; -+} -+ -+static int __remove_dirent(struct btree_trans *trans, struct bpos pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bch_inode_unpacked dir_inode; -+ struct bch_hash_info dir_hash_info; -+ int ret; -+ -+ ret = lookup_first_inode(trans, pos.inode, &dir_inode); -+ if (ret) -+ goto err; -+ -+ dir_hash_info = bch2_hash_info_init(c, &dir_inode); -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); -+ -+ ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, -+ &dir_hash_info, &iter, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ bch2_trans_iter_exit(trans, &iter); -+err: -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* Get lost+found, create if it doesn't exist: */ -+static int lookup_lostfound(struct btree_trans *trans, u32 subvol, -+ struct bch_inode_unpacked *lostfound) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_inode_unpacked root; -+ struct bch_hash_info root_hash_info; -+ struct qstr lostfound_str = QSTR("lost+found"); -+ subvol_inum root_inum = { .subvol = subvol }; -+ u64 inum = 0; -+ unsigned d_type = 0; -+ u32 snapshot; -+ int ret; -+ -+ ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); -+ if (ret) -+ return ret; -+ -+ ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); -+ if (ret) -+ return ret; -+ -+ root_hash_info = bch2_hash_info_init(c, &root); -+ -+ ret = __lookup_dirent(trans, 
root_hash_info, root_inum, -+ &lostfound_str, &inum, &d_type); -+ if (bch2_err_matches(ret, ENOENT)) { -+ bch_notice(c, "creating lost+found"); -+ goto create_lostfound; -+ } -+ -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); -+ if (ret) -+ return ret; -+ -+ if (d_type != DT_DIR) { -+ bch_err(c, "error looking up lost+found: not a directory"); -+ return ret; -+ } -+ -+ /* -+ * The bch2_check_dirents pass has already run, dangling dirents -+ * shouldn't exist here: -+ */ -+ return __lookup_inode(trans, inum, lostfound, &snapshot); -+ -+create_lostfound: -+ bch2_inode_init_early(c, lostfound); -+ -+ ret = bch2_create_trans(trans, root_inum, &root, -+ lostfound, &lostfound_str, -+ 0, 0, S_IFDIR|0700, 0, NULL, NULL, -+ (subvol_inum) { }, 0); -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); -+ return ret; -+} -+ -+static int __reattach_inode(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode, -+ u32 inode_snapshot) -+{ -+ struct bch_hash_info dir_hash; -+ struct bch_inode_unpacked lostfound; -+ char name_buf[20]; -+ struct qstr name; -+ u64 dir_offset = 0; -+ u32 subvol; -+ int ret; -+ -+ ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); -+ if (ret) -+ return ret; -+ -+ ret = lookup_lostfound(trans, subvol, &lostfound); -+ if (ret) -+ return ret; -+ -+ if (S_ISDIR(inode->bi_mode)) { -+ lostfound.bi_nlink++; -+ -+ ret = __write_inode(trans, &lostfound, U32_MAX); -+ if (ret) -+ return ret; -+ } -+ -+ dir_hash = bch2_hash_info_init(trans->c, &lostfound); -+ -+ snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); -+ name = (struct qstr) QSTR(name_buf); -+ -+ ret = bch2_dirent_create(trans, -+ (subvol_inum) { -+ .subvol = subvol, -+ .inum = lostfound.bi_inum, -+ }, -+ &dir_hash, -+ inode_d_type(inode), -+ &name, inode->bi_inum, &dir_offset, -+ BCH_HASH_SET_MUST_CREATE); -+ if (ret) -+ return ret; -+ -+ inode->bi_dir = lostfound.bi_inum; -+ inode->bi_dir_offset = dir_offset; -+ -+ return __write_inode(trans, inode, inode_snapshot); -+} -+ -+static int reattach_inode(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode, -+ u32 inode_snapshot) -+{ -+ int ret = commit_do(trans, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL, -+ __reattach_inode(trans, inode, inode_snapshot)); -+ if (ret) { -+ bch_err(trans->c, "error reattaching inode %llu: %s", -+ inode->bi_inum, bch2_err_str(ret)); -+ return ret; -+ } -+ -+ return ret; -+} -+ -+static int remove_backpointer(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c_dirent d; -+ int ret; -+ -+ d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, -+ POS(inode->bi_dir, inode->bi_dir_offset), 0, -+ dirent); -+ ret = bkey_err(d) ?: -+ __remove_dirent(trans, d.k->p); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+struct snapshots_seen_entry { -+ u32 id; -+ u32 equiv; -+}; -+ -+struct snapshots_seen { -+ struct bpos pos; -+ DARRAY(struct snapshots_seen_entry) ids; -+}; -+ -+static inline void snapshots_seen_exit(struct snapshots_seen *s) -+{ -+ darray_exit(&s->ids); -+} -+ -+static inline void snapshots_seen_init(struct snapshots_seen *s) -+{ -+ memset(s, 0, sizeof(*s)); -+} -+ -+static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s, u32 id) -+{ -+ struct snapshots_seen_entry *i, n = { -+ .id = id, -+ .equiv = 
bch2_snapshot_equiv(c, id), -+ }; -+ int ret = 0; -+ -+ darray_for_each(s->ids, i) { -+ if (i->id == id) -+ return 0; -+ if (i->id > id) -+ break; -+ } -+ -+ ret = darray_insert_item(&s->ids, i - s->ids.data, n); -+ if (ret) -+ bch_err(c, "error reallocating snapshots_seen table (size %zu)", -+ s->ids.size); -+ return ret; -+} -+ -+static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, -+ enum btree_id btree_id, struct bpos pos) -+{ -+ struct snapshots_seen_entry *i, n = { -+ .id = pos.snapshot, -+ .equiv = bch2_snapshot_equiv(c, pos.snapshot), -+ }; -+ int ret = 0; -+ -+ if (!bkey_eq(s->pos, pos)) -+ s->ids.nr = 0; -+ -+ s->pos = pos; -+ s->pos.snapshot = n.equiv; -+ -+ darray_for_each(s->ids, i) { -+ if (i->id == n.id) -+ return 0; -+ -+ /* -+ * We currently don't rigorously track for snapshot cleanup -+ * needing to be run, so it shouldn't be a fsck error yet: -+ */ -+ if (i->equiv == n.equiv) { -+ bch_err(c, "snapshot deletion did not finish:\n" -+ " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", -+ bch2_btree_ids[btree_id], -+ pos.inode, pos.offset, -+ i->id, n.id, n.equiv); -+ return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots); -+ } -+ } -+ -+ ret = darray_push(&s->ids, n); -+ if (ret) -+ bch_err(c, "error reallocating snapshots_seen table (size %zu)", -+ s->ids.size); -+ return ret; -+} -+ -+/** -+ * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, -+ * and @ancestor hasn't been overwritten in @seen -+ * -+ * That is, returns whether key in @ancestor snapshot is visible in @id snapshot -+ */ -+static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, -+ u32 id, u32 ancestor) -+{ -+ ssize_t i; -+ -+ EBUG_ON(id > ancestor); -+ EBUG_ON(!bch2_snapshot_is_equiv(c, id)); -+ EBUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); -+ -+ /* @ancestor should be the snapshot most recently added to @seen */ -+ EBUG_ON(ancestor != seen->pos.snapshot); -+ EBUG_ON(ancestor != seen->ids.data[seen->ids.nr - 1].equiv); -+ -+ if (id == ancestor) -+ return true; -+ -+ if (!bch2_snapshot_is_ancestor(c, id, ancestor)) -+ return false; -+ -+ /* -+ * We know that @id is a descendant of @ancestor, we're checking if -+ * we've seen a key that overwrote @ancestor - i.e. also a descendent of -+ * @ascestor and with @id as a descendent. -+ * -+ * But we already know that we're scanning IDs between @id and @ancestor -+ * numerically, since snapshot ID lists are kept sorted, so if we find -+ * an id that's an ancestor of @id we're done: -+ */ -+ -+ for (i = seen->ids.nr - 2; -+ i >= 0 && seen->ids.data[i].equiv >= id; -+ --i) -+ if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv)) -+ return false; -+ -+ return true; -+} -+ -+/** -+ * ref_visible - given a key with snapshot id @src that points to a key with -+ * snapshot id @dst, test whether there is some snapshot in which @dst is -+ * visible. -+ * -+ * This assumes we're visiting @src keys in natural key order. -+ * -+ * @s - list of snapshot IDs already seen at @src -+ * @src - snapshot ID of src key -+ * @dst - snapshot ID of dst key -+ */ -+static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, -+ u32 src, u32 dst) -+{ -+ return dst <= src -+ ? 
key_visible_in_snapshot(c, s, dst, src) -+ : bch2_snapshot_is_ancestor(c, src, dst); -+} -+ -+static int ref_visible2(struct bch_fs *c, -+ u32 src, struct snapshots_seen *src_seen, -+ u32 dst, struct snapshots_seen *dst_seen) -+{ -+ src = bch2_snapshot_equiv(c, src); -+ dst = bch2_snapshot_equiv(c, dst); -+ -+ if (dst > src) { -+ swap(dst, src); -+ swap(dst_seen, src_seen); -+ } -+ return key_visible_in_snapshot(c, src_seen, dst, src); -+} -+ -+#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ -+ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ -+ (_i)->snapshot <= (_snapshot); _i++) \ -+ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) -+ -+struct inode_walker_entry { -+ struct bch_inode_unpacked inode; -+ u32 snapshot; -+ bool seen_this_pos; -+ u64 count; -+}; -+ -+struct inode_walker { -+ bool first_this_inode; -+ bool recalculate_sums; -+ struct bpos last_pos; -+ -+ DARRAY(struct inode_walker_entry) inodes; -+}; -+ -+static void inode_walker_exit(struct inode_walker *w) -+{ -+ darray_exit(&w->inodes); -+} -+ -+static struct inode_walker inode_walker_init(void) -+{ -+ return (struct inode_walker) { 0, }; -+} -+ -+static int add_inode(struct bch_fs *c, struct inode_walker *w, -+ struct bkey_s_c inode) -+{ -+ struct bch_inode_unpacked u; -+ -+ BUG_ON(bch2_inode_unpack(inode, &u)); -+ -+ return darray_push(&w->inodes, ((struct inode_walker_entry) { -+ .inode = u, -+ .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), -+ })); -+} -+ -+static int get_inodes_all_snapshots(struct btree_trans *trans, -+ struct inode_walker *w, u64 inum) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u32 restart_count = trans->restart_count; -+ int ret; -+ -+ w->recalculate_sums = false; -+ w->inodes.nr = 0; -+ -+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), -+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ if (k.k->p.offset != inum) -+ break; -+ -+ if (bkey_is_inode(k.k)) -+ add_inode(c, w, k); -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (ret) -+ return ret; -+ -+ w->first_this_inode = true; -+ -+ if (trans_was_restarted(trans, restart_count)) -+ return -BCH_ERR_transaction_restart_nested; -+ -+ return 0; -+} -+ -+static struct inode_walker_entry * -+lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, -+ u32 snapshot, bool is_whiteout) -+{ -+ struct inode_walker_entry *i; -+ -+ snapshot = bch2_snapshot_equiv(c, snapshot); -+ -+ darray_for_each(w->inodes, i) -+ if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) -+ goto found; -+ -+ return NULL; -+found: -+ BUG_ON(snapshot > i->snapshot); -+ -+ if (snapshot != i->snapshot && !is_whiteout) { -+ struct inode_walker_entry new = *i; -+ size_t pos; -+ int ret; -+ -+ new.snapshot = snapshot; -+ new.count = 0; -+ -+ bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", -+ w->last_pos.inode, snapshot, i->snapshot); -+ -+ while (i > w->inodes.data && i[-1].snapshot > snapshot) -+ --i; -+ -+ pos = i - w->inodes.data; -+ ret = darray_insert_item(&w->inodes, pos, new); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ i = w->inodes.data + pos; -+ } -+ -+ return i; -+} -+ -+static struct inode_walker_entry *walk_inode(struct btree_trans *trans, -+ struct inode_walker *w, struct bpos pos, -+ bool is_whiteout) -+{ -+ if (w->last_pos.inode != pos.inode) { -+ int ret = get_inodes_all_snapshots(trans, w, pos.inode); -+ if (ret) -+ return ERR_PTR(ret); -+ } else if (bkey_cmp(w->last_pos, pos)) { -+ struct inode_walker_entry *i; 
-+ -+ darray_for_each(w->inodes, i) -+ i->seen_this_pos = false; -+ -+ } -+ -+ w->last_pos = pos; -+ -+ return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); -+} -+ -+static int __get_visible_inodes(struct btree_trans *trans, -+ struct inode_walker *w, -+ struct snapshots_seen *s, -+ u64 inum) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ w->inodes.nr = 0; -+ -+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), -+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); -+ -+ if (k.k->p.offset != inum) -+ break; -+ -+ if (!ref_visible(c, s, s->pos.snapshot, equiv)) -+ continue; -+ -+ if (bkey_is_inode(k.k)) -+ add_inode(c, w, k); -+ -+ if (equiv >= s->pos.snapshot) -+ break; -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+static int check_key_has_snapshot(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, -+ "key in missing snapshot: %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) -+ ret = bch2_btree_delete_at(trans, iter, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int hash_redo_key(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct bch_hash_info *hash_info, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ struct bkey_i *delete; -+ struct bkey_i *tmp; -+ -+ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); -+ if (IS_ERR(delete)) -+ return PTR_ERR(delete); -+ -+ tmp = bch2_bkey_make_mut_noupdate(trans, k); -+ if (IS_ERR(tmp)) -+ return PTR_ERR(tmp); -+ -+ bkey_init(&delete->k); -+ delete->k.p = k_iter->pos; -+ return bch2_btree_iter_traverse(k_iter) ?: -+ bch2_trans_update(trans, k_iter, delete, 0) ?: -+ bch2_hash_set_snapshot(trans, desc, hash_info, -+ (subvol_inum) { 0, k.k->p.inode }, -+ k.k->p.snapshot, tmp, -+ BCH_HASH_SET_MUST_CREATE, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW); -+} -+ -+static int hash_check_key(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct bch_hash_info *hash_info, -+ struct btree_iter *k_iter, struct bkey_s_c hash_k) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter = { NULL }; -+ struct printbuf buf = PRINTBUF; -+ struct bkey_s_c k; -+ u64 hash; -+ int ret = 0; -+ -+ if (hash_k.k->type != desc.key_type) -+ return 0; -+ -+ hash = desc.hash_bkey(hash_info, hash_k); -+ -+ if (likely(hash == hash_k.k->p.offset)) -+ return 0; -+ -+ if (hash_k.k->p.offset < hash) -+ goto bad_hash; -+ -+ for_each_btree_key_norestart(trans, iter, desc.btree_id, -+ SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), -+ BTREE_ITER_SLOTS, k, ret) { -+ if (bkey_eq(k.k->p, hash_k.k->p)) -+ break; -+ -+ if (fsck_err_on(k.k->type == desc.key_type && -+ !desc.cmp_bkey(k, hash_k), c, -+ "duplicate hash table keys:\n%s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, hash_k), -+ buf.buf))) { -+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; -+ break; -+ } -+ -+ if (bkey_deleted(k.k)) { -+ bch2_trans_iter_exit(trans, &iter); -+ goto bad_hash; -+ } -+ } -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ printbuf_exit(&buf); -+ return ret; -+bad_hash: -+ if (fsck_err(c, "hash table key at wrong 
offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", -+ bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { -+ ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); -+ if (ret) -+ return ret; -+ ret = -BCH_ERR_transaction_restart_nested; -+ } -+fsck_err: -+ goto out; -+} -+ -+static int check_inode(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct bch_inode_unpacked *prev, -+ struct snapshots_seen *s, -+ bool full) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_inode_unpacked u; -+ bool do_update = false; -+ int ret; -+ -+ ret = check_key_has_snapshot(trans, iter, k); -+ if (ret < 0) -+ goto err; -+ if (ret) -+ return 0; -+ -+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); -+ if (ret) -+ goto err; -+ -+ if (!bkey_is_inode(k.k)) -+ return 0; -+ -+ BUG_ON(bch2_inode_unpack(k, &u)); -+ -+ if (!full && -+ !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| -+ BCH_INODE_I_SECTORS_DIRTY| -+ BCH_INODE_UNLINKED))) -+ return 0; -+ -+ if (prev->bi_inum != u.bi_inum) -+ *prev = u; -+ -+ if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || -+ inode_d_type(prev) != inode_d_type(&u), c, -+ "inodes in different snapshots don't match")) { -+ bch_err(c, "repair not implemented yet"); -+ return -EINVAL; -+ } -+ -+ if ((u.bi_flags & (BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED)) && -+ bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) { -+ struct bpos new_min_pos; -+ -+ ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos); -+ if (ret) -+ goto err; -+ -+ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY|BCH_INODE_UNLINKED; -+ -+ ret = __write_inode(trans, &u, iter->pos.snapshot); -+ if (ret) { -+ bch_err_msg(c, ret, "in fsck: error updating inode"); -+ return ret; -+ } -+ -+ if (!bpos_eq(new_min_pos, POS_MIN)) -+ bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos)); -+ return 0; -+ } -+ -+ if (u.bi_flags & BCH_INODE_UNLINKED && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", -+ u.bi_inum))) { -+ bch2_trans_unlock(trans); -+ bch2_fs_lazy_rw(c); -+ -+ ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot); -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error in fsck: error while deleting inode: %s", -+ bch2_err_str(ret)); -+ return ret; -+ } -+ -+ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", -+ u.bi_inum))) { -+ bch_verbose(c, "truncating inode %llu", u.bi_inum); -+ -+ bch2_trans_unlock(trans); -+ bch2_fs_lazy_rw(c); -+ -+ /* -+ * XXX: need to truncate partial blocks too here - or ideally -+ * just switch units to bytes and that issue goes away -+ */ -+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, -+ SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, -+ iter->pos.snapshot), -+ POS(u.bi_inum, U64_MAX), -+ 0, NULL); -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err(c, "error in fsck: error truncating inode: %s", -+ bch2_err_str(ret)); -+ if (ret) -+ return ret; -+ -+ /* -+ * We truncated without our normal sector accounting hook, just -+ * make sure we recalculate it: -+ */ -+ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; -+ -+ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; -+ 
do_update = true; -+ } -+ -+ if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", -+ u.bi_inum))) { -+ s64 sectors; -+ -+ bch_verbose(c, "recounting sectors for inode %llu", -+ u.bi_inum); -+ -+ sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); -+ if (sectors < 0) { -+ bch_err(c, "error in fsck: error recounting inode sectors: %s", -+ bch2_err_str(sectors)); -+ return sectors; -+ } -+ -+ u.bi_sectors = sectors; -+ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; -+ do_update = true; -+ } -+ -+ if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) { -+ u.bi_dir = 0; -+ u.bi_dir_offset = 0; -+ u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; -+ do_update = true; -+ } -+ -+ if (do_update) { -+ ret = __write_inode(trans, &u, iter->pos.snapshot); -+ if (ret) { -+ bch_err_msg(c, ret, "in fsck: error updating inode"); -+ return ret; -+ } -+ } -+err: -+fsck_err: -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+noinline_for_stack -+int bch2_check_inodes(struct bch_fs *c) -+{ -+ bool full = c->opts.fsck; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bch_inode_unpacked prev = { 0 }; -+ struct snapshots_seen s; -+ struct bkey_s_c k; -+ int ret; -+ -+ snapshots_seen_init(&s); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, -+ POS_MIN, -+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_inode(&trans, &iter, k, &prev, &s, full)); -+ -+ bch2_trans_exit(&trans); -+ snapshots_seen_exit(&s); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos pos) -+{ -+ return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -+} -+ -+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, -+ struct bkey_s_c_dirent d) -+{ -+ return inode->bi_dir == d.k->p.inode && -+ inode->bi_dir_offset == d.k->p.offset; -+} -+ -+static bool dirent_points_to_inode(struct bkey_s_c_dirent d, -+ struct bch_inode_unpacked *inode) -+{ -+ return d.v->d_type == DT_SUBVOL -+ ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol -+ : le64_to_cpu(d.v->d_inum) == inode->bi_inum; -+} -+ -+static int inode_backpointer_exists(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode, -+ u32 snapshot) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c_dirent d; -+ int ret; -+ -+ d = dirent_get_by_pos(trans, &iter, -+ SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); -+ ret = bkey_err(d); -+ if (ret) -+ return bch2_err_matches(ret, ENOENT) ? 
0 : ret; -+ -+ ret = dirent_points_to_inode(d, inode); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) -+{ -+ struct bch_fs *c = trans->c; -+ struct inode_walker_entry *i; -+ u32 restart_count = trans->restart_count; -+ int ret = 0; -+ s64 count2; -+ -+ darray_for_each(w->inodes, i) { -+ if (i->inode.bi_sectors == i->count) -+ continue; -+ -+ count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); -+ -+ if (w->recalculate_sums) -+ i->count = count2; -+ -+ if (i->count != count2) { -+ bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", -+ w->last_pos.inode, i->snapshot, i->count, count2); -+ return -BCH_ERR_internal_fsck_err; -+ } -+ -+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, -+ "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", -+ w->last_pos.inode, i->snapshot, -+ i->inode.bi_sectors, i->count)) { -+ i->inode.bi_sectors = i->count; -+ ret = write_inode(trans, &i->inode, i->snapshot); -+ if (ret) -+ break; -+ } -+ } -+fsck_err: -+ if (ret) -+ bch_err_fn(c, ret); -+ if (!ret && trans_was_restarted(trans, restart_count)) -+ ret = -BCH_ERR_transaction_restart_nested; -+ return ret; -+} -+ -+struct extent_end { -+ u32 snapshot; -+ u64 offset; -+ struct snapshots_seen seen; -+}; -+ -+struct extent_ends { -+ struct bpos last_pos; -+ DARRAY(struct extent_end) e; -+}; -+ -+static void extent_ends_reset(struct extent_ends *extent_ends) -+{ -+ struct extent_end *i; -+ -+ darray_for_each(extent_ends->e, i) -+ snapshots_seen_exit(&i->seen); -+ -+ extent_ends->e.nr = 0; -+} -+ -+static void extent_ends_exit(struct extent_ends *extent_ends) -+{ -+ extent_ends_reset(extent_ends); -+ darray_exit(&extent_ends->e); -+} -+ -+static void extent_ends_init(struct extent_ends *extent_ends) -+{ -+ memset(extent_ends, 0, sizeof(*extent_ends)); -+} -+ -+static int extent_ends_at(struct bch_fs *c, -+ struct extent_ends *extent_ends, -+ struct snapshots_seen *seen, -+ struct bkey_s_c k) -+{ -+ struct extent_end *i, n = (struct extent_end) { -+ .offset = k.k->p.offset, -+ .snapshot = k.k->p.snapshot, -+ .seen = *seen, -+ }; -+ -+ n.seen.ids.data = kmemdup(seen->ids.data, -+ sizeof(seen->ids.data[0]) * seen->ids.size, -+ GFP_KERNEL); -+ if (!n.seen.ids.data) -+ return -BCH_ERR_ENOMEM_fsck_extent_ends_at; -+ -+ darray_for_each(extent_ends->e, i) { -+ if (i->snapshot == k.k->p.snapshot) { -+ snapshots_seen_exit(&i->seen); -+ *i = n; -+ return 0; -+ } -+ -+ if (i->snapshot >= k.k->p.snapshot) -+ break; -+ } -+ -+ return darray_insert_item(&extent_ends->e, i - extent_ends->e.data, n); -+} -+ -+static int overlapping_extents_found(struct btree_trans *trans, -+ enum btree_id btree, -+ struct bpos pos1, struct snapshots_seen *pos1_seen, -+ struct bkey pos2, -+ bool *fixed, -+ struct extent_end *extent_end) -+{ -+ struct bch_fs *c = trans->c; -+ struct printbuf buf = PRINTBUF; -+ struct btree_iter iter1, iter2 = { NULL }; -+ struct bkey_s_c k1, k2; -+ int ret; -+ -+ BUG_ON(bkey_le(pos1, bkey_start_pos(&pos2))); -+ -+ bch2_trans_iter_init(trans, &iter1, btree, pos1, -+ BTREE_ITER_ALL_SNAPSHOTS| -+ BTREE_ITER_NOT_EXTENTS); -+ k1 = bch2_btree_iter_peek_upto(&iter1, POS(pos1.inode, U64_MAX)); -+ ret = bkey_err(k1); -+ if (ret) -+ goto err; -+ -+ prt_str(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, k1); -+ -+ if (!bpos_eq(pos1, k1.k->p)) { -+ prt_str(&buf, "\n wanted\n "); -+ bch2_bpos_to_text(&buf, pos1); -+ prt_str(&buf, "\n "); -+ 
bch2_bkey_to_text(&buf, &pos2); -+ -+ bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", -+ __func__, buf.buf); -+ ret = -BCH_ERR_internal_fsck_err; -+ goto err; -+ } -+ -+ bch2_trans_copy_iter(&iter2, &iter1); -+ -+ while (1) { -+ bch2_btree_iter_advance(&iter2); -+ -+ k2 = bch2_btree_iter_peek_upto(&iter2, POS(pos1.inode, U64_MAX)); -+ ret = bkey_err(k2); -+ if (ret) -+ goto err; -+ -+ if (bpos_ge(k2.k->p, pos2.p)) -+ break; -+ } -+ -+ prt_str(&buf, "\n "); -+ bch2_bkey_val_to_text(&buf, c, k2); -+ -+ if (bpos_gt(k2.k->p, pos2.p) || -+ pos2.size != k2.k->size) { -+ bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", -+ __func__, buf.buf); -+ ret = -BCH_ERR_internal_fsck_err; -+ goto err; -+ } -+ -+ prt_printf(&buf, "\n overwriting %s extent", -+ pos1.snapshot >= pos2.p.snapshot ? "first" : "second"); -+ -+ if (fsck_err(c, "overlapping extents%s", buf.buf)) { -+ struct btree_iter *old_iter = &iter1; -+ struct disk_reservation res = { 0 }; -+ -+ if (pos1.snapshot < pos2.p.snapshot) { -+ old_iter = &iter2; -+ swap(k1, k2); -+ } -+ -+ trans->extra_journal_res += bch2_bkey_sectors_compressed(k2); -+ -+ ret = bch2_trans_update_extent_overwrite(trans, old_iter, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE, -+ k1, k2) ?: -+ bch2_trans_commit(trans, &res, NULL, -+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL); -+ bch2_disk_reservation_put(c, &res); -+ -+ if (ret) -+ goto err; -+ -+ *fixed = true; -+ -+ if (pos1.snapshot == pos2.p.snapshot) { -+ /* -+ * We overwrote the first extent, and did the overwrite -+ * in the same snapshot: -+ */ -+ extent_end->offset = bkey_start_offset(&pos2); -+ } else if (pos1.snapshot > pos2.p.snapshot) { -+ /* -+ * We overwrote the first extent in pos2's snapshot: -+ */ -+ ret = snapshots_seen_add_inorder(c, pos1_seen, pos2.p.snapshot); -+ } else { -+ /* -+ * We overwrote the second extent - restart -+ * check_extent() from the top: -+ */ -+ ret = -BCH_ERR_transaction_restart_nested; -+ } -+ } -+fsck_err: -+err: -+ bch2_trans_iter_exit(trans, &iter2); -+ bch2_trans_iter_exit(trans, &iter1); -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int check_overlapping_extents(struct btree_trans *trans, -+ struct snapshots_seen *seen, -+ struct extent_ends *extent_ends, -+ struct bkey_s_c k, -+ u32 equiv, -+ struct btree_iter *iter, -+ bool *fixed) -+{ -+ struct bch_fs *c = trans->c; -+ struct extent_end *i; -+ int ret = 0; -+ -+ /* transaction restart, running again */ -+ if (bpos_eq(extent_ends->last_pos, k.k->p)) -+ return 0; -+ -+ if (extent_ends->last_pos.inode != k.k->p.inode) -+ extent_ends_reset(extent_ends); -+ -+ darray_for_each(extent_ends->e, i) { -+ if (i->offset <= bkey_start_offset(k.k)) -+ continue; -+ -+ if (!ref_visible2(c, -+ k.k->p.snapshot, seen, -+ i->snapshot, &i->seen)) -+ continue; -+ -+ ret = overlapping_extents_found(trans, iter->btree_id, -+ SPOS(iter->pos.inode, -+ i->offset, -+ i->snapshot), -+ &i->seen, -+ *k.k, fixed, i); -+ if (ret) -+ goto err; -+ } -+ -+ ret = extent_ends_at(c, extent_ends, seen, k); -+ if (ret) -+ goto err; -+ -+ extent_ends->last_pos = k.k->p; -+err: -+ return ret; -+} -+ -+static int check_extent(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct inode_walker *inode, -+ struct snapshots_seen *s, -+ struct extent_ends *extent_ends) -+{ -+ struct bch_fs *c = trans->c; -+ struct inode_walker_entry *i; -+ struct printbuf buf = PRINTBUF; -+ struct bpos equiv = k.k->p; -+ int ret = 0; -+ -+ equiv.snapshot = bch2_snapshot_equiv(c, 
k.k->p.snapshot); -+ -+ ret = check_key_has_snapshot(trans, iter, k); -+ if (ret) { -+ ret = ret < 0 ? ret : 0; -+ goto out; -+ } -+ -+ if (inode->last_pos.inode != k.k->p.inode) { -+ ret = check_i_sectors(trans, inode); -+ if (ret) -+ goto err; -+ } -+ -+ i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); -+ ret = PTR_ERR_OR_ZERO(i); -+ if (ret) -+ goto err; -+ -+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); -+ if (ret) -+ goto err; -+ -+ if (k.k->type != KEY_TYPE_whiteout) { -+ if (fsck_err_on(!i, c, -+ "extent in missing inode:\n %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) -+ goto delete; -+ -+ if (fsck_err_on(i && -+ !S_ISREG(i->inode.bi_mode) && -+ !S_ISLNK(i->inode.bi_mode), c, -+ "extent in non regular inode mode %o:\n %s", -+ i->inode.bi_mode, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) -+ goto delete; -+ -+ ret = check_overlapping_extents(trans, s, extent_ends, k, -+ equiv.snapshot, iter, -+ &inode->recalculate_sums); -+ if (ret) -+ goto err; -+ } -+ -+ /* -+ * Check inodes in reverse order, from oldest snapshots to newest, -+ * starting from the inode that matches this extent's snapshot. If we -+ * didn't have one, iterate over all inodes: -+ */ -+ if (!i) -+ i = inode->inodes.data + inode->inodes.nr - 1; -+ -+ for (; -+ inode->inodes.data && i >= inode->inodes.data; -+ --i) { -+ if (i->snapshot > equiv.snapshot || -+ !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) -+ continue; -+ -+ if (k.k->type != KEY_TYPE_whiteout) { -+ if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && -+ k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && -+ !bkey_extent_is_reservation(k), c, -+ "extent type past end of inode %llu:%u, i_size %llu\n %s", -+ i->inode.bi_inum, i->snapshot, i->inode.bi_size, -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ struct btree_iter iter2; -+ -+ bch2_trans_copy_iter(&iter2, iter); -+ bch2_btree_iter_set_snapshot(&iter2, i->snapshot); -+ ret = bch2_btree_iter_traverse(&iter2) ?: -+ bch2_btree_delete_at(trans, &iter2, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ bch2_trans_iter_exit(trans, &iter2); -+ if (ret) -+ goto err; -+ -+ iter->k.type = KEY_TYPE_whiteout; -+ } -+ -+ if (bkey_extent_is_allocation(k.k)) -+ i->count += k.k->size; -+ } -+ -+ i->seen_this_pos = true; -+ } -+out: -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err_fn(c, ret); -+ return ret; -+delete: -+ ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ goto out; -+} -+ -+/* -+ * Walk extents: verify that extents have a corresponding S_ISREG inode, and -+ * that i_size an i_sectors are consistent -+ */ -+int bch2_check_extents(struct bch_fs *c) -+{ -+ struct inode_walker w = inode_walker_init(); -+ struct snapshots_seen s; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct extent_ends extent_ends; -+ struct disk_reservation res = { 0 }; -+ int ret = 0; -+ -+ snapshots_seen_init(&s); -+ extent_ends_init(&extent_ends); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); -+ -+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, -+ &res, NULL, -+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ -+ bch2_disk_reservation_put(c, &res); -+ check_extent(&trans, &iter, k, &w, &s, &extent_ends); -+ })) ?: -+ check_i_sectors(&trans, &w); -+ -+ 
bch2_disk_reservation_put(c, &res); -+ extent_ends_exit(&extent_ends); -+ inode_walker_exit(&w); -+ bch2_trans_exit(&trans); -+ snapshots_seen_exit(&s); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) -+{ -+ struct bch_fs *c = trans->c; -+ struct inode_walker_entry *i; -+ u32 restart_count = trans->restart_count; -+ int ret = 0; -+ s64 count2; -+ -+ darray_for_each(w->inodes, i) { -+ if (i->inode.bi_nlink == i->count) -+ continue; -+ -+ count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); -+ if (count2 < 0) -+ return count2; -+ -+ if (i->count != count2) { -+ bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", -+ i->count, count2); -+ i->count = count2; -+ if (i->inode.bi_nlink == i->count) -+ continue; -+ } -+ -+ if (fsck_err_on(i->inode.bi_nlink != i->count, c, -+ "directory %llu:%u with wrong i_nlink: got %u, should be %llu", -+ w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { -+ i->inode.bi_nlink = i->count; -+ ret = write_inode(trans, &i->inode, i->snapshot); -+ if (ret) -+ break; -+ } -+ } -+fsck_err: -+ if (ret) -+ bch_err_fn(c, ret); -+ if (!ret && trans_was_restarted(trans, restart_count)) -+ ret = -BCH_ERR_transaction_restart_nested; -+ return ret; -+} -+ -+static int check_dirent_target(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c_dirent d, -+ struct bch_inode_unpacked *target, -+ u32 target_snapshot) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i_dirent *n; -+ bool backpointer_exists = true; -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ if (!target->bi_dir && -+ !target->bi_dir_offset) { -+ target->bi_dir = d.k->p.inode; -+ target->bi_dir_offset = d.k->p.offset; -+ -+ ret = __write_inode(trans, target, target_snapshot); -+ if (ret) -+ goto err; -+ } -+ -+ if (!inode_points_to_dirent(target, d)) { -+ ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); -+ if (ret < 0) -+ goto err; -+ -+ backpointer_exists = ret; -+ ret = 0; -+ -+ if (fsck_err_on(S_ISDIR(target->bi_mode) && -+ backpointer_exists, c, -+ "directory %llu with multiple links", -+ target->bi_inum)) { -+ ret = __remove_dirent(trans, d.k->p); -+ goto out; -+ } -+ -+ if (fsck_err_on(backpointer_exists && -+ !target->bi_nlink, c, -+ "inode %llu type %s has multiple links but i_nlink 0", -+ target->bi_inum, bch2_d_types[d.v->d_type])) { -+ target->bi_nlink++; -+ target->bi_flags &= ~BCH_INODE_UNLINKED; -+ -+ ret = __write_inode(trans, target, target_snapshot); -+ if (ret) -+ goto err; -+ } -+ -+ if (fsck_err_on(!backpointer_exists, c, -+ "inode %llu:%u has wrong backpointer:\n" -+ "got %llu:%llu\n" -+ "should be %llu:%llu", -+ target->bi_inum, target_snapshot, -+ target->bi_dir, -+ target->bi_dir_offset, -+ d.k->p.inode, -+ d.k->p.offset)) { -+ target->bi_dir = d.k->p.inode; -+ target->bi_dir_offset = d.k->p.offset; -+ -+ ret = __write_inode(trans, target, target_snapshot); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ if (fsck_err_on(d.v->d_type != inode_d_type(target), c, -+ "incorrect d_type: got %s, should be %s:\n%s", -+ bch2_d_type_str(d.v->d_type), -+ bch2_d_type_str(inode_d_type(target)), -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { -+ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ goto err; -+ -+ bkey_reassemble(&n->k_i, d.s_c); -+ n->v.d_type = inode_d_type(target); -+ -+ ret = bch2_trans_update(trans, iter, &n->k_i, 0); -+ if (ret) -+ 
goto err; -+ -+ d = dirent_i_to_s_c(n); -+ } -+ -+ if (d.v->d_type == DT_SUBVOL && -+ target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && -+ (c->sb.version < bcachefs_metadata_version_subvol_dirent || -+ fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", -+ le32_to_cpu(d.v->d_parent_subvol), -+ target->bi_parent_subvol))) { -+ n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ goto err; -+ -+ bkey_reassemble(&n->k_i, d.s_c); -+ n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); -+ -+ ret = bch2_trans_update(trans, iter, &n->k_i, 0); -+ if (ret) -+ goto err; -+ -+ d = dirent_i_to_s_c(n); -+ } -+out: -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct bch_hash_info *hash_info, -+ struct inode_walker *dir, -+ struct inode_walker *target, -+ struct snapshots_seen *s) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c_dirent d; -+ struct inode_walker_entry *i; -+ struct printbuf buf = PRINTBUF; -+ struct bpos equiv; -+ int ret = 0; -+ -+ ret = check_key_has_snapshot(trans, iter, k); -+ if (ret) { -+ ret = ret < 0 ? ret : 0; -+ goto out; -+ } -+ -+ equiv = k.k->p; -+ equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); -+ -+ ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); -+ if (ret) -+ goto err; -+ -+ if (k.k->type == KEY_TYPE_whiteout) -+ goto out; -+ -+ if (dir->last_pos.inode != k.k->p.inode) { -+ ret = check_subdir_count(trans, dir); -+ if (ret) -+ goto err; -+ } -+ -+ BUG_ON(!iter->path->should_be_locked); -+ -+ i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); -+ ret = PTR_ERR_OR_ZERO(i); -+ if (ret < 0) -+ goto err; -+ -+ if (dir->first_this_inode && dir->inodes.nr) -+ *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); -+ dir->first_this_inode = false; -+ -+ if (fsck_err_on(!i, c, -+ "dirent in nonexisting directory:\n%s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ ret = bch2_btree_delete_at(trans, iter, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ goto out; -+ } -+ -+ if (!i) -+ goto out; -+ -+ if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, -+ "dirent in non directory inode type %s:\n%s", -+ bch2_d_type_str(inode_d_type(&i->inode)), -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ ret = bch2_btree_delete_at(trans, iter, 0); -+ goto out; -+ } -+ -+ ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); -+ if (ret < 0) -+ goto err; -+ if (ret) { -+ /* dirent has been deleted */ -+ ret = 0; -+ goto out; -+ } -+ -+ if (k.k->type != KEY_TYPE_dirent) -+ goto out; -+ -+ d = bkey_s_c_to_dirent(k); -+ -+ if (d.v->d_type == DT_SUBVOL) { -+ struct bch_inode_unpacked subvol_root; -+ u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); -+ u32 target_snapshot; -+ u64 target_inum; -+ -+ ret = __subvol_lookup(trans, target_subvol, -+ &target_snapshot, &target_inum); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ goto err; -+ -+ if (fsck_err_on(ret, c, -+ "dirent points to missing subvolume %u", -+ le32_to_cpu(d.v->d_child_subvol))) { -+ ret = __remove_dirent(trans, d.k->p); -+ goto err; -+ } -+ -+ ret = __lookup_inode(trans, target_inum, -+ &subvol_root, &target_snapshot); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ goto err; -+ -+ if (fsck_err_on(ret, c, -+ 
"subvolume %u points to missing subvolume root %llu", -+ target_subvol, -+ target_inum)) { -+ bch_err(c, "repair not implemented yet"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, -+ "subvol root %llu has wrong bi_subvol field: got %u, should be %u", -+ target_inum, -+ subvol_root.bi_subvol, target_subvol)) { -+ subvol_root.bi_subvol = target_subvol; -+ ret = __write_inode(trans, &subvol_root, target_snapshot); -+ if (ret) -+ goto err; -+ } -+ -+ ret = check_dirent_target(trans, iter, d, &subvol_root, -+ target_snapshot); -+ if (ret) -+ goto err; -+ } else { -+ ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); -+ if (ret) -+ goto err; -+ -+ if (fsck_err_on(!target->inodes.nr, c, -+ "dirent points to missing inode: (equiv %u)\n%s", -+ equiv.snapshot, -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, k), -+ buf.buf))) { -+ ret = __remove_dirent(trans, d.k->p); -+ if (ret) -+ goto err; -+ } -+ -+ darray_for_each(target->inodes, i) { -+ ret = check_dirent_target(trans, iter, d, -+ &i->inode, i->snapshot); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ if (d.v->d_type == DT_DIR) -+ for_each_visible_inode(c, s, dir, equiv.snapshot, i) -+ i->count++; -+ -+out: -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* -+ * Walk dirents: verify that they all have a corresponding S_ISDIR inode, -+ * validate d_type -+ */ -+int bch2_check_dirents(struct bch_fs *c) -+{ -+ struct inode_walker dir = inode_walker_init(); -+ struct inode_walker target = inode_walker_init(); -+ struct snapshots_seen s; -+ struct bch_hash_info hash_info; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ snapshots_seen_init(&s); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, -+ k, -+ NULL, NULL, -+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s)); -+ -+ bch2_trans_exit(&trans); -+ snapshots_seen_exit(&s); -+ inode_walker_exit(&dir); -+ inode_walker_exit(&target); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct bch_hash_info *hash_info, -+ struct inode_walker *inode) -+{ -+ struct bch_fs *c = trans->c; -+ struct inode_walker_entry *i; -+ int ret; -+ -+ ret = check_key_has_snapshot(trans, iter, k); -+ if (ret) -+ return ret; -+ -+ i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); -+ ret = PTR_ERR_OR_ZERO(i); -+ if (ret) -+ return ret; -+ -+ if (inode->first_this_inode && inode->inodes.nr) -+ *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); -+ inode->first_this_inode = false; -+ -+ if (fsck_err_on(!i, c, -+ "xattr for missing inode %llu", -+ k.k->p.inode)) -+ return bch2_btree_delete_at(trans, iter, 0); -+ -+ if (!i) -+ return 0; -+ -+ ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); -+fsck_err: -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* -+ * Walk xattrs: verify that they all have a corresponding inode -+ */ -+int bch2_check_xattrs(struct bch_fs *c) -+{ -+ struct inode_walker inode = inode_walker_init(); -+ struct bch_hash_info 
hash_info; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, -+ k, -+ NULL, NULL, -+ BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_xattr(&trans, &iter, k, &hash_info, &inode)); -+ -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int check_root_trans(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_inode_unpacked root_inode; -+ u32 snapshot; -+ u64 inum; -+ int ret; -+ -+ ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ return ret; -+ -+ if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { -+ struct bkey_i_subvolume root_subvol; -+ -+ snapshot = U32_MAX; -+ inum = BCACHEFS_ROOT_INO; -+ -+ bkey_subvolume_init(&root_subvol.k_i); -+ root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; -+ root_subvol.v.flags = 0; -+ root_subvol.v.snapshot = cpu_to_le32(snapshot); -+ root_subvol.v.inode = cpu_to_le64(inum); -+ ret = commit_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ __bch2_btree_insert(trans, BTREE_ID_subvolumes, -+ &root_subvol.k_i, 0)); -+ if (ret) { -+ bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ } -+ -+ ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ return ret; -+ -+ if (mustfix_fsck_err_on(ret, c, "root directory missing") || -+ mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, -+ "root inode not a directory")) { -+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, -+ 0, NULL); -+ root_inode.bi_inum = inum; -+ -+ ret = __write_inode(trans, &root_inode, snapshot); -+ if (ret) -+ bch_err(c, "error writing root inode: %s", bch2_err_str(ret)); -+ } -+err: -+fsck_err: -+ return ret; -+} -+ -+/* Get root directory, create if it doesn't exist: */ -+int bch2_check_root(struct bch_fs *c) -+{ -+ int ret; -+ -+ ret = bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ check_root_trans(&trans)); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+struct pathbuf_entry { -+ u64 inum; -+ u32 snapshot; -+}; -+ -+typedef DARRAY(struct pathbuf_entry) pathbuf; -+ -+static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) -+{ -+ struct pathbuf_entry *i; -+ -+ darray_for_each(*p, i) -+ if (i->inum == inum && -+ i->snapshot == snapshot) -+ return true; -+ -+ return false; -+} -+ -+static int path_down(struct bch_fs *c, pathbuf *p, -+ u64 inum, u32 snapshot) -+{ -+ int ret = darray_push(p, ((struct pathbuf_entry) { -+ .inum = inum, -+ .snapshot = snapshot, -+ })); -+ -+ if (ret) -+ bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", -+ p->size); -+ return ret; -+} -+ -+/* -+ * Check that a given inode is reachable from the root: -+ * -+ * XXX: we should also be verifying that inodes are in the right subvolumes -+ */ -+static int check_path(struct btree_trans *trans, -+ pathbuf *p, -+ struct bch_inode_unpacked *inode, -+ u32 snapshot) -+{ -+ struct bch_fs *c = trans->c; -+ int ret = 0; -+ -+ snapshot = bch2_snapshot_equiv(c, snapshot); -+ p->nr = 0; -+ -+ while (!(inode->bi_inum == BCACHEFS_ROOT_INO && -+ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { -+ struct btree_iter dirent_iter; -+ struct bkey_s_c_dirent d; -+ u32 
parent_snapshot = snapshot; -+ -+ if (inode->bi_subvol) { -+ u64 inum; -+ -+ ret = subvol_lookup(trans, inode->bi_parent_subvol, -+ &parent_snapshot, &inum); -+ if (ret) -+ break; -+ } -+ -+ ret = lockrestart_do(trans, -+ PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, -+ SPOS(inode->bi_dir, inode->bi_dir_offset, -+ parent_snapshot))).k)); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ break; -+ -+ if (!ret && !dirent_points_to_inode(d, inode)) { -+ bch2_trans_iter_exit(trans, &dirent_iter); -+ ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; -+ } -+ -+ if (bch2_err_matches(ret, ENOENT)) { -+ if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", -+ inode->bi_inum, snapshot, -+ bch2_d_type_str(inode_d_type(inode)), -+ inode->bi_nlink, -+ inode->bi_dir, -+ inode->bi_dir_offset)) -+ ret = reattach_inode(trans, inode, snapshot); -+ break; -+ } -+ -+ bch2_trans_iter_exit(trans, &dirent_iter); -+ -+ if (!S_ISDIR(inode->bi_mode)) -+ break; -+ -+ ret = path_down(c, p, inode->bi_inum, snapshot); -+ if (ret) { -+ bch_err(c, "memory allocation failure"); -+ return ret; -+ } -+ -+ snapshot = parent_snapshot; -+ -+ ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); -+ if (ret) { -+ /* Should have been caught in dirents pass */ -+ bch_err(c, "error looking up parent directory: %i", ret); -+ break; -+ } -+ -+ if (path_is_dup(p, inode->bi_inum, snapshot)) { -+ struct pathbuf_entry *i; -+ -+ /* XXX print path */ -+ bch_err(c, "directory structure loop"); -+ -+ darray_for_each(*p, i) -+ pr_err("%llu:%u", i->inum, i->snapshot); -+ pr_err("%llu:%u", inode->bi_inum, snapshot); -+ -+ if (!fsck_err(c, "directory structure loop")) -+ return 0; -+ -+ ret = commit_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ remove_backpointer(trans, inode)); -+ if (ret) { -+ bch_err(c, "error removing dirent: %i", ret); -+ break; -+ } -+ -+ ret = reattach_inode(trans, inode, snapshot); -+ } -+ } -+fsck_err: -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* -+ * Check for unreachable inodes, as well as loops in the directory structure: -+ * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's -+ * unreachable: -+ */ -+int bch2_check_directory_structure(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_inode_unpacked u; -+ pathbuf path = { 0, }; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ if (!bkey_is_inode(k.k)) -+ continue; -+ -+ ret = bch2_inode_unpack(k, &u); -+ if (ret) { -+ /* Should have been caught earlier in fsck: */ -+ bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); -+ break; -+ } -+ -+ if (u.bi_flags & BCH_INODE_UNLINKED) -+ continue; -+ -+ ret = check_path(&trans, &path, &u, iter.pos.snapshot); -+ if (ret) -+ break; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ darray_exit(&path); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+struct nlink_table { -+ size_t nr; -+ size_t size; -+ -+ struct nlink { -+ u64 inum; -+ u32 snapshot; -+ u32 count; -+ } *d; -+}; -+ -+static int add_nlink(struct bch_fs *c, struct nlink_table *t, -+ u64 inum, u32 snapshot) -+{ -+ if (t->nr == t->size) { -+ size_t new_size = max_t(size_t, 128UL, t->size * 2); -+ void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); -+ -+ if 
(!d) { -+ bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", -+ new_size); -+ return -BCH_ERR_ENOMEM_fsck_add_nlink; -+ } -+ -+ if (t->d) -+ memcpy(d, t->d, t->size * sizeof(t->d[0])); -+ kvfree(t->d); -+ -+ t->d = d; -+ t->size = new_size; -+ } -+ -+ -+ t->d[t->nr++] = (struct nlink) { -+ .inum = inum, -+ .snapshot = snapshot, -+ }; -+ -+ return 0; -+} -+ -+static int nlink_cmp(const void *_l, const void *_r) -+{ -+ const struct nlink *l = _l; -+ const struct nlink *r = _r; -+ -+ return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); -+} -+ -+static void inc_link(struct bch_fs *c, struct snapshots_seen *s, -+ struct nlink_table *links, -+ u64 range_start, u64 range_end, u64 inum, u32 snapshot) -+{ -+ struct nlink *link, key = { -+ .inum = inum, .snapshot = U32_MAX, -+ }; -+ -+ if (inum < range_start || inum >= range_end) -+ return; -+ -+ link = __inline_bsearch(&key, links->d, links->nr, -+ sizeof(links->d[0]), nlink_cmp); -+ if (!link) -+ return; -+ -+ while (link > links->d && link[0].inum == link[-1].inum) -+ --link; -+ -+ for (; link < links->d + links->nr && link->inum == inum; link++) -+ if (ref_visible(c, s, snapshot, link->snapshot)) { -+ link->count++; -+ if (link->snapshot >= snapshot) -+ break; -+ } -+} -+ -+noinline_for_stack -+static int check_nlinks_find_hardlinks(struct bch_fs *c, -+ struct nlink_table *t, -+ u64 start, u64 *end) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_inode_unpacked u; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_inodes, -+ POS(0, start), -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ if (!bkey_is_inode(k.k)) -+ continue; -+ -+ /* Should never fail, checked by bch2_inode_invalid: */ -+ BUG_ON(bch2_inode_unpack(k, &u)); -+ -+ /* -+ * Backpointer and directory structure checks are sufficient for -+ * directories, since they can't have hardlinks: -+ */ -+ if (S_ISDIR(u.bi_mode)) -+ continue; -+ -+ if (!u.bi_nlink) -+ continue; -+ -+ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); -+ if (ret) { -+ *end = k.k->p.offset; -+ ret = 0; -+ break; -+ } -+ -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret); -+ -+ return ret; -+} -+ -+noinline_for_stack -+static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, -+ u64 range_start, u64 range_end) -+{ -+ struct btree_trans trans; -+ struct snapshots_seen s; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent d; -+ int ret; -+ -+ snapshots_seen_init(&s); -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, -+ BTREE_ITER_INTENT| -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); -+ if (ret) -+ break; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_dirent: -+ d = bkey_s_c_to_dirent(k); -+ -+ if (d.v->d_type != DT_DIR && -+ d.v->d_type != DT_SUBVOL) -+ inc_link(c, &s, links, range_start, range_end, -+ le64_to_cpu(d.v->d_inum), -+ bch2_snapshot_equiv(c, d.k->p.snapshot)); -+ break; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret) -+ bch_err(c, "error in fsck: btree error %i while walking dirents", ret); -+ -+ bch2_trans_exit(&trans); -+ snapshots_seen_exit(&s); -+ return ret; -+} -+ -+static int 
check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct nlink_table *links, -+ size_t *idx, u64 range_end) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_inode_unpacked u; -+ struct nlink *link = &links->d[*idx]; -+ int ret = 0; -+ -+ if (k.k->p.offset >= range_end) -+ return 1; -+ -+ if (!bkey_is_inode(k.k)) -+ return 0; -+ -+ BUG_ON(bch2_inode_unpack(k, &u)); -+ -+ if (S_ISDIR(u.bi_mode)) -+ return 0; -+ -+ if (!u.bi_nlink) -+ return 0; -+ -+ while ((cmp_int(link->inum, k.k->p.offset) ?: -+ cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { -+ BUG_ON(*idx == links->nr); -+ link = &links->d[++*idx]; -+ } -+ -+ if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, -+ "inode %llu type %s has wrong i_nlink (%u, should be %u)", -+ u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], -+ bch2_inode_nlink_get(&u), link->count)) { -+ bch2_inode_nlink_set(&u, link->count); -+ ret = __write_inode(trans, &u, k.k->p.snapshot); -+ } -+fsck_err: -+ return ret; -+} -+ -+noinline_for_stack -+static int check_nlinks_update_hardlinks(struct bch_fs *c, -+ struct nlink_table *links, -+ u64 range_start, u64 range_end) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ size_t idx = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, -+ POS(0, range_start), -+ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end)); -+ -+ bch2_trans_exit(&trans); -+ -+ if (ret < 0) { -+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+int bch2_check_nlinks(struct bch_fs *c) -+{ -+ struct nlink_table links = { 0 }; -+ u64 this_iter_range_start, next_iter_range_start = 0; -+ int ret = 0; -+ -+ do { -+ this_iter_range_start = next_iter_range_start; -+ next_iter_range_start = U64_MAX; -+ -+ ret = check_nlinks_find_hardlinks(c, &links, -+ this_iter_range_start, -+ &next_iter_range_start); -+ -+ ret = check_nlinks_walk_dirents(c, &links, -+ this_iter_range_start, -+ next_iter_range_start); -+ if (ret) -+ break; -+ -+ ret = check_nlinks_update_hardlinks(c, &links, -+ this_iter_range_start, -+ next_iter_range_start); -+ if (ret) -+ break; -+ -+ links.nr = 0; -+ } while (next_iter_range_start != U64_MAX); -+ -+ kvfree(links.d); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_p p; -+ struct bkey_i_reflink_p *u; -+ int ret; -+ -+ if (k.k->type != KEY_TYPE_reflink_p) -+ return 0; -+ -+ p = bkey_s_c_to_reflink_p(k); -+ -+ if (!p.v->front_pad && !p.v->back_pad) -+ return 0; -+ -+ u = bch2_trans_kmalloc(trans, sizeof(*u)); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ return ret; -+ -+ bkey_reassemble(&u->k_i, k); -+ u->v.front_pad = 0; -+ u->v.back_pad = 0; -+ -+ return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); -+} -+ -+int bch2_fix_reflink_p(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) -+ return 0; -+ -+ ret = bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_extents, POS_MIN, -+ BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS, k, -+ NULL, NULL, 
BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, -+ fix_reflink_p_key(&trans, &iter, k))); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h -new file mode 100644 -index 000000000..90c87b508 ---- /dev/null -+++ b/fs/bcachefs/fsck.h -@@ -0,0 +1,14 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FSCK_H -+#define _BCACHEFS_FSCK_H -+ -+int bch2_check_inodes(struct bch_fs *); -+int bch2_check_extents(struct bch_fs *); -+int bch2_check_dirents(struct bch_fs *); -+int bch2_check_xattrs(struct bch_fs *); -+int bch2_check_root(struct bch_fs *); -+int bch2_check_directory_structure(struct bch_fs *); -+int bch2_check_nlinks(struct bch_fs *); -+int bch2_fix_reflink_p(struct bch_fs *); -+ -+#endif /* _BCACHEFS_FSCK_H */ -diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c -new file mode 100644 -index 000000000..8114b6e4f ---- /dev/null -+++ b/fs/bcachefs/inode.c -@@ -0,0 +1,1111 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_key_cache.h" -+#include "btree_write_buffer.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "error.h" -+#include "extents.h" -+#include "extent_update.h" -+#include "inode.h" -+#include "str_hash.h" -+#include "snapshot.h" -+#include "subvolume.h" -+#include "varint.h" -+ -+#include -+ -+#include -+ -+const char * const bch2_inode_opts[] = { -+#define x(name, ...) #name, -+ BCH_INODE_OPTS() -+#undef x -+ NULL, -+}; -+ -+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; -+ -+static int inode_decode_field(const u8 *in, const u8 *end, -+ u64 out[2], unsigned *out_bits) -+{ -+ __be64 be[2] = { 0, 0 }; -+ unsigned bytes, shift; -+ u8 *p; -+ -+ if (in >= end) -+ return -1; -+ -+ if (!*in) -+ return -1; -+ -+ /* -+ * position of highest set bit indicates number of bytes: -+ * shift = number of bits to remove in high byte: -+ */ -+ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ -+ bytes = byte_table[shift - 1]; -+ -+ if (in + bytes > end) -+ return -1; -+ -+ p = (u8 *) be + 16 - bytes; -+ memcpy(p, in, bytes); -+ *p ^= (1 << 8) >> shift; -+ -+ out[0] = be64_to_cpu(be[0]); -+ out[1] = be64_to_cpu(be[1]); -+ *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); -+ -+ return bytes; -+} -+ -+static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, -+ const struct bch_inode_unpacked *inode) -+{ -+ struct bkey_i_inode_v3 *k = &packed->inode; -+ u8 *out = k->v.fields; -+ u8 *end = (void *) &packed[1]; -+ u8 *last_nonzero_field = out; -+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; -+ unsigned bytes; -+ int ret; -+ -+ bkey_inode_v3_init(&packed->inode.k_i); -+ packed->inode.k.p.offset = inode->bi_inum; -+ packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); -+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; -+ packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); -+ packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); -+ packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); -+ packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); -+ SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); -+ SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); -+ -+ -+#define x(_name, _bits) \ -+ nr_fields++; \ -+ \ -+ if (inode->_name) { \ -+ ret = bch2_varint_encode_fast(out, inode->_name); \ -+ out += ret; \ -+ \ -+ if (_bits > 64) \ -+ *out++ = 0; \ -+ \ -+ last_nonzero_field = out; \ -+ last_nonzero_fieldnr = nr_fields; \ -+ } else { \ -+ *out++ = 0; \ -+ \ -+ if (_bits > 64) \ -+ *out++ = 0; \ -+ } -+ -+ BCH_INODE_FIELDS_v3() -+#undef x -+ BUG_ON(out > end); -+ -+ out = last_nonzero_field; -+ nr_fields = last_nonzero_fieldnr; -+ -+ bytes = out - (u8 *) &packed->inode.v; -+ set_bkey_val_bytes(&packed->inode.k, bytes); -+ memset_u64s_tail(&packed->inode.v, 0, bytes); -+ -+ SET_INODEv3_NR_FIELDS(&k->v, nr_fields); -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { -+ struct bch_inode_unpacked unpacked; -+ -+ int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), -+ &unpacked); -+ BUG_ON(ret); -+ BUG_ON(unpacked.bi_inum != inode->bi_inum); -+ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); -+ BUG_ON(unpacked.bi_sectors != inode->bi_sectors); -+ BUG_ON(unpacked.bi_size != inode->bi_size); -+ BUG_ON(unpacked.bi_version != inode->bi_version); -+ BUG_ON(unpacked.bi_mode != inode->bi_mode); -+ -+#define x(_name, _bits) if (unpacked._name != inode->_name) \ -+ panic("unpacked %llu should be %llu", \ -+ (u64) unpacked._name, (u64) inode->_name); -+ BCH_INODE_FIELDS_v3() -+#undef x -+ } -+} -+ -+void bch2_inode_pack(struct bkey_inode_buf *packed, -+ const struct bch_inode_unpacked *inode) -+{ -+ bch2_inode_pack_inlined(packed, inode); -+} -+ -+static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, -+ struct bch_inode_unpacked *unpacked) -+{ -+ const u8 *in = inode.v->fields; -+ const u8 *end = bkey_val_end(inode); -+ u64 field[2]; -+ unsigned fieldnr = 0, field_bits; -+ int ret; -+ -+#define x(_name, _bits) \ -+ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ -+ unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ -+ memset((void *) unpacked + offset, 0, \ -+ sizeof(*unpacked) - offset); \ -+ return 0; \ -+ } \ -+ \ -+ ret = inode_decode_field(in, end, field, &field_bits); \ -+ if (ret < 0) \ -+ return ret; \ -+ \ -+ if (field_bits > sizeof(unpacked->_name) * 8) \ -+ return -1; \ -+ \ -+ unpacked->_name = field[1]; \ -+ in += ret; -+ -+ BCH_INODE_FIELDS_v2() -+#undef x -+ -+ /* XXX: signal if there were more fields than expected? 
*/ -+ return 0; -+} -+ -+static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, -+ const u8 *in, const u8 *end, -+ unsigned nr_fields) -+{ -+ unsigned fieldnr = 0; -+ int ret; -+ u64 v[2]; -+ -+#define x(_name, _bits) \ -+ if (fieldnr < nr_fields) { \ -+ ret = bch2_varint_decode_fast(in, end, &v[0]); \ -+ if (ret < 0) \ -+ return ret; \ -+ in += ret; \ -+ \ -+ if (_bits > 64) { \ -+ ret = bch2_varint_decode_fast(in, end, &v[1]); \ -+ if (ret < 0) \ -+ return ret; \ -+ in += ret; \ -+ } else { \ -+ v[1] = 0; \ -+ } \ -+ } else { \ -+ v[0] = v[1] = 0; \ -+ } \ -+ \ -+ unpacked->_name = v[0]; \ -+ if (v[1] || v[0] != unpacked->_name) \ -+ return -1; \ -+ fieldnr++; -+ -+ BCH_INODE_FIELDS_v2() -+#undef x -+ -+ /* XXX: signal if there were more fields than expected? */ -+ return 0; -+} -+ -+static int bch2_inode_unpack_v3(struct bkey_s_c k, -+ struct bch_inode_unpacked *unpacked) -+{ -+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); -+ const u8 *in = inode.v->fields; -+ const u8 *end = bkey_val_end(inode); -+ unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); -+ unsigned fieldnr = 0; -+ int ret; -+ u64 v[2]; -+ -+ unpacked->bi_inum = inode.k->p.offset; -+ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); -+ unpacked->bi_hash_seed = inode.v->bi_hash_seed; -+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); -+ unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); -+ unpacked->bi_size = le64_to_cpu(inode.v->bi_size); -+ unpacked->bi_version = le64_to_cpu(inode.v->bi_version); -+ unpacked->bi_mode = INODEv3_MODE(inode.v); -+ -+#define x(_name, _bits) \ -+ if (fieldnr < nr_fields) { \ -+ ret = bch2_varint_decode_fast(in, end, &v[0]); \ -+ if (ret < 0) \ -+ return ret; \ -+ in += ret; \ -+ \ -+ if (_bits > 64) { \ -+ ret = bch2_varint_decode_fast(in, end, &v[1]); \ -+ if (ret < 0) \ -+ return ret; \ -+ in += ret; \ -+ } else { \ -+ v[1] = 0; \ -+ } \ -+ } else { \ -+ v[0] = v[1] = 0; \ -+ } \ -+ \ -+ unpacked->_name = v[0]; \ -+ if (v[1] || v[0] != unpacked->_name) \ -+ return -1; \ -+ fieldnr++; -+ -+ BCH_INODE_FIELDS_v3() -+#undef x -+ -+ /* XXX: signal if there were more fields than expected? 
*/ -+ return 0; -+} -+ -+static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, -+ struct bch_inode_unpacked *unpacked) -+{ -+ memset(unpacked, 0, sizeof(*unpacked)); -+ -+ switch (k.k->type) { -+ case KEY_TYPE_inode: { -+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); -+ -+ unpacked->bi_inum = inode.k->p.offset; -+ unpacked->bi_journal_seq= 0; -+ unpacked->bi_hash_seed = inode.v->bi_hash_seed; -+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); -+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); -+ -+ if (INODE_NEW_VARINT(inode.v)) { -+ return bch2_inode_unpack_v2(unpacked, inode.v->fields, -+ bkey_val_end(inode), -+ INODE_NR_FIELDS(inode.v)); -+ } else { -+ return bch2_inode_unpack_v1(inode, unpacked); -+ } -+ break; -+ } -+ case KEY_TYPE_inode_v2: { -+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); -+ -+ unpacked->bi_inum = inode.k->p.offset; -+ unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); -+ unpacked->bi_hash_seed = inode.v->bi_hash_seed; -+ unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); -+ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); -+ -+ return bch2_inode_unpack_v2(unpacked, inode.v->fields, -+ bkey_val_end(inode), -+ INODEv2_NR_FIELDS(inode.v)); -+ } -+ default: -+ BUG(); -+ } -+} -+ -+int bch2_inode_unpack(struct bkey_s_c k, -+ struct bch_inode_unpacked *unpacked) -+{ -+ if (likely(k.k->type == KEY_TYPE_inode_v3)) -+ return bch2_inode_unpack_v3(k, unpacked); -+ return bch2_inode_unpack_slowpath(k, unpacked); -+} -+ -+int bch2_inode_peek(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bch_inode_unpacked *inode, -+ subvol_inum inum, unsigned flags) -+{ -+ struct bkey_s_c k; -+ u32 snapshot; -+ int ret; -+ -+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); -+ if (ret) -+ return ret; -+ -+ k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, -+ SPOS(0, inum.inum, snapshot), -+ flags|BTREE_ITER_CACHED); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ ret = bkey_is_inode(k.k) ? 
0 : -BCH_ERR_ENOENT_inode; -+ if (ret) -+ goto err; -+ -+ ret = bch2_inode_unpack(k, inode); -+ if (ret) -+ goto err; -+ -+ return 0; -+err: -+ bch2_trans_iter_exit(trans, iter); -+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err_msg(trans->c, ret, "looking up inum %u:%llu:", inum.subvol, inum.inum); -+ return ret; -+} -+ -+int bch2_inode_write(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bch_inode_unpacked *inode) -+{ -+ struct bkey_inode_buf *inode_p; -+ -+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); -+ if (IS_ERR(inode_p)) -+ return PTR_ERR(inode_p); -+ -+ bch2_inode_pack_inlined(inode_p, inode); -+ inode_p->inode.k.p.snapshot = iter->snapshot; -+ return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); -+} -+ -+struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) -+{ -+ struct bch_inode_unpacked u; -+ struct bkey_inode_buf *inode_p; -+ int ret; -+ -+ if (!bkey_is_inode(&k->k)) -+ return ERR_PTR(-ENOENT); -+ -+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); -+ if (IS_ERR(inode_p)) -+ return ERR_CAST(inode_p); -+ -+ ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ bch2_inode_pack(inode_p, &u); -+ return &inode_p->inode.k_i; -+} -+ -+static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) -+{ -+ struct bch_inode_unpacked unpacked; -+ -+ if (k.k->p.inode) { -+ prt_printf(err, "nonzero k.p.inode"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (k.k->p.offset < BLOCKDEV_INODE_MAX) { -+ prt_printf(err, "fs inode in blockdev range"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (bch2_inode_unpack(k, &unpacked)) { -+ prt_printf(err, "invalid variable length fields"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { -+ prt_printf(err, "invalid data checksum type (%u >= %u", -+ unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { -+ prt_printf(err, "invalid data checksum type (%u >= %u)", -+ unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && -+ unpacked.bi_nlink != 0) { -+ prt_printf(err, "flagged as unlinked but bi_nlink != 0"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { -+ prt_printf(err, "subvolume root but not a directory"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); -+ -+ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { -+ prt_printf(err, "invalid str hash type (%llu >= %u)", -+ INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return __bch2_inode_invalid(k, err); -+} -+ -+int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); -+ -+ if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { -+ prt_printf(err, "invalid str hash type (%llu >= %u)", -+ INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return __bch2_inode_invalid(k, err); -+} -+ -+int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags 
flags, -+ struct printbuf *err) -+{ -+ struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); -+ -+ if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || -+ INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { -+ prt_printf(err, "invalid fields_start (got %llu, min %u max %zu)", -+ INODEv3_FIELDS_START(inode.v), -+ INODEv3_FIELDS_START_INITIAL, -+ bkey_val_u64s(inode.k)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { -+ prt_printf(err, "invalid str hash type (%llu >= %u)", -+ INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return __bch2_inode_invalid(k, err); -+} -+ -+static void __bch2_inode_unpacked_to_text(struct printbuf *out, -+ struct bch_inode_unpacked *inode) -+{ -+ prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu", -+ inode->bi_mode, inode->bi_flags, -+ inode->bi_journal_seq, -+ inode->bi_size, -+ inode->bi_sectors, -+ inode->bi_version); -+ -+#define x(_name, _bits) \ -+ prt_printf(out, " "#_name " %llu", (u64) inode->_name); -+ BCH_INODE_FIELDS_v3() -+#undef x -+} -+ -+void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) -+{ -+ prt_printf(out, "inum: %llu ", inode->bi_inum); -+ __bch2_inode_unpacked_to_text(out, inode); -+} -+ -+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bch_inode_unpacked inode; -+ -+ if (bch2_inode_unpack(k, &inode)) { -+ prt_printf(out, "(unpack error)"); -+ return; -+ } -+ -+ __bch2_inode_unpacked_to_text(out, &inode); -+} -+ -+static inline u64 bkey_inode_flags(struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_inode: -+ return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); -+ case KEY_TYPE_inode_v2: -+ return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); -+ case KEY_TYPE_inode_v3: -+ return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); -+ default: -+ return 0; -+ } -+} -+ -+static inline bool bkey_is_deleted_inode(struct bkey_s_c k) -+{ -+ return bkey_inode_flags(k) & BCH_INODE_UNLINKED; -+} -+ -+int bch2_trans_mark_inode(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, -+ struct bkey_i *new, -+ unsigned flags) -+{ -+ int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); -+ bool old_deleted = bkey_is_deleted_inode(old); -+ bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new)); -+ -+ if (nr) { -+ int ret = bch2_replicas_deltas_realloc(trans, 0); -+ struct replicas_delta_list *d = trans->fs_usage_deltas; -+ -+ if (ret) -+ return ret; -+ -+ d->nr_inodes += nr; -+ } -+ -+ if (old_deleted != new_deleted) { -+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+int bch2_mark_inode(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_fs_usage *fs_usage; -+ u64 journal_seq = trans->journal_res.seq; -+ -+ if (flags & BTREE_TRIGGER_INSERT) { -+ struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; -+ -+ BUG_ON(!journal_seq); -+ BUG_ON(new.k->type != KEY_TYPE_inode_v3); -+ -+ v->bi_journal_seq = cpu_to_le64(journal_seq); -+ } -+ -+ if (flags & BTREE_TRIGGER_GC) { -+ percpu_down_read(&c->mark_lock); -+ preempt_disable(); -+ -+ fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); -+ fs_usage->nr_inodes += 
bkey_is_inode(new.k); -+ fs_usage->nr_inodes -= bkey_is_inode(old.k); -+ -+ preempt_enable(); -+ percpu_up_read(&c->mark_lock); -+ } -+ return 0; -+} -+ -+int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ if (k.k->p.inode) { -+ prt_printf(err, "nonzero k.p.inode"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); -+ -+ prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); -+} -+ -+void bch2_inode_init_early(struct bch_fs *c, -+ struct bch_inode_unpacked *inode_u) -+{ -+ enum bch_str_hash_type str_hash = -+ bch2_str_hash_opt_to_type(c, c->opts.str_hash); -+ -+ memset(inode_u, 0, sizeof(*inode_u)); -+ -+ /* ick */ -+ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; -+ get_random_bytes(&inode_u->bi_hash_seed, -+ sizeof(inode_u->bi_hash_seed)); -+} -+ -+void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct bch_inode_unpacked *parent) -+{ -+ inode_u->bi_mode = mode; -+ inode_u->bi_uid = uid; -+ inode_u->bi_gid = gid; -+ inode_u->bi_dev = rdev; -+ inode_u->bi_atime = now; -+ inode_u->bi_mtime = now; -+ inode_u->bi_ctime = now; -+ inode_u->bi_otime = now; -+ -+ if (parent && parent->bi_mode & S_ISGID) { -+ inode_u->bi_gid = parent->bi_gid; -+ if (S_ISDIR(mode)) -+ inode_u->bi_mode |= S_ISGID; -+ } -+ -+ if (parent) { -+#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; -+ BCH_INODE_OPTS() -+#undef x -+ } -+} -+ -+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct bch_inode_unpacked *parent) -+{ -+ bch2_inode_init_early(c, inode_u); -+ bch2_inode_init_late(inode_u, bch2_current_time(c), -+ uid, gid, mode, rdev, parent); -+} -+ -+static inline u32 bkey_generation(struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_inode: -+ case KEY_TYPE_inode_v2: -+ BUG(); -+ case KEY_TYPE_inode_generation: -+ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); -+ default: -+ return 0; -+ } -+} -+ -+/* -+ * This just finds an empty slot: -+ */ -+int bch2_inode_create(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bch_inode_unpacked *inode_u, -+ u32 snapshot, u64 cpu) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k; -+ u64 min, max, start, pos, *hint; -+ int ret = 0; -+ unsigned bits = (c->opts.inodes_32bit ? 
31 : 63); -+ -+ if (c->opts.shard_inode_numbers) { -+ bits -= c->inode_shard_bits; -+ -+ min = (cpu << bits); -+ max = (cpu << bits) | ~(ULLONG_MAX << bits); -+ -+ min = max_t(u64, min, BLOCKDEV_INODE_MAX); -+ hint = c->unused_inode_hints + cpu; -+ } else { -+ min = BLOCKDEV_INODE_MAX; -+ max = ~(ULLONG_MAX << bits); -+ hint = c->unused_inode_hints; -+ } -+ -+ start = READ_ONCE(*hint); -+ -+ if (start >= max || start < min) -+ start = min; -+ -+ pos = start; -+ bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), -+ BTREE_ITER_ALL_SNAPSHOTS| -+ BTREE_ITER_INTENT); -+again: -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_lt(k.k->p, POS(0, max))) { -+ if (pos < iter->pos.offset) -+ goto found_slot; -+ -+ /* -+ * We don't need to iterate over keys in every snapshot once -+ * we've found just one: -+ */ -+ pos = iter->pos.offset + 1; -+ bch2_btree_iter_set_pos(iter, POS(0, pos)); -+ } -+ -+ if (!ret && pos < max) -+ goto found_slot; -+ -+ if (!ret && start == min) -+ ret = -BCH_ERR_ENOSPC_inode_create; -+ -+ if (ret) { -+ bch2_trans_iter_exit(trans, iter); -+ return ret; -+ } -+ -+ /* Retry from start */ -+ pos = start = min; -+ bch2_btree_iter_set_pos(iter, POS(0, pos)); -+ goto again; -+found_slot: -+ bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) { -+ bch2_trans_iter_exit(trans, iter); -+ return ret; -+ } -+ -+ *hint = k.k->p.offset; -+ inode_u->bi_inum = k.k->p.offset; -+ inode_u->bi_generation = bkey_generation(k); -+ return 0; -+} -+ -+static int bch2_inode_delete_keys(struct btree_trans *trans, -+ subvol_inum inum, enum btree_id id) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_i delete; -+ u32 snapshot; -+ int ret = 0; -+ -+ /* -+ * We're never going to be deleting partial extents, no need to use an -+ * extent iterator: -+ */ -+ bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), -+ BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); -+ -+ while (1) { -+ bch2_trans_begin(trans); -+ -+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ bch2_btree_iter_set_snapshot(&iter, snapshot); -+ -+ k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!k.k) -+ break; -+ -+ bkey_init(&delete.k); -+ delete.k.p = iter.pos; -+ -+ ret = bch2_trans_update(trans, &iter, &delete, 0) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+err: -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ break; -+ } -+ -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter = { NULL }; -+ struct bkey_i_inode_generation delete; -+ struct bch_inode_unpacked inode_u; -+ struct bkey_s_c k; -+ u32 snapshot; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 1024); -+ -+ /* -+ * If this was a directory, there shouldn't be any real dirents left - -+ * but there could be whiteouts (from hash collisions) that we should -+ * delete: -+ * -+ * XXX: the dirent could ideally would delete whiteouts when they're no -+ * longer needed -+ */ -+ ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: -+ bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: -+ bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); -+ if (ret) -+ goto err; -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = 
bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ k = bch2_bkey_get_iter(&trans, &iter, BTREE_ID_inodes, -+ SPOS(0, inum.inum, snapshot), -+ BTREE_ITER_INTENT|BTREE_ITER_CACHED); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!bkey_is_inode(k.k)) { -+ bch2_fs_inconsistent(trans.c, -+ "inode %llu:%u not found when deleting", -+ inum.inum, snapshot); -+ ret = -EIO; -+ goto err; -+ } -+ -+ bch2_inode_unpack(k, &inode_u); -+ -+ bkey_inode_generation_init(&delete.k_i); -+ delete.k.p = iter.pos; -+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); -+ -+ ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?: -+ bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+err: -+ bch2_trans_iter_exit(&trans, &iter); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+int bch2_inode_find_by_inum_trans(struct btree_trans *trans, -+ subvol_inum inum, -+ struct bch_inode_unpacked *inode) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ ret = bch2_inode_peek(trans, &iter, inode, inum, 0); -+ if (!ret) -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, -+ struct bch_inode_unpacked *inode) -+{ -+ return bch2_trans_do(c, NULL, NULL, 0, -+ bch2_inode_find_by_inum_trans(&trans, inum, inode)); -+} -+ -+int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) -+{ -+ if (bi->bi_flags & BCH_INODE_UNLINKED) -+ bi->bi_flags &= ~BCH_INODE_UNLINKED; -+ else { -+ if (bi->bi_nlink == U32_MAX) -+ return -EINVAL; -+ -+ bi->bi_nlink++; -+ } -+ -+ return 0; -+} -+ -+void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) -+{ -+ if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) { -+ bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", -+ bi->bi_inum); -+ return; -+ } -+ -+ if (bi->bi_flags & BCH_INODE_UNLINKED) { -+ bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); -+ return; -+ } -+ -+ if (bi->bi_nlink) -+ bi->bi_nlink--; -+ else -+ bi->bi_flags |= BCH_INODE_UNLINKED; -+} -+ -+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) -+{ -+ struct bch_opts ret = { 0 }; -+#define x(_name, _bits) \ -+ if (inode->bi_##_name) \ -+ opt_set(ret, _name, inode->bi_##_name - 1); -+ BCH_INODE_OPTS() -+#undef x -+ return ret; -+} -+ -+void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, -+ struct bch_inode_unpacked *inode) -+{ -+#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); -+ BCH_INODE_OPTS() -+#undef x -+ -+ if (opts->nocow) -+ opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; -+} -+ -+int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter = { NULL }; -+ struct bkey_i_inode_generation delete; -+ struct bch_inode_unpacked inode_u; -+ struct bkey_s_c k; -+ int ret; -+ -+ do { -+ ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, -+ SPOS(inum, 0, snapshot), -+ SPOS(inum, U64_MAX, snapshot), -+ 0, NULL) ?: -+ bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, -+ SPOS(inum, 0, snapshot), -+ SPOS(inum, U64_MAX, snapshot), -+ 0, NULL) ?: -+ bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, -+ SPOS(inum, 0, snapshot), -+ SPOS(inum, U64_MAX, snapshot), -+ 0, NULL); -+ } while (ret == -BCH_ERR_transaction_restart_nested); -+ if 
(ret) -+ goto err; -+retry: -+ bch2_trans_begin(trans); -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, -+ SPOS(0, inum, snapshot), BTREE_ITER_INTENT); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!bkey_is_inode(k.k)) { -+ bch2_fs_inconsistent(c, -+ "inode %llu:%u not found when deleting", -+ inum, snapshot); -+ ret = -EIO; -+ goto err; -+ } -+ -+ bch2_inode_unpack(k, &inode_u); -+ -+ /* Subvolume root? */ -+ if (inode_u.bi_subvol) -+ bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); -+ -+ bkey_inode_generation_init(&delete.k_i); -+ delete.k.p = iter.pos; -+ delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); -+ -+ ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ return ret ?: -BCH_ERR_transaction_restart_nested; -+} -+ -+static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_inode_unpacked inode; -+ int ret; -+ -+ if (bch2_snapshot_is_internal_node(c, pos.snapshot)) -+ return 0; -+ -+ if (!fsck_err_on(c->sb.clean, c, -+ "filesystem marked as clean but have deleted inode %llu:%u", -+ pos.offset, pos.snapshot)) -+ return 0; -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, pos, BTREE_ITER_CACHED); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; -+ if (fsck_err_on(!bkey_is_inode(k.k), c, -+ "nonexistent inode %llu:%u in deleted_inodes btree", -+ pos.offset, pos.snapshot)) -+ goto delete; -+ -+ ret = bch2_inode_unpack(k, &inode); -+ if (ret) -+ goto err; -+ -+ if (fsck_err_on(S_ISDIR(inode.bi_mode), c, -+ "directory %llu:%u in deleted_inodes btree", -+ pos.offset, pos.snapshot)) -+ goto delete; -+ -+ if (fsck_err_on(!(inode.bi_flags & BCH_INODE_UNLINKED), c, -+ "non-deleted inode %llu:%u in deleted_inodes btree", -+ pos.offset, pos.snapshot)) -+ goto delete; -+ -+ return 1; -+err: -+fsck_err: -+ return ret; -+delete: -+ return bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); -+} -+ -+int bch2_delete_dead_inodes(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ ret = bch2_btree_write_buffer_flush_sync(&trans); -+ if (ret) -+ goto err; -+ -+ /* -+ * Weird transaction restart handling here because on successful delete, -+ * bch2_inode_rm_snapshot() will return a nested transaction restart, -+ * but we can't retry because the btree write buffer won't have been -+ * flushed and we'd spin: -+ */ -+ for_each_btree_key(&trans, iter, BTREE_ID_deleted_inodes, POS_MIN, -+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ ret = lockrestart_do(&trans, may_delete_deleted_inode(&trans, k.k->p)); -+ if (ret < 0) -+ break; -+ -+ if (ret) { -+ if (!test_bit(BCH_FS_RW, &c->flags)) { -+ bch2_trans_unlock(&trans); -+ bch2_fs_lazy_rw(c); -+ } -+ -+ ret = bch2_inode_rm_snapshot(&trans, k.k->p.offset, k.k->p.snapshot); -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ break; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h -new file mode 100644 -index 000000000..22b244056 ---- 
/dev/null -+++ b/fs/bcachefs/inode.h -@@ -0,0 +1,204 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_INODE_H -+#define _BCACHEFS_INODE_H -+ -+#include "bkey.h" -+#include "opts.h" -+ -+enum bkey_invalid_flags; -+extern const char * const bch2_inode_opts[]; -+ -+int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_i *, unsigned); -+int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_s_c, unsigned); -+ -+#define bch2_bkey_ops_inode ((struct bkey_ops) { \ -+ .key_invalid = bch2_inode_invalid, \ -+ .val_to_text = bch2_inode_to_text, \ -+ .trans_trigger = bch2_trans_mark_inode, \ -+ .atomic_trigger = bch2_mark_inode, \ -+ .min_val_size = 16, \ -+}) -+ -+#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ -+ .key_invalid = bch2_inode_v2_invalid, \ -+ .val_to_text = bch2_inode_to_text, \ -+ .trans_trigger = bch2_trans_mark_inode, \ -+ .atomic_trigger = bch2_mark_inode, \ -+ .min_val_size = 32, \ -+}) -+ -+#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ -+ .key_invalid = bch2_inode_v3_invalid, \ -+ .val_to_text = bch2_inode_to_text, \ -+ .trans_trigger = bch2_trans_mark_inode, \ -+ .atomic_trigger = bch2_mark_inode, \ -+ .min_val_size = 48, \ -+}) -+ -+static inline bool bkey_is_inode(const struct bkey *k) -+{ -+ return k->type == KEY_TYPE_inode || -+ k->type == KEY_TYPE_inode_v2 || -+ k->type == KEY_TYPE_inode_v3; -+} -+ -+int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ -+ .key_invalid = bch2_inode_generation_invalid, \ -+ .val_to_text = bch2_inode_generation_to_text, \ -+ .min_val_size = 8, \ -+}) -+ -+#if 0 -+typedef struct { -+ u64 lo; -+ u32 hi; -+} __packed __aligned(4) u96; -+#endif -+typedef u64 u96; -+ -+struct bch_inode_unpacked { -+ u64 bi_inum; -+ u64 bi_journal_seq; -+ __le64 bi_hash_seed; -+ u64 bi_size; -+ u64 bi_sectors; -+ u64 bi_version; -+ u32 bi_flags; -+ u16 bi_mode; -+ -+#define x(_name, _bits) u##_bits _name; -+ BCH_INODE_FIELDS_v3() -+#undef x -+}; -+ -+struct bkey_inode_buf { -+ struct bkey_i_inode_v3 inode; -+ -+#define x(_name, _bits) + 8 + _bits / 8 -+ u8 _pad[0 + BCH_INODE_FIELDS_v3()]; -+#undef x -+} __packed __aligned(8); -+ -+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); -+int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); -+struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); -+ -+void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); -+ -+int bch2_inode_peek(struct btree_trans *, struct btree_iter *, -+ struct bch_inode_unpacked *, subvol_inum, unsigned); -+int bch2_inode_write(struct btree_trans *, struct btree_iter *, -+ struct bch_inode_unpacked *); -+ -+void bch2_inode_init_early(struct bch_fs *, -+ struct bch_inode_unpacked *); -+void bch2_inode_init_late(struct bch_inode_unpacked *, u64, -+ 
uid_t, gid_t, umode_t, dev_t, -+ struct bch_inode_unpacked *); -+void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, -+ uid_t, gid_t, umode_t, dev_t, -+ struct bch_inode_unpacked *); -+ -+int bch2_inode_create(struct btree_trans *, struct btree_iter *, -+ struct bch_inode_unpacked *, u32, u64); -+ -+int bch2_inode_rm(struct bch_fs *, subvol_inum); -+ -+int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, -+ struct bch_inode_unpacked *); -+int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, -+ struct bch_inode_unpacked *); -+ -+#define inode_opt_get(_c, _inode, _name) \ -+ ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) -+ -+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, -+ enum inode_opt_id id, u64 v) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Inode_opt_##_name: \ -+ inode->bi_##_name = v; \ -+ break; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, -+ enum inode_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Inode_opt_##_name: \ -+ return inode->bi_##_name; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline u8 mode_to_type(umode_t mode) -+{ -+ return (mode >> 12) & 15; -+} -+ -+static inline u8 inode_d_type(struct bch_inode_unpacked *inode) -+{ -+ return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); -+} -+ -+/* i_nlink: */ -+ -+static inline unsigned nlink_bias(umode_t mode) -+{ -+ return S_ISDIR(mode) ? 2 : 1; -+} -+ -+static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) -+{ -+ return bi->bi_flags & BCH_INODE_UNLINKED -+ ? 0 -+ : bi->bi_nlink + nlink_bias(bi->bi_mode); -+} -+ -+static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, -+ unsigned nlink) -+{ -+ if (nlink) { -+ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); -+ bi->bi_flags &= ~BCH_INODE_UNLINKED; -+ } else { -+ bi->bi_nlink = 0; -+ bi->bi_flags |= BCH_INODE_UNLINKED; -+ } -+} -+ -+int bch2_inode_nlink_inc(struct bch_inode_unpacked *); -+void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); -+ -+struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); -+void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, -+ struct bch_inode_unpacked *); -+ -+int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); -+int bch2_delete_dead_inodes(struct bch_fs *); -+ -+#endif /* _BCACHEFS_INODE_H */ -diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c -new file mode 100644 -index 000000000..3c614c864 ---- /dev/null -+++ b/fs/bcachefs/io.c -@@ -0,0 +1,3051 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Some low level IO code, and hacks for various block layer limitations -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "bkey_buf.h" -+#include "bset.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "compress.h" -+#include "clock.h" -+#include "data_update.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "extent_update.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "nocow_locking.h" -+#include "rebalance.h" -+#include "subvolume.h" -+#include "super.h" -+#include "super-io.h" -+#include "trace.h" -+ -+#include -+#include -+#include -+#include -+ -+const char *bch2_blk_status_to_str(blk_status_t status) -+{ -+ if (status == BLK_STS_REMOVED) -+ return "device removed"; -+ return blk_status_to_str(status); -+} -+ -+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -+ -+static bool bch2_target_congested(struct bch_fs *c, u16 target) -+{ -+ const struct bch_devs_mask *devs; -+ unsigned d, nr = 0, total = 0; -+ u64 now = local_clock(), last; -+ s64 congested; -+ struct bch_dev *ca; -+ -+ if (!target) -+ return false; -+ -+ rcu_read_lock(); -+ devs = bch2_target_to_mask(c, target) ?: -+ &c->rw_devs[BCH_DATA_user]; -+ -+ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { -+ ca = rcu_dereference(c->devs[d]); -+ if (!ca) -+ continue; -+ -+ congested = atomic_read(&ca->congested); -+ last = READ_ONCE(ca->congested_last); -+ if (time_after64(now, last)) -+ congested -= (now - last) >> 12; -+ -+ total += max(congested, 0LL); -+ nr++; -+ } -+ rcu_read_unlock(); -+ -+ return bch2_rand_range(nr * CONGESTED_MAX) < total; -+} -+ -+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, -+ u64 now, int rw) -+{ -+ u64 latency_capable = -+ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; -+ /* ideally we'd be taking into account the device's variance here: */ -+ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); -+ s64 latency_over = io_latency - latency_threshold; -+ -+ if (latency_threshold && latency_over > 0) { -+ /* -+ * bump up congested by approximately latency_over * 4 / -+ * latency_threshold - we don't need much accuracy here so don't -+ * bother with the divide: -+ */ -+ if (atomic_read(&ca->congested) < CONGESTED_MAX) -+ atomic_add(latency_over >> -+ max_t(int, ilog2(latency_threshold) - 2, 0), -+ &ca->congested); -+ -+ ca->congested_last = now; -+ } else if (atomic_read(&ca->congested) > 0) { -+ atomic_dec(&ca->congested); -+ } -+} -+ -+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) -+{ -+ atomic64_t *latency = &ca->cur_latency[rw]; -+ u64 now = local_clock(); -+ u64 io_latency = time_after64(now, submit_time) -+ ? 
now - submit_time -+ : 0; -+ u64 old, new, v = atomic64_read(latency); -+ -+ do { -+ old = v; -+ -+ /* -+ * If the io latency was reasonably close to the current -+ * latency, skip doing the update and atomic operation - most of -+ * the time: -+ */ -+ if (abs((int) (old - io_latency)) < (old >> 1) && -+ now & ~(~0U << 5)) -+ break; -+ -+ new = ewma_add(old, io_latency, 5); -+ } while ((v = atomic64_cmpxchg(latency, old, new)) != old); -+ -+ bch2_congested_acct(ca, io_latency, now, rw); -+ -+ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); -+} -+ -+#else -+ -+static bool bch2_target_congested(struct bch_fs *c, u16 target) -+{ -+ return false; -+} -+ -+#endif -+ -+/* Allocate, free from mempool: */ -+ -+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) -+{ -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ -+ bio_for_each_segment_all(bv, bio, iter) -+ if (bv->bv_page != ZERO_PAGE(0)) -+ mempool_free(bv->bv_page, &c->bio_bounce_pages); -+ bio->bi_vcnt = 0; -+} -+ -+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) -+{ -+ struct page *page; -+ -+ if (likely(!*using_mempool)) { -+ page = alloc_page(GFP_NOFS); -+ if (unlikely(!page)) { -+ mutex_lock(&c->bio_bounce_pages_lock); -+ *using_mempool = true; -+ goto pool_alloc; -+ -+ } -+ } else { -+pool_alloc: -+ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); -+ } -+ -+ return page; -+} -+ -+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, -+ size_t size) -+{ -+ bool using_mempool = false; -+ -+ while (size) { -+ struct page *page = __bio_alloc_page_pool(c, &using_mempool); -+ unsigned len = min_t(size_t, PAGE_SIZE, size); -+ -+ BUG_ON(!bio_add_page(bio, page, len, 0)); -+ size -= len; -+ } -+ -+ if (using_mempool) -+ mutex_unlock(&c->bio_bounce_pages_lock); -+} -+ -+/* Extent update path: */ -+ -+int bch2_sum_sector_overwrites(struct btree_trans *trans, -+ struct btree_iter *extent_iter, -+ struct bkey_i *new, -+ bool *usage_increasing, -+ s64 *i_sectors_delta, -+ s64 *disk_sectors_delta) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c old; -+ unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); -+ bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); -+ int ret = 0; -+ -+ *usage_increasing = false; -+ *i_sectors_delta = 0; -+ *disk_sectors_delta = 0; -+ -+ bch2_trans_copy_iter(&iter, extent_iter); -+ -+ for_each_btree_key_upto_continue_norestart(iter, -+ new->k.p, BTREE_ITER_SLOTS, old, ret) { -+ s64 sectors = min(new->k.p.offset, old.k->p.offset) - -+ max(bkey_start_offset(&new->k), -+ bkey_start_offset(old.k)); -+ -+ *i_sectors_delta += sectors * -+ (bkey_extent_is_allocation(&new->k) - -+ bkey_extent_is_allocation(old.k)); -+ -+ *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); -+ *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot -+ ? 
sectors * bch2_bkey_nr_ptrs_fully_allocated(old) -+ : 0; -+ -+ if (!*usage_increasing && -+ (new->k.p.snapshot != old.k->p.snapshot || -+ new_replicas > bch2_bkey_replicas(c, old) || -+ (!new_compressed && bch2_bkey_sectors_compressed(old)))) -+ *usage_increasing = true; -+ -+ if (bkey_ge(old.k->p, new->k.p)) -+ break; -+ } -+ -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, -+ struct btree_iter *extent_iter, -+ u64 new_i_size, -+ s64 i_sectors_delta) -+{ -+ struct btree_iter iter; -+ struct bkey_i *k; -+ struct bkey_i_inode_v3 *inode; -+ unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; -+ int ret; -+ -+ k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, -+ SPOS(0, -+ extent_iter->pos.inode, -+ extent_iter->snapshot), -+ BTREE_ITER_CACHED); -+ ret = PTR_ERR_OR_ZERO(k); -+ if (unlikely(ret)) -+ return ret; -+ -+ if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { -+ k = bch2_inode_to_v3(trans, k); -+ ret = PTR_ERR_OR_ZERO(k); -+ if (unlikely(ret)) -+ goto err; -+ } -+ -+ inode = bkey_i_to_inode_v3(k); -+ -+ if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && -+ new_i_size > le64_to_cpu(inode->v.bi_size)) { -+ inode->v.bi_size = cpu_to_le64(new_i_size); -+ inode_update_flags = 0; -+ } -+ -+ if (i_sectors_delta) { -+ le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); -+ inode_update_flags = 0; -+ } -+ -+ if (inode->k.p.snapshot != iter.snapshot) { -+ inode->k.p.snapshot = iter.snapshot; -+ inode_update_flags = 0; -+ } -+ -+ ret = bch2_trans_update(trans, &iter, &inode->k_i, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| -+ inode_update_flags); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_extent_update(struct btree_trans *trans, -+ subvol_inum inum, -+ struct btree_iter *iter, -+ struct bkey_i *k, -+ struct disk_reservation *disk_res, -+ u64 new_i_size, -+ s64 *i_sectors_delta_total, -+ bool check_enospc) -+{ -+ struct bpos next_pos; -+ bool usage_increasing; -+ s64 i_sectors_delta = 0, disk_sectors_delta = 0; -+ int ret; -+ -+ /* -+ * This traverses us the iterator without changing iter->path->pos to -+ * search_key() (which is pos + 1 for extents): we want there to be a -+ * path already traversed at iter->pos because -+ * bch2_trans_extent_update() will use it to attempt extent merging -+ */ -+ ret = __bch2_btree_iter_traverse(iter); -+ if (ret) -+ return ret; -+ -+ ret = bch2_extent_trim_atomic(trans, iter, k); -+ if (ret) -+ return ret; -+ -+ next_pos = k->k.p; -+ -+ ret = bch2_sum_sector_overwrites(trans, iter, k, -+ &usage_increasing, -+ &i_sectors_delta, -+ &disk_sectors_delta); -+ if (ret) -+ return ret; -+ -+ if (disk_res && -+ disk_sectors_delta > (s64) disk_res->sectors) { -+ ret = bch2_disk_reservation_add(trans->c, disk_res, -+ disk_sectors_delta - disk_res->sectors, -+ !check_enospc || !usage_increasing -+ ? 
BCH_DISK_RESERVATION_NOFAIL : 0); -+ if (ret) -+ return ret; -+ } -+ -+ /* -+ * Note: -+ * We always have to do an inode update - even when i_size/i_sectors -+ * aren't changing - for fsync to work properly; fsync relies on -+ * inode->bi_journal_seq which is updated by the trigger code: -+ */ -+ ret = bch2_extent_update_i_size_sectors(trans, iter, -+ min(k->k.p.offset << 9, new_i_size), -+ i_sectors_delta) ?: -+ bch2_trans_update(trans, iter, k, 0) ?: -+ bch2_trans_commit(trans, disk_res, NULL, -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL); -+ if (unlikely(ret)) -+ return ret; -+ -+ if (i_sectors_delta_total) -+ *i_sectors_delta_total += i_sectors_delta; -+ bch2_btree_iter_set_pos(iter, next_pos); -+ return 0; -+} -+ -+/* Overwrites whatever was present with zeroes: */ -+int bch2_extent_fallocate(struct btree_trans *trans, -+ subvol_inum inum, -+ struct btree_iter *iter, -+ unsigned sectors, -+ struct bch_io_opts opts, -+ s64 *i_sectors_delta, -+ struct write_point_specifier write_point) -+{ -+ struct bch_fs *c = trans->c; -+ struct disk_reservation disk_res = { 0 }; -+ struct closure cl; -+ struct open_buckets open_buckets = { 0 }; -+ struct bkey_s_c k; -+ struct bkey_buf old, new; -+ unsigned sectors_allocated = 0; -+ bool have_reservation = false; -+ bool unwritten = opts.nocow && -+ c->sb.version >= bcachefs_metadata_version_unwritten_extents; -+ int ret; -+ -+ bch2_bkey_buf_init(&old); -+ bch2_bkey_buf_init(&new); -+ closure_init_stack(&cl); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); -+ -+ if (!have_reservation) { -+ unsigned new_replicas = -+ max(0, (int) opts.data_replicas - -+ (int) bch2_bkey_nr_ptrs_fully_allocated(k)); -+ /* -+ * Get a disk reservation before (in the nocow case) calling -+ * into the allocator: -+ */ -+ ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); -+ if (unlikely(ret)) -+ goto err; -+ -+ bch2_bkey_buf_reassemble(&old, c, k); -+ } -+ -+ if (have_reservation) { -+ if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) -+ goto err; -+ -+ bch2_key_resize(&new.k->k, sectors); -+ } else if (!unwritten) { -+ struct bkey_i_reservation *reservation; -+ -+ bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); -+ reservation = bkey_reservation_init(new.k); -+ reservation->k.p = iter->pos; -+ bch2_key_resize(&reservation->k, sectors); -+ reservation->v.nr_replicas = opts.data_replicas; -+ } else { -+ struct bkey_i_extent *e; -+ struct bch_devs_list devs_have; -+ struct write_point *wp; -+ struct bch_extent_ptr *ptr; -+ -+ devs_have.nr = 0; -+ -+ bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); -+ -+ e = bkey_extent_init(new.k); -+ e->k.p = iter->pos; -+ -+ ret = bch2_alloc_sectors_start_trans(trans, -+ opts.foreground_target, -+ false, -+ write_point, -+ &devs_have, -+ opts.data_replicas, -+ opts.data_replicas, -+ BCH_WATERMARK_normal, 0, &cl, &wp); -+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) -+ ret = -BCH_ERR_transaction_restart_nested; -+ if (ret) -+ goto err; -+ -+ sectors = min(sectors, wp->sectors_free); -+ sectors_allocated = sectors; -+ -+ bch2_key_resize(&e->k, sectors); -+ -+ bch2_open_bucket_get(c, wp, &open_buckets); -+ bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); -+ bch2_alloc_sectors_done(c, wp); -+ -+ extent_for_each_ptr(extent_i_to_s(e), ptr) -+ ptr->unwritten = true; -+ } -+ -+ have_reservation = true; -+ -+ ret = bch2_extent_update(trans, inum, iter, new.k, 
&disk_res, -+ 0, i_sectors_delta, true); -+err: -+ if (!ret && sectors_allocated) -+ bch2_increment_clock(c, sectors_allocated, WRITE); -+ -+ bch2_open_buckets_put(c, &open_buckets); -+ bch2_disk_reservation_put(c, &disk_res); -+ bch2_bkey_buf_exit(&new, c); -+ bch2_bkey_buf_exit(&old, c); -+ -+ if (closure_nr_remaining(&cl) != 1) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ } -+ -+ return ret; -+} -+ -+/* -+ * Returns -BCH_ERR_transacton_restart if we had to drop locks: -+ */ -+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, -+ subvol_inum inum, u64 end, -+ s64 *i_sectors_delta) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); -+ struct bpos end_pos = POS(inum.inum, end); -+ struct bkey_s_c k; -+ int ret = 0, ret2 = 0; -+ u32 snapshot; -+ -+ while (!ret || -+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) { -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(c, 0); -+ struct bkey_i delete; -+ -+ if (ret) -+ ret2 = ret; -+ -+ bch2_trans_begin(trans); -+ -+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); -+ if (ret) -+ continue; -+ -+ bch2_btree_iter_set_snapshot(iter, snapshot); -+ -+ /* -+ * peek_upto() doesn't have ideal semantics for extents: -+ */ -+ k = bch2_btree_iter_peek_upto(iter, end_pos); -+ if (!k.k) -+ break; -+ -+ ret = bkey_err(k); -+ if (ret) -+ continue; -+ -+ bkey_init(&delete.k); -+ delete.k.p = iter->pos; -+ -+ /* create the biggest key we can */ -+ bch2_key_resize(&delete.k, max_sectors); -+ bch2_cut_back(end_pos, &delete); -+ -+ ret = bch2_extent_update(trans, inum, iter, &delete, -+ &disk_res, 0, i_sectors_delta, false); -+ bch2_disk_reservation_put(c, &disk_res); -+ } -+ -+ return ret ?: ret2; -+} -+ -+int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, -+ s64 *i_sectors_delta) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ POS(inum.inum, start), -+ BTREE_ITER_INTENT); -+ -+ ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ ret = 0; -+ -+ return ret; -+} -+ -+static int bch2_write_index_default(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct bkey_buf sk; -+ struct keylist *keys = &op->insert_keys; -+ struct bkey_i *k = bch2_keylist_front(keys); -+ struct btree_trans trans; -+ struct btree_iter iter; -+ subvol_inum inum = { -+ .subvol = op->subvol, -+ .inum = k->k.p.inode, -+ }; -+ int ret; -+ -+ BUG_ON(!inum.subvol); -+ -+ bch2_bkey_buf_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); -+ -+ do { -+ bch2_trans_begin(&trans); -+ -+ k = bch2_keylist_front(keys); -+ bch2_bkey_buf_copy(&sk, c, k); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, -+ &sk.k->k.p.snapshot); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ break; -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ bkey_start_pos(&sk.k->k), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ ret = bch2_extent_update(&trans, inum, &iter, sk.k, -+ &op->res, -+ op->new_i_size, &op->i_sectors_delta, -+ op->flags & BCH_WRITE_CHECK_ENOSPC); -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ break; -+ -+ if 
(bkey_ge(iter.pos, k->k.p)) -+ bch2_keylist_pop_front(&op->insert_keys); -+ else -+ bch2_cut_front(iter.pos, k); -+ } while (!bch2_keylist_empty(keys)); -+ -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&sk, c); -+ -+ return ret; -+} -+ -+/* Writes */ -+ -+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, -+ enum bch_data_type type, -+ const struct bkey_i *k, -+ bool nocow) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); -+ const struct bch_extent_ptr *ptr; -+ struct bch_write_bio *n; -+ struct bch_dev *ca; -+ -+ BUG_ON(c->opts.nochanges); -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || -+ !c->devs[ptr->dev]); -+ -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (to_entry(ptr + 1) < ptrs.end) { -+ n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, -+ GFP_NOFS, &ca->replica_set)); -+ -+ n->bio.bi_end_io = wbio->bio.bi_end_io; -+ n->bio.bi_private = wbio->bio.bi_private; -+ n->parent = wbio; -+ n->split = true; -+ n->bounce = false; -+ n->put_bio = true; -+ n->bio.bi_opf = wbio->bio.bi_opf; -+ bio_inc_remaining(&wbio->bio); -+ } else { -+ n = wbio; -+ n->split = false; -+ } -+ -+ n->c = c; -+ n->dev = ptr->dev; -+ n->have_ioref = nocow || bch2_dev_get_ioref(ca, -+ type == BCH_DATA_btree ? READ : WRITE); -+ n->nocow = nocow; -+ n->submit_time = local_clock(); -+ n->inode_offset = bkey_start_offset(&k->k); -+ n->bio.bi_iter.bi_sector = ptr->offset; -+ -+ if (likely(n->have_ioref)) { -+ this_cpu_add(ca->io_done->sectors[WRITE][type], -+ bio_sectors(&n->bio)); -+ -+ bio_set_dev(&n->bio, ca->disk_sb.bdev); -+ -+ if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { -+ bio_endio(&n->bio); -+ continue; -+ } -+ -+ submit_bio(&n->bio); -+ } else { -+ n->bio.bi_status = BLK_STS_REMOVED; -+ bio_endio(&n->bio); -+ } -+ } -+} -+ -+static void __bch2_write(struct bch_write_op *); -+ -+static void bch2_write_done(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; -+ -+ EBUG_ON(op->open_buckets.nr); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); -+ bch2_disk_reservation_put(c, &op->res); -+ -+ if (!(op->flags & BCH_WRITE_MOVE)) -+ bch2_write_ref_put(c, BCH_WRITE_REF_write); -+ bch2_keylist_free(&op->insert_keys, op->inline_keys); -+ -+ EBUG_ON(cl->parent); -+ closure_debug_destroy(cl); -+ if (op->end_io) -+ op->end_io(op); -+} -+ -+static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) -+{ -+ struct keylist *keys = &op->insert_keys; -+ struct bch_extent_ptr *ptr; -+ struct bkey_i *src, *dst = keys->keys, *n; -+ -+ for (src = keys->keys; src != keys->top; src = n) { -+ n = bkey_next(src); -+ -+ if (bkey_extent_is_direct_data(&src->k)) { -+ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, -+ test_bit(ptr->dev, op->failed.d)); -+ -+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) -+ return -EIO; -+ } -+ -+ if (dst != src) -+ memmove_u64s_down(dst, src, src->k.u64s); -+ dst = bkey_next(dst); -+ } -+ -+ keys->top = dst; -+ return 0; -+} -+ -+/** -+ * bch_write_index - after a write, update index to point to new data -+ */ -+static void __bch2_write_index(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct keylist *keys = &op->insert_keys; -+ struct bkey_i *k; -+ unsigned dev; -+ int ret = 0; -+ -+ if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { -+ ret = bch2_write_drop_io_error_ptrs(op); -+ if (ret) -+ goto err; -+ } -+ -+ /* -+ * probably not the ideal place to hook this in, but I don't -+ * 
particularly want to plumb io_opts all the way through the btree -+ * update stack right now -+ */ -+ for_each_keylist_key(keys, k) -+ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); -+ -+ if (!bch2_keylist_empty(keys)) { -+ u64 sectors_start = keylist_sectors(keys); -+ -+ ret = !(op->flags & BCH_WRITE_MOVE) -+ ? bch2_write_index_default(op) -+ : bch2_data_update_index_update(op); -+ -+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); -+ BUG_ON(keylist_sectors(keys) && !ret); -+ -+ op->written += sectors_start - keylist_sectors(keys); -+ -+ if (ret && !bch2_err_matches(ret, EROFS)) { -+ struct bkey_i *k = bch2_keylist_front(&op->insert_keys); -+ -+ bch_err_inum_offset_ratelimited(c, -+ k->k.p.inode, k->k.p.offset << 9, -+ "write error while doing btree update: %s", -+ bch2_err_str(ret)); -+ } -+ -+ if (ret) -+ goto err; -+ } -+out: -+ /* If some a bucket wasn't written, we can't erasure code it: */ -+ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) -+ bch2_open_bucket_write_error(c, &op->open_buckets, dev); -+ -+ bch2_open_buckets_put(c, &op->open_buckets); -+ return; -+err: -+ keys->top = keys->keys; -+ op->error = ret; -+ op->flags |= BCH_WRITE_DONE; -+ goto out; -+} -+ -+static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) -+{ -+ if (state != wp->state) { -+ u64 now = ktime_get_ns(); -+ -+ if (wp->last_state_change && -+ time_after64(now, wp->last_state_change)) -+ wp->time[wp->state] += now - wp->last_state_change; -+ wp->state = state; -+ wp->last_state_change = now; -+ } -+} -+ -+static inline void wp_update_state(struct write_point *wp, bool running) -+{ -+ enum write_point_state state; -+ -+ state = running ? WRITE_POINT_running : -+ !list_empty(&wp->writes) ? WRITE_POINT_waiting_io -+ : WRITE_POINT_stopped; -+ -+ __wp_update_state(wp, state); -+} -+ -+static void bch2_write_index(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct write_point *wp = op->wp; -+ struct workqueue_struct *wq = index_update_wq(op); -+ unsigned long flags; -+ -+ if ((op->flags & BCH_WRITE_DONE) && -+ (op->flags & BCH_WRITE_MOVE)) -+ bch2_bio_free_pages_pool(op->c, &op->wbio.bio); -+ -+ spin_lock_irqsave(&wp->writes_lock, flags); -+ if (wp->state == WRITE_POINT_waiting_io) -+ __wp_update_state(wp, WRITE_POINT_waiting_work); -+ list_add_tail(&op->wp_list, &wp->writes); -+ spin_unlock_irqrestore (&wp->writes_lock, flags); -+ -+ queue_work(wq, &wp->index_update_work); -+} -+ -+static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) -+{ -+ op->wp = wp; -+ -+ if (wp->state == WRITE_POINT_stopped) { -+ spin_lock_irq(&wp->writes_lock); -+ __wp_update_state(wp, WRITE_POINT_waiting_io); -+ spin_unlock_irq(&wp->writes_lock); -+ } -+} -+ -+void bch2_write_point_do_index_updates(struct work_struct *work) -+{ -+ struct write_point *wp = -+ container_of(work, struct write_point, index_update_work); -+ struct bch_write_op *op; -+ -+ while (1) { -+ spin_lock_irq(&wp->writes_lock); -+ op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); -+ if (op) -+ list_del(&op->wp_list); -+ wp_update_state(wp, op != NULL); -+ spin_unlock_irq(&wp->writes_lock); -+ -+ if (!op) -+ break; -+ -+ op->flags |= BCH_WRITE_IN_WORKER; -+ -+ __bch2_write_index(op); -+ -+ if (!(op->flags & BCH_WRITE_DONE)) -+ __bch2_write(op); -+ else -+ bch2_write_done(&op->cl); -+ } -+} -+ -+static void bch2_write_endio(struct bio *bio) -+{ -+ struct closure *cl = bio->bi_private; -+ struct 
bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_write_bio *wbio = to_wbio(bio); -+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; -+ struct bch_fs *c = wbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); -+ -+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, -+ op->pos.inode, -+ wbio->inode_offset << 9, -+ "data write error: %s", -+ bch2_blk_status_to_str(bio->bi_status))) { -+ set_bit(wbio->dev, op->failed.d); -+ op->flags |= BCH_WRITE_IO_ERROR; -+ } -+ -+ if (wbio->nocow) -+ set_bit(wbio->dev, op->devs_need_flush->d); -+ -+ if (wbio->have_ioref) { -+ bch2_latency_acct(ca, wbio->submit_time, WRITE); -+ percpu_ref_put(&ca->io_ref); -+ } -+ -+ if (wbio->bounce) -+ bch2_bio_free_pages_pool(c, bio); -+ -+ if (wbio->put_bio) -+ bio_put(bio); -+ -+ if (parent) -+ bio_endio(&parent->bio); -+ else -+ closure_put(cl); -+} -+ -+static void init_append_extent(struct bch_write_op *op, -+ struct write_point *wp, -+ struct bversion version, -+ struct bch_extent_crc_unpacked crc) -+{ -+ struct bkey_i_extent *e; -+ -+ op->pos.offset += crc.uncompressed_size; -+ -+ e = bkey_extent_init(op->insert_keys.top); -+ e->k.p = op->pos; -+ e->k.size = crc.uncompressed_size; -+ e->k.version = version; -+ -+ if (crc.csum_type || -+ crc.compression_type || -+ crc.nonce) -+ bch2_extent_crc_append(&e->k_i, crc); -+ -+ bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, -+ op->flags & BCH_WRITE_CACHED); -+ -+ bch2_keylist_push(&op->insert_keys); -+} -+ -+static struct bio *bch2_write_bio_alloc(struct bch_fs *c, -+ struct write_point *wp, -+ struct bio *src, -+ bool *page_alloc_failed, -+ void *buf) -+{ -+ struct bch_write_bio *wbio; -+ struct bio *bio; -+ unsigned output_available = -+ min(wp->sectors_free << 9, src->bi_iter.bi_size); -+ unsigned pages = DIV_ROUND_UP(output_available + -+ (buf -+ ? 
((unsigned long) buf & (PAGE_SIZE - 1)) -+ : 0), PAGE_SIZE); -+ -+ pages = min(pages, BIO_MAX_VECS); -+ -+ bio = bio_alloc_bioset(NULL, pages, 0, -+ GFP_NOFS, &c->bio_write); -+ wbio = wbio_init(bio); -+ wbio->put_bio = true; -+ /* copy WRITE_SYNC flag */ -+ wbio->bio.bi_opf = src->bi_opf; -+ -+ if (buf) { -+ bch2_bio_map(bio, buf, output_available); -+ return bio; -+ } -+ -+ wbio->bounce = true; -+ -+ /* -+ * We can't use mempool for more than c->sb.encoded_extent_max -+ * worth of pages, but we'd like to allocate more if we can: -+ */ -+ bch2_bio_alloc_pages_pool(c, bio, -+ min_t(unsigned, output_available, -+ c->opts.encoded_extent_max)); -+ -+ if (bio->bi_iter.bi_size < output_available) -+ *page_alloc_failed = -+ bch2_bio_alloc_pages(bio, -+ output_available - -+ bio->bi_iter.bi_size, -+ GFP_NOFS) != 0; -+ -+ return bio; -+} -+ -+static int bch2_write_rechecksum(struct bch_fs *c, -+ struct bch_write_op *op, -+ unsigned new_csum_type) -+{ -+ struct bio *bio = &op->wbio.bio; -+ struct bch_extent_crc_unpacked new_crc; -+ int ret; -+ -+ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ -+ -+ if (bch2_csum_type_is_encryption(op->crc.csum_type) != -+ bch2_csum_type_is_encryption(new_csum_type)) -+ new_csum_type = op->crc.csum_type; -+ -+ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, -+ NULL, &new_crc, -+ op->crc.offset, op->crc.live_size, -+ new_csum_type); -+ if (ret) -+ return ret; -+ -+ bio_advance(bio, op->crc.offset << 9); -+ bio->bi_iter.bi_size = op->crc.live_size << 9; -+ op->crc = new_crc; -+ return 0; -+} -+ -+static int bch2_write_decrypt(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct nonce nonce = extent_nonce(op->version, op->crc); -+ struct bch_csum csum; -+ int ret; -+ -+ if (!bch2_csum_type_is_encryption(op->crc.csum_type)) -+ return 0; -+ -+ /* -+ * If we need to decrypt data in the write path, we'll no longer be able -+ * to verify the existing checksum (poly1305 mac, in this case) after -+ * it's decrypted - this is the last point we'll be able to reverify the -+ * checksum: -+ */ -+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -+ if (bch2_crc_cmp(op->crc.csum, csum)) -+ return -EIO; -+ -+ ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -+ op->crc.csum_type = 0; -+ op->crc.csum = (struct bch_csum) { 0, 0 }; -+ return ret; -+} -+ -+static enum prep_encoded_ret { -+ PREP_ENCODED_OK, -+ PREP_ENCODED_ERR, -+ PREP_ENCODED_CHECKSUM_ERR, -+ PREP_ENCODED_DO_WRITE, -+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) -+{ -+ struct bch_fs *c = op->c; -+ struct bio *bio = &op->wbio.bio; -+ -+ if (!(op->flags & BCH_WRITE_DATA_ENCODED)) -+ return PREP_ENCODED_OK; -+ -+ BUG_ON(bio_sectors(bio) != op->crc.compressed_size); -+ -+ /* Can we just write the entire extent as is? 
*/ -+ if (op->crc.uncompressed_size == op->crc.live_size && -+ op->crc.compressed_size <= wp->sectors_free && -+ (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || -+ op->incompressible)) { -+ if (!crc_is_compressed(op->crc) && -+ op->csum_type != op->crc.csum_type && -+ bch2_write_rechecksum(c, op, op->csum_type) && -+ !c->opts.no_data_io) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ return PREP_ENCODED_DO_WRITE; -+ } -+ -+ /* -+ * If the data is compressed and we couldn't write the entire extent as -+ * is, we have to decompress it: -+ */ -+ if (crc_is_compressed(op->crc)) { -+ struct bch_csum csum; -+ -+ if (bch2_write_decrypt(op)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ /* Last point we can still verify checksum: */ -+ csum = bch2_checksum_bio(c, op->crc.csum_type, -+ extent_nonce(op->version, op->crc), -+ bio); -+ if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) -+ return PREP_ENCODED_ERR; -+ } -+ -+ /* -+ * No longer have compressed data after this point - data might be -+ * encrypted: -+ */ -+ -+ /* -+ * If the data is checksummed and we're only writing a subset, -+ * rechecksum and adjust bio to point to currently live data: -+ */ -+ if ((op->crc.live_size != op->crc.uncompressed_size || -+ op->crc.csum_type != op->csum_type) && -+ bch2_write_rechecksum(c, op, op->csum_type) && -+ !c->opts.no_data_io) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ /* -+ * If we want to compress the data, it has to be decrypted: -+ */ -+ if ((op->compression_opt || -+ bch2_csum_type_is_encryption(op->crc.csum_type) != -+ bch2_csum_type_is_encryption(op->csum_type)) && -+ bch2_write_decrypt(op)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ return PREP_ENCODED_OK; -+} -+ -+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, -+ struct bio **_dst) -+{ -+ struct bch_fs *c = op->c; -+ struct bio *src = &op->wbio.bio, *dst = src; -+ struct bvec_iter saved_iter; -+ void *ec_buf; -+ unsigned total_output = 0, total_input = 0; -+ bool bounce = false; -+ bool page_alloc_failed = false; -+ int ret, more = 0; -+ -+ BUG_ON(!bio_sectors(src)); -+ -+ ec_buf = bch2_writepoint_ec_buf(c, wp); -+ -+ switch (bch2_write_prep_encoded_data(op, wp)) { -+ case PREP_ENCODED_OK: -+ break; -+ case PREP_ENCODED_ERR: -+ ret = -EIO; -+ goto err; -+ case PREP_ENCODED_CHECKSUM_ERR: -+ goto csum_err; -+ case PREP_ENCODED_DO_WRITE: -+ /* XXX look for bug here */ -+ if (ec_buf) { -+ dst = bch2_write_bio_alloc(c, wp, src, -+ &page_alloc_failed, -+ ec_buf); -+ bio_copy_data(dst, src); -+ bounce = true; -+ } -+ init_append_extent(op, wp, op->version, op->crc); -+ goto do_write; -+ } -+ -+ if (ec_buf || -+ op->compression_opt || -+ (op->csum_type && -+ !(op->flags & BCH_WRITE_PAGES_STABLE)) || -+ (bch2_csum_type_is_encryption(op->csum_type) && -+ !(op->flags & BCH_WRITE_PAGES_OWNED))) { -+ dst = bch2_write_bio_alloc(c, wp, src, -+ &page_alloc_failed, -+ ec_buf); -+ bounce = true; -+ } -+ -+ saved_iter = dst->bi_iter; -+ -+ do { -+ struct bch_extent_crc_unpacked crc = { 0 }; -+ struct bversion version = op->version; -+ size_t dst_len, src_len; -+ -+ if (page_alloc_failed && -+ dst->bi_iter.bi_size < (wp->sectors_free << 9) && -+ dst->bi_iter.bi_size < c->opts.encoded_extent_max) -+ break; -+ -+ BUG_ON(op->compression_opt && -+ (op->flags & BCH_WRITE_DATA_ENCODED) && -+ bch2_csum_type_is_encryption(op->crc.csum_type)); -+ BUG_ON(op->compression_opt && !bounce); -+ -+ 
crc.compression_type = op->incompressible -+ ? BCH_COMPRESSION_TYPE_incompressible -+ : op->compression_opt -+ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, -+ op->compression_opt) -+ : 0; -+ if (!crc_is_compressed(crc)) { -+ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); -+ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); -+ -+ if (op->csum_type) -+ dst_len = min_t(unsigned, dst_len, -+ c->opts.encoded_extent_max); -+ -+ if (bounce) { -+ swap(dst->bi_iter.bi_size, dst_len); -+ bio_copy_data(dst, src); -+ swap(dst->bi_iter.bi_size, dst_len); -+ } -+ -+ src_len = dst_len; -+ } -+ -+ BUG_ON(!src_len || !dst_len); -+ -+ if (bch2_csum_type_is_encryption(op->csum_type)) { -+ if (bversion_zero(version)) { -+ version.lo = atomic64_inc_return(&c->key_version); -+ } else { -+ crc.nonce = op->nonce; -+ op->nonce += src_len >> 9; -+ } -+ } -+ -+ if ((op->flags & BCH_WRITE_DATA_ENCODED) && -+ !crc_is_compressed(crc) && -+ bch2_csum_type_is_encryption(op->crc.csum_type) == -+ bch2_csum_type_is_encryption(op->csum_type)) { -+ u8 compression_type = crc.compression_type; -+ u16 nonce = crc.nonce; -+ /* -+ * Note: when we're using rechecksum(), we need to be -+ * checksumming @src because it has all the data our -+ * existing checksum covers - if we bounced (because we -+ * were trying to compress), @dst will only have the -+ * part of the data the new checksum will cover. -+ * -+ * But normally we want to be checksumming post bounce, -+ * because part of the reason for bouncing is so the -+ * data can't be modified (by userspace) while it's in -+ * flight. -+ */ -+ if (bch2_rechecksum_bio(c, src, version, op->crc, -+ &crc, &op->crc, -+ src_len >> 9, -+ bio_sectors(src) - (src_len >> 9), -+ op->csum_type)) -+ goto csum_err; -+ /* -+ * rchecksum_bio sets compression_type on crc from op->crc, -+ * this isn't always correct as sometimes we're changing -+ * an extent from uncompressed to incompressible. 
-+ */ -+ crc.compression_type = compression_type; -+ crc.nonce = nonce; -+ } else { -+ if ((op->flags & BCH_WRITE_DATA_ENCODED) && -+ bch2_rechecksum_bio(c, src, version, op->crc, -+ NULL, &op->crc, -+ src_len >> 9, -+ bio_sectors(src) - (src_len >> 9), -+ op->crc.csum_type)) -+ goto csum_err; -+ -+ crc.compressed_size = dst_len >> 9; -+ crc.uncompressed_size = src_len >> 9; -+ crc.live_size = src_len >> 9; -+ -+ swap(dst->bi_iter.bi_size, dst_len); -+ ret = bch2_encrypt_bio(c, op->csum_type, -+ extent_nonce(version, crc), dst); -+ if (ret) -+ goto err; -+ -+ crc.csum = bch2_checksum_bio(c, op->csum_type, -+ extent_nonce(version, crc), dst); -+ crc.csum_type = op->csum_type; -+ swap(dst->bi_iter.bi_size, dst_len); -+ } -+ -+ init_append_extent(op, wp, version, crc); -+ -+ if (dst != src) -+ bio_advance(dst, dst_len); -+ bio_advance(src, src_len); -+ total_output += dst_len; -+ total_input += src_len; -+ } while (dst->bi_iter.bi_size && -+ src->bi_iter.bi_size && -+ wp->sectors_free && -+ !bch2_keylist_realloc(&op->insert_keys, -+ op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_EXTENT_U64s_MAX)); -+ -+ more = src->bi_iter.bi_size != 0; -+ -+ dst->bi_iter = saved_iter; -+ -+ if (dst == src && more) { -+ BUG_ON(total_output != total_input); -+ -+ dst = bio_split(src, total_input >> 9, -+ GFP_NOFS, &c->bio_write); -+ wbio_init(dst)->put_bio = true; -+ /* copy WRITE_SYNC flag */ -+ dst->bi_opf = src->bi_opf; -+ } -+ -+ dst->bi_iter.bi_size = total_output; -+do_write: -+ *_dst = dst; -+ return more; -+csum_err: -+ bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); -+ ret = -EIO; -+err: -+ if (to_wbio(dst)->bounce) -+ bch2_bio_free_pages_pool(c, dst); -+ if (to_wbio(dst)->put_bio) -+ bio_put(dst); -+ -+ return ret; -+} -+ -+static bool bch2_extent_is_writeable(struct bch_write_op *op, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = op->c; -+ struct bkey_s_c_extent e; -+ struct extent_ptr_decoded p; -+ const union bch_extent_entry *entry; -+ unsigned replicas = 0; -+ -+ if (k.k->type != KEY_TYPE_extent) -+ return false; -+ -+ e = bkey_s_c_to_extent(k); -+ extent_for_each_ptr_decode(e, p, entry) { -+ if (p.crc.csum_type || -+ crc_is_compressed(p.crc) || -+ p.has_ec) -+ return false; -+ -+ replicas += bch2_extent_ptr_durability(c, &p); -+ } -+ -+ return replicas >= op->opts.data_replicas; -+} -+ -+static inline void bch2_nocow_write_unlock(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ const struct bch_extent_ptr *ptr; -+ struct bkey_i *k; -+ -+ for_each_keylist_key(&op->insert_keys, k) { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ bch2_bucket_nocow_unlock(&c->nocow_locks, -+ PTR_BUCKET_POS(c, ptr), -+ BUCKET_NOCOW_LOCK_UPDATE); -+ } -+} -+ -+static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *orig, -+ struct bkey_s_c k, -+ u64 new_i_size) -+{ -+ struct bkey_i *new; -+ struct bkey_ptrs ptrs; -+ struct bch_extent_ptr *ptr; -+ int ret; -+ -+ if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { -+ /* trace this */ -+ return 0; -+ } -+ -+ new = bch2_bkey_make_mut_noupdate(trans, k); -+ ret = PTR_ERR_OR_ZERO(new); -+ if (ret) -+ return ret; -+ -+ bch2_cut_front(bkey_start_pos(&orig->k), new); -+ bch2_cut_back(orig->k.p, new); -+ -+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); -+ bkey_for_each_ptr(ptrs, ptr) -+ ptr->unwritten = 0; -+ -+ /* -+ * Note that we're not calling bch2_subvol_get_snapshot() in this path - 
-+ * that was done when we kicked off the write, and here it's important -+ * that we update the extent that we wrote to - even if a snapshot has -+ * since been created. The write is still outstanding, so we're ok -+ * w.r.t. snapshot atomicity: -+ */ -+ return bch2_extent_update_i_size_sectors(trans, iter, -+ min(new->k.p.offset << 9, new_i_size), 0) ?: -+ bch2_trans_update(trans, iter, new, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+} -+ -+static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_i *orig; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_keylist_key(&op->insert_keys, orig) { -+ ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, -+ bkey_start_pos(&orig->k), orig->k.p, -+ BTREE_ITER_INTENT, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL, ({ -+ bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); -+ })); -+ -+ if (ret && !bch2_err_matches(ret, EROFS)) { -+ struct bkey_i *k = bch2_keylist_front(&op->insert_keys); -+ -+ bch_err_inum_offset_ratelimited(c, -+ k->k.p.inode, k->k.p.offset << 9, -+ "write error while doing btree update: %s", -+ bch2_err_str(ret)); -+ } -+ -+ if (ret) { -+ op->error = ret; -+ break; -+ } -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void __bch2_nocow_write_done(struct bch_write_op *op) -+{ -+ bch2_nocow_write_unlock(op); -+ -+ if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { -+ op->error = -EIO; -+ } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) -+ bch2_nocow_write_convert_unwritten(op); -+} -+ -+static void bch2_nocow_write_done(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ -+ __bch2_nocow_write_done(op); -+ bch2_write_done(cl); -+} -+ -+static void bch2_nocow_write(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_ptrs_c ptrs; -+ const struct bch_extent_ptr *ptr; -+ struct { -+ struct bpos b; -+ unsigned gen; -+ struct nocow_lock_bucket *l; -+ } buckets[BCH_REPLICAS_MAX]; -+ unsigned nr_buckets = 0; -+ u32 snapshot; -+ int ret, i; -+ -+ if (op->flags & BCH_WRITE_MOVE) -+ return; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot); -+ if (unlikely(ret)) -+ goto err; -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ SPOS(op->pos.inode, op->pos.offset, snapshot), -+ BTREE_ITER_SLOTS); -+ while (1) { -+ struct bio *bio = &op->wbio.bio; -+ -+ nr_buckets = 0; -+ -+ k = bch2_btree_iter_peek_slot(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ /* fall back to normal cow write path? 
*/ -+ if (unlikely(k.k->p.snapshot != snapshot || -+ !bch2_extent_is_writeable(op, k))) -+ break; -+ -+ if (bch2_keylist_realloc(&op->insert_keys, -+ op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ k.k->u64s)) -+ break; -+ -+ /* Get iorefs before dropping btree locks: */ -+ ptrs = bch2_bkey_ptrs_c(k); -+ bkey_for_each_ptr(ptrs, ptr) { -+ buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); -+ buckets[nr_buckets].gen = ptr->gen; -+ buckets[nr_buckets].l = -+ bucket_nocow_lock(&c->nocow_locks, -+ bucket_to_u64(buckets[nr_buckets].b)); -+ -+ prefetch(buckets[nr_buckets].l); -+ -+ if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) -+ goto err_get_ioref; -+ -+ nr_buckets++; -+ -+ if (ptr->unwritten) -+ op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; -+ } -+ -+ /* Unlock before taking nocow locks, doing IO: */ -+ bkey_reassemble(op->insert_keys.top, k); -+ bch2_trans_unlock(&trans); -+ -+ bch2_cut_front(op->pos, op->insert_keys.top); -+ if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) -+ bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); -+ -+ for (i = 0; i < nr_buckets; i++) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); -+ struct nocow_lock_bucket *l = buckets[i].l; -+ bool stale; -+ -+ __bch2_bucket_nocow_lock(&c->nocow_locks, l, -+ bucket_to_u64(buckets[i].b), -+ BUCKET_NOCOW_LOCK_UPDATE); -+ -+ rcu_read_lock(); -+ stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); -+ rcu_read_unlock(); -+ -+ if (unlikely(stale)) -+ goto err_bucket_stale; -+ } -+ -+ bio = &op->wbio.bio; -+ if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { -+ bio = bio_split(bio, k.k->p.offset - op->pos.offset, -+ GFP_KERNEL, &c->bio_write); -+ wbio_init(bio)->put_bio = true; -+ bio->bi_opf = op->wbio.bio.bi_opf; -+ } else { -+ op->flags |= BCH_WRITE_DONE; -+ } -+ -+ op->pos.offset += bio_sectors(bio); -+ op->written += bio_sectors(bio); -+ -+ bio->bi_end_io = bch2_write_endio; -+ bio->bi_private = &op->cl; -+ bio->bi_opf |= REQ_OP_WRITE; -+ closure_get(&op->cl); -+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, -+ op->insert_keys.top, true); -+ -+ bch2_keylist_push(&op->insert_keys); -+ if (op->flags & BCH_WRITE_DONE) -+ break; -+ bch2_btree_iter_advance(&iter); -+ } -+out: -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ if (ret) { -+ bch_err_inum_offset_ratelimited(c, -+ op->pos.inode, -+ op->pos.offset << 9, -+ "%s: btree lookup error %s", -+ __func__, bch2_err_str(ret)); -+ op->error = ret; -+ op->flags |= BCH_WRITE_DONE; -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ /* fallback to cow write path? 
*/ -+ if (!(op->flags & BCH_WRITE_DONE)) { -+ closure_sync(&op->cl); -+ __bch2_nocow_write_done(op); -+ op->insert_keys.top = op->insert_keys.keys; -+ } else if (op->flags & BCH_WRITE_SYNC) { -+ closure_sync(&op->cl); -+ bch2_nocow_write_done(&op->cl); -+ } else { -+ /* -+ * XXX -+ * needs to run out of process context because ei_quota_lock is -+ * a mutex -+ */ -+ continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); -+ } -+ return; -+err_get_ioref: -+ for (i = 0; i < nr_buckets; i++) -+ percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); -+ -+ /* Fall back to COW path: */ -+ goto out; -+err_bucket_stale: -+ while (--i >= 0) -+ bch2_bucket_nocow_unlock(&c->nocow_locks, -+ buckets[i].b, -+ BUCKET_NOCOW_LOCK_UPDATE); -+ for (i = 0; i < nr_buckets; i++) -+ percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); -+ -+ /* We can retry this: */ -+ ret = -BCH_ERR_transaction_restart; -+ goto out; -+} -+ -+static void __bch2_write(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct write_point *wp = NULL; -+ struct bio *bio = NULL; -+ unsigned nofs_flags; -+ int ret; -+ -+ nofs_flags = memalloc_nofs_save(); -+ -+ if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { -+ bch2_nocow_write(op); -+ if (op->flags & BCH_WRITE_DONE) -+ goto out_nofs_restore; -+ } -+again: -+ memset(&op->failed, 0, sizeof(op->failed)); -+ -+ do { -+ struct bkey_i *key_to_write; -+ unsigned key_to_write_offset = op->insert_keys.top_p - -+ op->insert_keys.keys_p; -+ -+ /* +1 for possible cache device: */ -+ if (op->open_buckets.nr + op->nr_replicas + 1 > -+ ARRAY_SIZE(op->open_buckets.v)) -+ break; -+ -+ if (bch2_keylist_realloc(&op->insert_keys, -+ op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_EXTENT_U64s_MAX)) -+ break; -+ -+ /* -+ * The copygc thread is now global, which means it's no longer -+ * freeing up space on specific disks, which means that -+ * allocations for specific disks may hang arbitrarily long: -+ */ -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ bch2_alloc_sectors_start_trans(&trans, -+ op->target, -+ op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), -+ op->write_point, -+ &op->devs_have, -+ op->nr_replicas, -+ op->nr_replicas_required, -+ op->watermark, -+ op->flags, -+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| -+ BCH_WRITE_ONLY_SPECIFIED_DEVS)) -+ ? NULL : &op->cl, &wp)); -+ if (unlikely(ret)) { -+ if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) -+ break; -+ -+ goto err; -+ } -+ -+ EBUG_ON(!wp); -+ -+ bch2_open_bucket_get(c, wp, &op->open_buckets); -+ ret = bch2_write_extent(op, wp, &bio); -+ -+ bch2_alloc_sectors_done_inlined(c, wp); -+err: -+ if (ret <= 0) { -+ op->flags |= BCH_WRITE_DONE; -+ -+ if (ret < 0) { -+ op->error = ret; -+ break; -+ } -+ } -+ -+ bio->bi_end_io = bch2_write_endio; -+ bio->bi_private = &op->cl; -+ bio->bi_opf |= REQ_OP_WRITE; -+ -+ closure_get(bio->bi_private); -+ -+ key_to_write = (void *) (op->insert_keys.keys_p + -+ key_to_write_offset); -+ -+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, -+ key_to_write, false); -+ } while (ret); -+ -+ /* -+ * Sync or no? -+ * -+ * If we're running asynchronously, wne may still want to block -+ * synchronously here if we weren't able to submit all of the IO at -+ * once, as that signals backpressure to the caller. 
-+ */ -+ if ((op->flags & BCH_WRITE_SYNC) || -+ (!(op->flags & BCH_WRITE_DONE) && -+ !(op->flags & BCH_WRITE_IN_WORKER))) { -+ closure_sync(&op->cl); -+ __bch2_write_index(op); -+ -+ if (!(op->flags & BCH_WRITE_DONE)) -+ goto again; -+ bch2_write_done(&op->cl); -+ } else { -+ bch2_write_queue(op, wp); -+ continue_at(&op->cl, bch2_write_index, NULL); -+ } -+out_nofs_restore: -+ memalloc_nofs_restore(nofs_flags); -+} -+ -+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) -+{ -+ struct bio *bio = &op->wbio.bio; -+ struct bvec_iter iter; -+ struct bkey_i_inline_data *id; -+ unsigned sectors; -+ int ret; -+ -+ op->flags |= BCH_WRITE_WROTE_DATA_INLINE; -+ op->flags |= BCH_WRITE_DONE; -+ -+ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); -+ -+ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_U64s + DIV_ROUND_UP(data_len, 8)); -+ if (ret) { -+ op->error = ret; -+ goto err; -+ } -+ -+ sectors = bio_sectors(bio); -+ op->pos.offset += sectors; -+ -+ id = bkey_inline_data_init(op->insert_keys.top); -+ id->k.p = op->pos; -+ id->k.version = op->version; -+ id->k.size = sectors; -+ -+ iter = bio->bi_iter; -+ iter.bi_size = data_len; -+ memcpy_from_bio(id->v.data, bio, iter); -+ -+ while (data_len & 7) -+ id->v.data[data_len++] = '\0'; -+ set_bkey_val_bytes(&id->k, data_len); -+ bch2_keylist_push(&op->insert_keys); -+ -+ __bch2_write_index(op); -+err: -+ bch2_write_done(&op->cl); -+} -+ -+/** -+ * bch_write - handle a write to a cache device or flash only volume -+ * -+ * This is the starting point for any data to end up in a cache device; it could -+ * be from a normal write, or a writeback write, or a write to a flash only -+ * volume - it's also used by the moving garbage collector to compact data in -+ * mostly empty buckets. -+ * -+ * It first writes the data to the cache, creating a list of keys to be inserted -+ * (if the data won't fit in a single open bucket, there will be multiple keys); -+ * after the data is written it calls bch_journal, and after the keys have been -+ * added to the next journal write they're inserted into the btree. -+ * -+ * If op->discard is true, instead of inserting the data it invalidates the -+ * region of the cache represented by op->bio and op->inode. 
-+ */ -+void bch2_write(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bio *bio = &op->wbio.bio; -+ struct bch_fs *c = op->c; -+ unsigned data_len; -+ -+ EBUG_ON(op->cl.parent); -+ BUG_ON(!op->nr_replicas); -+ BUG_ON(!op->write_point.v); -+ BUG_ON(bkey_eq(op->pos, POS_MAX)); -+ -+ op->start_time = local_clock(); -+ bch2_keylist_init(&op->insert_keys, op->inline_keys); -+ wbio_init(bio)->put_bio = false; -+ -+ if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { -+ bch_err_inum_offset_ratelimited(c, -+ op->pos.inode, -+ op->pos.offset << 9, -+ "misaligned write"); -+ op->error = -EIO; -+ goto err; -+ } -+ -+ if (c->opts.nochanges) { -+ op->error = -BCH_ERR_erofs_no_writes; -+ goto err; -+ } -+ -+ if (!(op->flags & BCH_WRITE_MOVE) && -+ !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { -+ op->error = -BCH_ERR_erofs_no_writes; -+ goto err; -+ } -+ -+ this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); -+ bch2_increment_clock(c, bio_sectors(bio), WRITE); -+ -+ data_len = min_t(u64, bio->bi_iter.bi_size, -+ op->new_i_size - (op->pos.offset << 9)); -+ -+ if (c->opts.inline_data && -+ data_len <= min(block_bytes(c) / 2, 1024U)) { -+ bch2_write_data_inline(op, data_len); -+ return; -+ } -+ -+ __bch2_write(op); -+ return; -+err: -+ bch2_disk_reservation_put(c, &op->res); -+ -+ closure_debug_destroy(&op->cl); -+ if (op->end_io) -+ op->end_io(op); -+} -+ -+static const char * const bch2_write_flags[] = { -+#define x(f) #f, -+ BCH_WRITE_FLAGS() -+#undef x -+ NULL -+}; -+ -+void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) -+{ -+ prt_str(out, "pos: "); -+ bch2_bpos_to_text(out, op->pos); -+ prt_newline(out); -+ printbuf_indent_add(out, 2); -+ -+ prt_str(out, "started: "); -+ bch2_pr_time_units(out, local_clock() - op->start_time); -+ prt_newline(out); -+ -+ prt_str(out, "flags: "); -+ prt_bitflags(out, bch2_write_flags, op->flags); -+ prt_newline(out); -+ -+ prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); -+ prt_newline(out); -+ -+ printbuf_indent_sub(out, 2); -+} -+ -+/* Cache promotion on read */ -+ -+struct promote_op { -+ struct rcu_head rcu; -+ u64 start_time; -+ -+ struct rhash_head hash; -+ struct bpos pos; -+ -+ struct data_update write; -+ struct bio_vec bi_inline_vecs[0]; /* must be last */ -+}; -+ -+static const struct rhashtable_params bch_promote_params = { -+ .head_offset = offsetof(struct promote_op, hash), -+ .key_offset = offsetof(struct promote_op, pos), -+ .key_len = sizeof(struct bpos), -+}; -+ -+static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, -+ struct bpos pos, -+ struct bch_io_opts opts, -+ unsigned flags) -+{ -+ if (!(flags & BCH_READ_MAY_PROMOTE)) -+ return false; -+ -+ if (!opts.promote_target) -+ return false; -+ -+ if (bch2_bkey_has_target(c, k, opts.promote_target)) -+ return false; -+ -+ if (bkey_extent_is_unwritten(k)) -+ return false; -+ -+ if (bch2_target_congested(c, opts.promote_target)) { -+ /* XXX trace this */ -+ return false; -+ } -+ -+ if (rhashtable_lookup_fast(&c->promote_table, &pos, -+ bch_promote_params)) -+ return false; -+ -+ return true; -+} -+ -+static void promote_free(struct bch_fs *c, struct promote_op *op) -+{ -+ int ret; -+ -+ bch2_data_update_exit(&op->write); -+ -+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash, -+ bch_promote_params); -+ BUG_ON(ret); -+ bch2_write_ref_put(c, BCH_WRITE_REF_promote); -+ kfree_rcu(op, rcu); -+} -+ -+static void promote_done(struct bch_write_op *wop) -+{ -+ struct promote_op 
*op = -+ container_of(wop, struct promote_op, write.op); -+ struct bch_fs *c = op->write.op.c; -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], -+ op->start_time); -+ promote_free(c, op); -+} -+ -+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -+{ -+ struct bio *bio = &op->write.op.wbio.bio; -+ -+ trace_and_count(op->write.op.c, read_promote, &rbio->bio); -+ -+ /* we now own pages: */ -+ BUG_ON(!rbio->bounce); -+ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); -+ -+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, -+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt); -+ swap(bio->bi_vcnt, rbio->bio.bi_vcnt); -+ -+ bch2_data_update_read_done(&op->write, rbio->pick.crc); -+} -+ -+static struct promote_op *__promote_alloc(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bkey_s_c k, -+ struct bpos pos, -+ struct extent_ptr_decoded *pick, -+ struct bch_io_opts opts, -+ unsigned sectors, -+ struct bch_read_bio **rbio) -+{ -+ struct bch_fs *c = trans->c; -+ struct promote_op *op = NULL; -+ struct bio *bio; -+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); -+ int ret; -+ -+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) -+ return NULL; -+ -+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); -+ if (!op) -+ goto err; -+ -+ op->start_time = local_clock(); -+ op->pos = pos; -+ -+ /* -+ * We don't use the mempool here because extents that aren't -+ * checksummed or compressed can be too big for the mempool: -+ */ -+ *rbio = kzalloc(sizeof(struct bch_read_bio) + -+ sizeof(struct bio_vec) * pages, -+ GFP_NOFS); -+ if (!*rbio) -+ goto err; -+ -+ rbio_init(&(*rbio)->bio, opts); -+ bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); -+ -+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, -+ GFP_NOFS)) -+ goto err; -+ -+ (*rbio)->bounce = true; -+ (*rbio)->split = true; -+ (*rbio)->kmalloc = true; -+ -+ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, -+ bch_promote_params)) -+ goto err; -+ -+ bio = &op->write.op.wbio.bio; -+ bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); -+ -+ ret = bch2_data_update_init(trans, NULL, &op->write, -+ writepoint_hashed((unsigned long) current), -+ opts, -+ (struct data_update_opts) { -+ .target = opts.promote_target, -+ .extra_replicas = 1, -+ .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, -+ }, -+ btree_id, k); -+ /* -+ * possible errors: -BCH_ERR_nocow_lock_blocked, -+ * -BCH_ERR_ENOSPC_disk_reservation: -+ */ -+ if (ret) { -+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash, -+ bch_promote_params); -+ BUG_ON(ret); -+ goto err; -+ } -+ -+ op->write.op.end_io = promote_done; -+ -+ return op; -+err: -+ if (*rbio) -+ bio_free_pages(&(*rbio)->bio); -+ kfree(*rbio); -+ *rbio = NULL; -+ kfree(op); -+ bch2_write_ref_put(c, BCH_WRITE_REF_promote); -+ return NULL; -+} -+ -+noinline -+static struct promote_op *promote_alloc(struct btree_trans *trans, -+ struct bvec_iter iter, -+ struct bkey_s_c k, -+ struct extent_ptr_decoded *pick, -+ struct bch_io_opts opts, -+ unsigned flags, -+ struct bch_read_bio **rbio, -+ bool *bounce, -+ bool *read_full) -+{ -+ struct bch_fs *c = trans->c; -+ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); -+ /* data might have to be decompressed in the write path: */ -+ unsigned sectors = promote_full -+ ? max(pick->crc.compressed_size, pick->crc.live_size) -+ : bvec_iter_sectors(iter); -+ struct bpos pos = promote_full -+ ? 
bkey_start_pos(k.k) -+ : POS(k.k->p.inode, iter.bi_sector); -+ struct promote_op *promote; -+ -+ if (!should_promote(c, k, pos, opts, flags)) -+ return NULL; -+ -+ promote = __promote_alloc(trans, -+ k.k->type == KEY_TYPE_reflink_v -+ ? BTREE_ID_reflink -+ : BTREE_ID_extents, -+ k, pos, pick, opts, sectors, rbio); -+ if (!promote) -+ return NULL; -+ -+ *bounce = true; -+ *read_full = promote_full; -+ return promote; -+} -+ -+/* Read */ -+ -+#define READ_RETRY_AVOID 1 -+#define READ_RETRY 2 -+#define READ_ERR 3 -+ -+enum rbio_context { -+ RBIO_CONTEXT_NULL, -+ RBIO_CONTEXT_HIGHPRI, -+ RBIO_CONTEXT_UNBOUND, -+}; -+ -+static inline struct bch_read_bio * -+bch2_rbio_parent(struct bch_read_bio *rbio) -+{ -+ return rbio->split ? rbio->parent : rbio; -+} -+ -+__always_inline -+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, -+ enum rbio_context context, -+ struct workqueue_struct *wq) -+{ -+ if (context <= rbio->context) { -+ fn(&rbio->work); -+ } else { -+ rbio->work.func = fn; -+ rbio->context = context; -+ queue_work(wq, &rbio->work); -+ } -+} -+ -+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) -+{ -+ BUG_ON(rbio->bounce && !rbio->split); -+ -+ if (rbio->promote) -+ promote_free(rbio->c, rbio->promote); -+ rbio->promote = NULL; -+ -+ if (rbio->bounce) -+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); -+ -+ if (rbio->split) { -+ struct bch_read_bio *parent = rbio->parent; -+ -+ if (rbio->kmalloc) -+ kfree(rbio); -+ else -+ bio_put(&rbio->bio); -+ -+ rbio = parent; -+ } -+ -+ return rbio; -+} -+ -+/* -+ * Only called on a top level bch_read_bio to complete an entire read request, -+ * not a split: -+ */ -+static void bch2_rbio_done(struct bch_read_bio *rbio) -+{ -+ if (rbio->start_time) -+ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], -+ rbio->start_time); -+ bio_endio(&rbio->bio); -+} -+ -+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, -+ struct bvec_iter bvec_iter, -+ struct bch_io_failures *failed, -+ unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_buf sk; -+ struct bkey_s_c k; -+ int ret; -+ -+ flags &= ~BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_MUST_CLONE; -+ -+ bch2_bkey_buf_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ bch2_trans_iter_init(&trans, &iter, rbio->data_btree, -+ rbio->read_pos, BTREE_ITER_SLOTS); -+retry: -+ rbio->bio.bi_status = 0; -+ -+ k = bch2_btree_iter_peek_slot(&iter); -+ if (bkey_err(k)) -+ goto err; -+ -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ bch2_trans_unlock(&trans); -+ -+ if (!bch2_bkey_matches_ptr(c, k, -+ rbio->pick.ptr, -+ rbio->data_pos.offset - -+ rbio->pick.crc.offset)) { -+ /* extent we wanted to read no longer exists: */ -+ rbio->hole = true; -+ goto out; -+ } -+ -+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, -+ rbio->read_pos, -+ rbio->data_btree, -+ k, 0, failed, flags); -+ if (ret == READ_RETRY) -+ goto retry; -+ if (ret) -+ goto err; -+out: -+ bch2_rbio_done(rbio); -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&sk, c); -+ return; -+err: -+ rbio->bio.bi_status = BLK_STS_IOERR; -+ goto out; -+} -+ -+static void bch2_rbio_retry(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bch_fs *c = rbio->c; -+ struct bvec_iter iter = rbio->bvec_iter; -+ unsigned flags = rbio->flags; -+ subvol_inum inum = { -+ .subvol = rbio->subvol, -+ .inum = rbio->read_pos.inode, 
-+ }; -+ struct bch_io_failures failed = { .nr = 0 }; -+ -+ trace_and_count(c, read_retry, &rbio->bio); -+ -+ if (rbio->retry == READ_RETRY_AVOID) -+ bch2_mark_io_failure(&failed, &rbio->pick); -+ -+ rbio->bio.bi_status = 0; -+ -+ rbio = bch2_rbio_free(rbio); -+ -+ flags |= BCH_READ_IN_RETRY; -+ flags &= ~BCH_READ_MAY_PROMOTE; -+ -+ if (flags & BCH_READ_NODECODE) { -+ bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); -+ } else { -+ flags &= ~BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_MUST_CLONE; -+ -+ __bch2_read(c, rbio, iter, inum, &failed, flags); -+ } -+} -+ -+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, -+ blk_status_t error) -+{ -+ rbio->retry = retry; -+ -+ if (rbio->flags & BCH_READ_IN_RETRY) -+ return; -+ -+ if (retry == READ_ERR) { -+ rbio = bch2_rbio_free(rbio); -+ -+ rbio->bio.bi_status = error; -+ bch2_rbio_done(rbio); -+ } else { -+ bch2_rbio_punt(rbio, bch2_rbio_retry, -+ RBIO_CONTEXT_UNBOUND, system_unbound_wq); -+ } -+} -+ -+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, -+ struct bch_read_bio *rbio) -+{ -+ struct bch_fs *c = rbio->c; -+ u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; -+ struct bch_extent_crc_unpacked new_crc; -+ struct btree_iter iter; -+ struct bkey_i *new; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ if (crc_is_compressed(rbio->pick.crc)) -+ return 0; -+ -+ k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ if ((ret = bkey_err(k))) -+ goto out; -+ -+ if (bversion_cmp(k.k->version, rbio->version) || -+ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) -+ goto out; -+ -+ /* Extent was merged? */ -+ if (bkey_start_offset(k.k) < data_offset || -+ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) -+ goto out; -+ -+ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, -+ rbio->pick.crc, NULL, &new_crc, -+ bkey_start_offset(k.k) - data_offset, k.k->size, -+ rbio->pick.crc.csum_type)) { -+ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); -+ ret = 0; -+ goto out; -+ } -+ -+ /* -+ * going to be temporarily appending another checksum entry: -+ */ -+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + -+ sizeof(struct bch_extent_crc128)); -+ if ((ret = PTR_ERR_OR_ZERO(new))) -+ goto out; -+ -+ bkey_reassemble(new, k); -+ -+ if (!bch2_bkey_narrow_crcs(new, new_crc)) -+ goto out; -+ -+ ret = bch2_trans_update(trans, &iter, new, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) -+{ -+ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, -+ __bch2_rbio_narrow_crcs(&trans, rbio)); -+} -+ -+/* Inner part that may run in process context */ -+static void __bch2_read_endio(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bch_fs *c = rbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); -+ struct bio *src = &rbio->bio; -+ struct bio *dst = &bch2_rbio_parent(rbio)->bio; -+ struct bvec_iter dst_iter = rbio->bvec_iter; -+ struct bch_extent_crc_unpacked crc = rbio->pick.crc; -+ struct nonce nonce = extent_nonce(rbio->version, crc); -+ unsigned nofs_flags; -+ struct bch_csum csum; -+ int ret; -+ -+ nofs_flags = memalloc_nofs_save(); -+ -+ /* Reset iterator for checksumming and copying bounced data: */ -+ if (rbio->bounce) { -+ src->bi_iter.bi_size = 
crc.compressed_size << 9; -+ src->bi_iter.bi_idx = 0; -+ src->bi_iter.bi_bvec_done = 0; -+ } else { -+ src->bi_iter = rbio->bvec_iter; -+ } -+ -+ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); -+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) -+ goto csum_err; -+ -+ /* -+ * XXX -+ * We need to rework the narrow_crcs path to deliver the read completion -+ * first, and then punt to a different workqueue, otherwise we're -+ * holding up reads while doing btree updates which is bad for memory -+ * reclaim. -+ */ -+ if (unlikely(rbio->narrow_crcs)) -+ bch2_rbio_narrow_crcs(rbio); -+ -+ if (rbio->flags & BCH_READ_NODECODE) -+ goto nodecode; -+ -+ /* Adjust crc to point to subset of data we want: */ -+ crc.offset += rbio->offset_into_extent; -+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); -+ -+ if (crc_is_compressed(crc)) { -+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ if (ret) -+ goto decrypt_err; -+ -+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && -+ !c->opts.no_data_io) -+ goto decompression_err; -+ } else { -+ /* don't need to decrypt the entire bio: */ -+ nonce = nonce_add(nonce, crc.offset << 9); -+ bio_advance(src, crc.offset << 9); -+ -+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); -+ src->bi_iter.bi_size = dst_iter.bi_size; -+ -+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ if (ret) -+ goto decrypt_err; -+ -+ if (rbio->bounce) { -+ struct bvec_iter src_iter = src->bi_iter; -+ -+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); -+ } -+ } -+ -+ if (rbio->promote) { -+ /* -+ * Re encrypt data we decrypted, so it's consistent with -+ * rbio->crc: -+ */ -+ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ if (ret) -+ goto decrypt_err; -+ -+ promote_start(rbio->promote, rbio); -+ rbio->promote = NULL; -+ } -+nodecode: -+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { -+ rbio = bch2_rbio_free(rbio); -+ bch2_rbio_done(rbio); -+ } -+out: -+ memalloc_nofs_restore(nofs_flags); -+ return; -+csum_err: -+ /* -+ * Checksum error: if the bio wasn't bounced, we may have been -+ * reading into buffers owned by userspace (that userspace can -+ * scribble over) - retry the read, bouncing it this time: -+ */ -+ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { -+ rbio->flags |= BCH_READ_MUST_BOUNCE; -+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); -+ goto out; -+ } -+ -+ bch_err_inum_offset_ratelimited(ca, -+ rbio->read_pos.inode, -+ rbio->read_pos.offset << 9, -+ "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", -+ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, -+ csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); -+ bch2_io_error(ca); -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ goto out; -+decompression_err: -+ bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, -+ rbio->read_pos.offset << 9, -+ "decompression error"); -+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); -+ goto out; -+decrypt_err: -+ bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, -+ rbio->read_pos.offset << 9, -+ "decrypt error"); -+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); -+ goto out; -+} -+ -+static void bch2_read_endio(struct bio *bio) -+{ -+ struct bch_read_bio *rbio = -+ container_of(bio, struct bch_read_bio, bio); -+ struct bch_fs *c = rbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); -+ struct workqueue_struct *wq = NULL; -+ enum rbio_context context = RBIO_CONTEXT_NULL; -+ -+ if (rbio->have_ioref) { -+ bch2_latency_acct(ca, rbio->submit_time, 
READ); -+ percpu_ref_put(&ca->io_ref); -+ } -+ -+ if (!rbio->split) -+ rbio->bio.bi_end_io = rbio->end_io; -+ -+ if (bch2_dev_inum_io_err_on(bio->bi_status, ca, -+ rbio->read_pos.inode, -+ rbio->read_pos.offset, -+ "data read error: %s", -+ bch2_blk_status_to_str(bio->bi_status))) { -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); -+ return; -+ } -+ -+ if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || -+ ptr_stale(ca, &rbio->pick.ptr)) { -+ trace_and_count(c, read_reuse_race, &rbio->bio); -+ -+ if (rbio->flags & BCH_READ_RETRY_IF_STALE) -+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); -+ else -+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); -+ return; -+ } -+ -+ if (rbio->narrow_crcs || -+ rbio->promote || -+ crc_is_compressed(rbio->pick.crc) || -+ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) -+ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; -+ else if (rbio->pick.crc.csum_type) -+ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; -+ -+ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); -+} -+ -+int __bch2_read_indirect_extent(struct btree_trans *trans, -+ unsigned *offset_into_extent, -+ struct bkey_buf *orig_k) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ u64 reflink_offset; -+ int ret; -+ -+ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + -+ *offset_into_extent; -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, -+ POS(0, reflink_offset), 0); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (k.k->type != KEY_TYPE_reflink_v && -+ k.k->type != KEY_TYPE_indirect_inline_data) { -+ bch_err_inum_offset_ratelimited(trans->c, -+ orig_k->k->k.p.inode, -+ orig_k->k->k.p.offset << 9, -+ "%llu len %u points to nonexistent indirect extent %llu", -+ orig_k->k->k.p.offset, -+ orig_k->k->k.size, -+ reflink_offset); -+ bch2_inconsistent_error(trans->c); -+ ret = -EIO; -+ goto err; -+ } -+ -+ *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); -+ bch2_bkey_buf_reassemble(orig_k, trans->c, k); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, -+ struct bkey_s_c k, -+ struct bch_extent_ptr ptr) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); -+ struct btree_iter iter; -+ struct printbuf buf = PRINTBUF; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, -+ PTR_BUCKET_POS(c, &ptr), -+ BTREE_ITER_CACHED); -+ -+ prt_printf(&buf, "Attempting to read from stale dirty pointer:"); -+ printbuf_indent_add(&buf, 2); -+ prt_newline(&buf); -+ -+ bch2_bkey_val_to_text(&buf, c, k); -+ prt_newline(&buf); -+ -+ prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); -+ -+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); -+ if (!ret) { -+ prt_newline(&buf); -+ bch2_bkey_val_to_text(&buf, c, k); -+ } -+ -+ bch2_fs_inconsistent(c, "%s", buf.buf); -+ -+ bch2_trans_iter_exit(trans, &iter); -+ printbuf_exit(&buf); -+} -+ -+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, -+ struct bvec_iter iter, struct bpos read_pos, -+ enum btree_id data_btree, struct bkey_s_c k, -+ unsigned offset_into_extent, -+ struct bch_io_failures *failed, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct extent_ptr_decoded pick; -+ struct bch_read_bio *rbio = NULL; -+ struct bch_dev *ca = NULL; -+ struct promote_op *promote = NULL; -+ bool bounce = false, read_full = false, narrow_crcs = false; 
-+ struct bpos data_pos = bkey_start_pos(k.k); -+ int pick_ret; -+ -+ if (bkey_extent_is_inline_data(k.k)) { -+ unsigned bytes = min_t(unsigned, iter.bi_size, -+ bkey_inline_data_bytes(k.k)); -+ -+ swap(iter.bi_size, bytes); -+ memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); -+ swap(iter.bi_size, bytes); -+ bio_advance_iter(&orig->bio, &iter, bytes); -+ zero_fill_bio_iter(&orig->bio, iter); -+ goto out_read_done; -+ } -+retry_pick: -+ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); -+ -+ /* hole or reservation - just zero fill: */ -+ if (!pick_ret) -+ goto hole; -+ -+ if (pick_ret < 0) { -+ bch_err_inum_offset_ratelimited(c, -+ read_pos.inode, read_pos.offset << 9, -+ "no device to read from"); -+ goto err; -+ } -+ -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ -+ /* -+ * Stale dirty pointers are treated as IO errors, but @failed isn't -+ * allocated unless we're in the retry path - so if we're not in the -+ * retry path, don't check here, it'll be caught in bch2_read_endio() -+ * and we'll end up in the retry path: -+ */ -+ if ((flags & BCH_READ_IN_RETRY) && -+ !pick.ptr.cached && -+ unlikely(ptr_stale(ca, &pick.ptr))) { -+ read_from_stale_dirty_pointer(trans, k, pick.ptr); -+ bch2_mark_io_failure(failed, &pick); -+ goto retry_pick; -+ } -+ -+ /* -+ * Unlock the iterator while the btree node's lock is still in -+ * cache, before doing the IO: -+ */ -+ bch2_trans_unlock(trans); -+ -+ if (flags & BCH_READ_NODECODE) { -+ /* -+ * can happen if we retry, and the extent we were going to read -+ * has been merged in the meantime: -+ */ -+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) -+ goto hole; -+ -+ iter.bi_size = pick.crc.compressed_size << 9; -+ goto get_bio; -+ } -+ -+ if (!(flags & BCH_READ_LAST_FRAGMENT) || -+ bio_flagged(&orig->bio, BIO_CHAIN)) -+ flags |= BCH_READ_MUST_CLONE; -+ -+ narrow_crcs = !(flags & BCH_READ_IN_RETRY) && -+ bch2_can_narrow_extent_crcs(k, pick.crc); -+ -+ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) -+ flags |= BCH_READ_MUST_BOUNCE; -+ -+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); -+ -+ if (crc_is_compressed(pick.crc) || -+ (pick.crc.csum_type != BCH_CSUM_none && -+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -+ (bch2_csum_type_is_encryption(pick.crc.csum_type) && -+ (flags & BCH_READ_USER_MAPPED)) || -+ (flags & BCH_READ_MUST_BOUNCE)))) { -+ read_full = true; -+ bounce = true; -+ } -+ -+ if (orig->opts.promote_target) -+ promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, -+ &rbio, &bounce, &read_full); -+ -+ if (!read_full) { -+ EBUG_ON(crc_is_compressed(pick.crc)); -+ EBUG_ON(pick.crc.csum_type && -+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -+ bvec_iter_sectors(iter) != pick.crc.live_size || -+ pick.crc.offset || -+ offset_into_extent)); -+ -+ data_pos.offset += offset_into_extent; -+ pick.ptr.offset += pick.crc.offset + -+ offset_into_extent; -+ offset_into_extent = 0; -+ pick.crc.compressed_size = bvec_iter_sectors(iter); -+ pick.crc.uncompressed_size = bvec_iter_sectors(iter); -+ pick.crc.offset = 0; -+ pick.crc.live_size = bvec_iter_sectors(iter); -+ offset_into_extent = 0; -+ } -+get_bio: -+ if (rbio) { -+ /* -+ * promote already allocated bounce rbio: -+ * promote needs to allocate a bio big enough for uncompressing -+ * data in the write path, but we're not going to use it all -+ * here: -+ */ -+ EBUG_ON(rbio->bio.bi_iter.bi_size < -+ pick.crc.compressed_size << 9); -+ rbio->bio.bi_iter.bi_size = -+ pick.crc.compressed_size << 9; -+ } else 
if (bounce) { -+ unsigned sectors = pick.crc.compressed_size; -+ -+ rbio = rbio_init(bio_alloc_bioset(NULL, -+ DIV_ROUND_UP(sectors, PAGE_SECTORS), -+ 0, -+ GFP_NOFS, -+ &c->bio_read_split), -+ orig->opts); -+ -+ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); -+ rbio->bounce = true; -+ rbio->split = true; -+ } else if (flags & BCH_READ_MUST_CLONE) { -+ /* -+ * Have to clone if there were any splits, due to error -+ * reporting issues (if a split errored, and retrying didn't -+ * work, when it reports the error to its parent (us) we don't -+ * know if the error was from our bio, and we should retry, or -+ * from the whole bio, in which case we don't want to retry and -+ * lose the error) -+ */ -+ rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, -+ &c->bio_read_split), -+ orig->opts); -+ rbio->bio.bi_iter = iter; -+ rbio->split = true; -+ } else { -+ rbio = orig; -+ rbio->bio.bi_iter = iter; -+ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); -+ } -+ -+ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); -+ -+ rbio->c = c; -+ rbio->submit_time = local_clock(); -+ if (rbio->split) -+ rbio->parent = orig; -+ else -+ rbio->end_io = orig->bio.bi_end_io; -+ rbio->bvec_iter = iter; -+ rbio->offset_into_extent= offset_into_extent; -+ rbio->flags = flags; -+ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); -+ rbio->narrow_crcs = narrow_crcs; -+ rbio->hole = 0; -+ rbio->retry = 0; -+ rbio->context = 0; -+ /* XXX: only initialize this if needed */ -+ rbio->devs_have = bch2_bkey_devs(k); -+ rbio->pick = pick; -+ rbio->subvol = orig->subvol; -+ rbio->read_pos = read_pos; -+ rbio->data_btree = data_btree; -+ rbio->data_pos = data_pos; -+ rbio->version = k.k->version; -+ rbio->promote = promote; -+ INIT_WORK(&rbio->work, NULL); -+ -+ rbio->bio.bi_opf = orig->bio.bi_opf; -+ rbio->bio.bi_iter.bi_sector = pick.ptr.offset; -+ rbio->bio.bi_end_io = bch2_read_endio; -+ -+ if (rbio->bounce) -+ trace_and_count(c, read_bounce, &rbio->bio); -+ -+ this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); -+ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); -+ -+ /* -+ * If it's being moved internally, we don't want to flag it as a cache -+ * hit: -+ */ -+ if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) -+ bch2_bucket_io_time_reset(trans, pick.ptr.dev, -+ PTR_BUCKET_NR(ca, &pick.ptr), READ); -+ -+ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { -+ bio_inc_remaining(&orig->bio); -+ trace_and_count(c, read_split, &orig->bio); -+ } -+ -+ if (!rbio->pick.idx) { -+ if (!rbio->have_ioref) { -+ bch_err_inum_offset_ratelimited(c, -+ read_pos.inode, -+ read_pos.offset << 9, -+ "no device to read from"); -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ goto out; -+ } -+ -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], -+ bio_sectors(&rbio->bio)); -+ bio_set_dev(&rbio->bio, ca->disk_sb.bdev); -+ -+ if (unlikely(c->opts.no_data_io)) { -+ if (likely(!(flags & BCH_READ_IN_RETRY))) -+ bio_endio(&rbio->bio); -+ } else { -+ if (likely(!(flags & BCH_READ_IN_RETRY))) -+ submit_bio(&rbio->bio); -+ else -+ submit_bio_wait(&rbio->bio); -+ } -+ -+ /* -+ * We just submitted IO which may block, we expect relock fail -+ * events and shouldn't count them: -+ */ -+ trans->notrace_relock_fail = true; -+ } else { -+ /* Attempting reconstruct read: */ -+ if (bch2_ec_read_extent(c, rbio)) { -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ goto out; -+ } -+ -+ if (likely(!(flags & BCH_READ_IN_RETRY))) -+ bio_endio(&rbio->bio); -+ } 
-+out: -+ if (likely(!(flags & BCH_READ_IN_RETRY))) { -+ return 0; -+ } else { -+ int ret; -+ -+ rbio->context = RBIO_CONTEXT_UNBOUND; -+ bch2_read_endio(&rbio->bio); -+ -+ ret = rbio->retry; -+ rbio = bch2_rbio_free(rbio); -+ -+ if (ret == READ_RETRY_AVOID) { -+ bch2_mark_io_failure(failed, &pick); -+ ret = READ_RETRY; -+ } -+ -+ if (!ret) -+ goto out_read_done; -+ -+ return ret; -+ } -+ -+err: -+ if (flags & BCH_READ_IN_RETRY) -+ return READ_ERR; -+ -+ orig->bio.bi_status = BLK_STS_IOERR; -+ goto out_read_done; -+ -+hole: -+ /* -+ * won't normally happen in the BCH_READ_NODECODE -+ * (bch2_move_extent()) path, but if we retry and the extent we wanted -+ * to read no longer exists we have to signal that: -+ */ -+ if (flags & BCH_READ_NODECODE) -+ orig->hole = true; -+ -+ zero_fill_bio_iter(&orig->bio, iter); -+out_read_done: -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ bch2_rbio_done(orig); -+ return 0; -+} -+ -+void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, -+ struct bvec_iter bvec_iter, subvol_inum inum, -+ struct bch_io_failures *failed, unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_buf sk; -+ struct bkey_s_c k; -+ u32 snapshot; -+ int ret; -+ -+ BUG_ON(flags & BCH_READ_NODECODE); -+ -+ bch2_bkey_buf_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ iter = (struct btree_iter) { NULL }; -+ -+ ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ SPOS(inum.inum, bvec_iter.bi_sector, snapshot), -+ BTREE_ITER_SLOTS); -+ while (1) { -+ unsigned bytes, sectors, offset_into_extent; -+ enum btree_id data_btree = BTREE_ID_extents; -+ -+ /* -+ * read_extent -> io_time_reset may cause a transaction restart -+ * without returning an error, we need to check for that here: -+ */ -+ ret = bch2_trans_relock(&trans); -+ if (ret) -+ break; -+ -+ bch2_btree_iter_set_pos(&iter, -+ POS(inum.inum, bvec_iter.bi_sector)); -+ -+ k = bch2_btree_iter_peek_slot(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ offset_into_extent = iter.pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ -+ ret = bch2_read_indirect_extent(&trans, &data_btree, -+ &offset_into_extent, &sk); -+ if (ret) -+ break; -+ -+ k = bkey_i_to_s_c(sk.k); -+ -+ /* -+ * With indirect extents, the amount of data to read is the min -+ * of the original extent and the indirect extent: -+ */ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; -+ swap(bvec_iter.bi_size, bytes); -+ -+ if (bvec_iter.bi_size == bytes) -+ flags |= BCH_READ_LAST_FRAGMENT; -+ -+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, -+ data_btree, k, -+ offset_into_extent, failed, flags); -+ if (ret) -+ break; -+ -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ break; -+ -+ swap(bvec_iter.bi_size, bytes); -+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes); -+ -+ ret = btree_trans_too_many_iters(&trans); -+ if (ret) -+ break; -+ } -+err: -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || -+ ret == READ_RETRY || -+ ret == READ_RETRY_AVOID) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&sk, c); -+ -+ if (ret) { -+ bch_err_inum_offset_ratelimited(c, inum.inum, -+ bvec_iter.bi_sector << 9, -+ "read error %i from btree lookup", ret); -+ rbio->bio.bi_status = 
BLK_STS_IOERR; -+ bch2_rbio_done(rbio); -+ } -+} -+ -+void bch2_fs_io_exit(struct bch_fs *c) -+{ -+ if (c->promote_table.tbl) -+ rhashtable_destroy(&c->promote_table); -+ mempool_exit(&c->bio_bounce_pages); -+ bioset_exit(&c->bio_write); -+ bioset_exit(&c->bio_read_split); -+ bioset_exit(&c->bio_read); -+} -+ -+int bch2_fs_io_init(struct bch_fs *c) -+{ -+ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_bio_read_init; -+ -+ if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_bio_read_split_init; -+ -+ if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), -+ BIOSET_NEED_BVECS)) -+ return -BCH_ERR_ENOMEM_bio_write_init; -+ -+ if (mempool_init_page_pool(&c->bio_bounce_pages, -+ max_t(unsigned, -+ c->opts.btree_node_size, -+ c->opts.encoded_extent_max) / -+ PAGE_SIZE, 0)) -+ return -BCH_ERR_ENOMEM_bio_bounce_pages_init; -+ -+ if (rhashtable_init(&c->promote_table, &bch_promote_params)) -+ return -BCH_ERR_ENOMEM_promote_table_init; -+ -+ return 0; -+} -diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h -new file mode 100644 -index 000000000..831e3f1b7 ---- /dev/null -+++ b/fs/bcachefs/io.h -@@ -0,0 +1,202 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IO_H -+#define _BCACHEFS_IO_H -+ -+#include "checksum.h" -+#include "bkey_buf.h" -+#include "io_types.h" -+ -+#define to_wbio(_bio) \ -+ container_of((_bio), struct bch_write_bio, bio) -+ -+#define to_rbio(_bio) \ -+ container_of((_bio), struct bch_read_bio, bio) -+ -+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); -+ -+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -+void bch2_latency_acct(struct bch_dev *, u64, int); -+#else -+static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -+#endif -+ -+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, -+ enum bch_data_type, const struct bkey_i *, bool); -+ -+#define BLK_STS_REMOVED ((__force blk_status_t)128) -+ -+const char *bch2_blk_status_to_str(blk_status_t); -+ -+#define BCH_WRITE_FLAGS() \ -+ x(ALLOC_NOWAIT) \ -+ x(CACHED) \ -+ x(DATA_ENCODED) \ -+ x(PAGES_STABLE) \ -+ x(PAGES_OWNED) \ -+ x(ONLY_SPECIFIED_DEVS) \ -+ x(WROTE_DATA_INLINE) \ -+ x(FROM_INTERNAL) \ -+ x(CHECK_ENOSPC) \ -+ x(SYNC) \ -+ x(MOVE) \ -+ x(IN_WORKER) \ -+ x(DONE) \ -+ x(IO_ERROR) \ -+ x(CONVERT_UNWRITTEN) -+ -+enum __bch_write_flags { -+#define x(f) __BCH_WRITE_##f, -+ BCH_WRITE_FLAGS() -+#undef x -+}; -+ -+enum bch_write_flags { -+#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), -+ BCH_WRITE_FLAGS() -+#undef x -+}; -+ -+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) -+{ -+ return op->watermark == BCH_WATERMARK_copygc -+ ? 
op->c->copygc_wq -+ : op->c->btree_update_wq; -+} -+ -+int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, bool *, s64 *, s64 *); -+int bch2_extent_update(struct btree_trans *, subvol_inum, -+ struct btree_iter *, struct bkey_i *, -+ struct disk_reservation *, u64, s64 *, bool); -+int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, -+ unsigned, struct bch_io_opts, s64 *, -+ struct write_point_specifier); -+ -+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, -+ subvol_inum, u64, s64 *); -+int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); -+ -+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, -+ struct bch_io_opts opts) -+{ -+ op->c = c; -+ op->end_io = NULL; -+ op->flags = 0; -+ op->written = 0; -+ op->error = 0; -+ op->csum_type = bch2_data_checksum_type(c, opts); -+ op->compression_opt = opts.compression; -+ op->nr_replicas = 0; -+ op->nr_replicas_required = c->opts.data_replicas_required; -+ op->watermark = BCH_WATERMARK_normal; -+ op->incompressible = 0; -+ op->open_buckets.nr = 0; -+ op->devs_have.nr = 0; -+ op->target = 0; -+ op->opts = opts; -+ op->subvol = 0; -+ op->pos = POS_MAX; -+ op->version = ZERO_VERSION; -+ op->write_point = (struct write_point_specifier) { 0 }; -+ op->res = (struct disk_reservation) { 0 }; -+ op->new_i_size = U64_MAX; -+ op->i_sectors_delta = 0; -+ op->devs_need_flush = NULL; -+} -+ -+void bch2_write(struct closure *); -+ -+void bch2_write_point_do_index_updates(struct work_struct *); -+ -+static inline struct bch_write_bio *wbio_init(struct bio *bio) -+{ -+ struct bch_write_bio *wbio = to_wbio(bio); -+ -+ memset(&wbio->wbio, 0, sizeof(wbio->wbio)); -+ return wbio; -+} -+ -+void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); -+ -+struct bch_devs_mask; -+struct cache_promote_op; -+struct extent_ptr_decoded; -+ -+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, -+ struct bkey_buf *); -+ -+static inline int bch2_read_indirect_extent(struct btree_trans *trans, -+ enum btree_id *data_btree, -+ unsigned *offset_into_extent, -+ struct bkey_buf *k) -+{ -+ if (k->k->k.type != KEY_TYPE_reflink_p) -+ return 0; -+ -+ *data_btree = BTREE_ID_reflink; -+ return __bch2_read_indirect_extent(trans, offset_into_extent, k); -+} -+ -+enum bch_read_flags { -+ BCH_READ_RETRY_IF_STALE = 1 << 0, -+ BCH_READ_MAY_PROMOTE = 1 << 1, -+ BCH_READ_USER_MAPPED = 1 << 2, -+ BCH_READ_NODECODE = 1 << 3, -+ BCH_READ_LAST_FRAGMENT = 1 << 4, -+ -+ /* internal: */ -+ BCH_READ_MUST_BOUNCE = 1 << 5, -+ BCH_READ_MUST_CLONE = 1 << 6, -+ BCH_READ_IN_RETRY = 1 << 7, -+}; -+ -+int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, -+ struct bvec_iter, struct bpos, enum btree_id, -+ struct bkey_s_c, unsigned, -+ struct bch_io_failures *, unsigned); -+ -+static inline void bch2_read_extent(struct btree_trans *trans, -+ struct bch_read_bio *rbio, struct bpos read_pos, -+ enum btree_id data_btree, struct bkey_s_c k, -+ unsigned offset_into_extent, unsigned flags) -+{ -+ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, -+ data_btree, k, offset_into_extent, NULL, flags); -+} -+ -+void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, -+ subvol_inum, struct bch_io_failures *, unsigned flags); -+ -+static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, -+ subvol_inum inum) -+{ -+ struct bch_io_failures failed = { .nr = 0 }; -+ -+ BUG_ON(rbio->_state); -+ -+ rbio->c = c; -+ 
rbio->start_time = local_clock(); -+ rbio->subvol = inum.subvol; -+ -+ __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, -+ BCH_READ_RETRY_IF_STALE| -+ BCH_READ_MAY_PROMOTE| -+ BCH_READ_USER_MAPPED); -+} -+ -+static inline struct bch_read_bio *rbio_init(struct bio *bio, -+ struct bch_io_opts opts) -+{ -+ struct bch_read_bio *rbio = to_rbio(bio); -+ -+ rbio->_state = 0; -+ rbio->promote = NULL; -+ rbio->opts = opts; -+ return rbio; -+} -+ -+void bch2_fs_io_exit(struct bch_fs *); -+int bch2_fs_io_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_IO_H */ -diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h -new file mode 100644 -index 000000000..737f16d78 ---- /dev/null -+++ b/fs/bcachefs/io_types.h -@@ -0,0 +1,165 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IO_TYPES_H -+#define _BCACHEFS_IO_TYPES_H -+ -+#include "alloc_types.h" -+#include "btree_types.h" -+#include "buckets_types.h" -+#include "extents_types.h" -+#include "keylist_types.h" -+#include "opts.h" -+#include "super_types.h" -+ -+#include -+#include -+ -+struct bch_read_bio { -+ struct bch_fs *c; -+ u64 start_time; -+ u64 submit_time; -+ -+ /* -+ * Reads will often have to be split, and if the extent being read from -+ * was checksummed or compressed we'll also have to allocate bounce -+ * buffers and copy the data back into the original bio. -+ * -+ * If we didn't have to split, we have to save and restore the original -+ * bi_end_io - @split below indicates which: -+ */ -+ union { -+ struct bch_read_bio *parent; -+ bio_end_io_t *end_io; -+ }; -+ -+ /* -+ * Saved copy of bio->bi_iter, from submission time - allows us to -+ * resubmit on IO error, and also to copy data back to the original bio -+ * when we're bouncing: -+ */ -+ struct bvec_iter bvec_iter; -+ -+ unsigned offset_into_extent; -+ -+ u16 flags; -+ union { -+ struct { -+ u16 bounce:1, -+ split:1, -+ kmalloc:1, -+ have_ioref:1, -+ narrow_crcs:1, -+ hole:1, -+ retry:2, -+ context:2; -+ }; -+ u16 _state; -+ }; -+ -+ struct bch_devs_list devs_have; -+ -+ struct extent_ptr_decoded pick; -+ -+ /* -+ * pos we read from - different from data_pos for indirect extents: -+ */ -+ u32 subvol; -+ struct bpos read_pos; -+ -+ /* -+ * start pos of data we read (may not be pos of data we want) - for -+ * promote, narrow extents paths: -+ */ -+ enum btree_id data_btree; -+ struct bpos data_pos; -+ struct bversion version; -+ -+ struct promote_op *promote; -+ -+ struct bch_io_opts opts; -+ -+ struct work_struct work; -+ -+ struct bio bio; -+}; -+ -+struct bch_write_bio { -+ struct_group(wbio, -+ struct bch_fs *c; -+ struct bch_write_bio *parent; -+ -+ u64 submit_time; -+ u64 inode_offset; -+ -+ struct bch_devs_list failed; -+ u8 dev; -+ -+ unsigned split:1, -+ bounce:1, -+ put_bio:1, -+ have_ioref:1, -+ nocow:1, -+ used_mempool:1, -+ first_btree_write:1; -+ ); -+ -+ struct bio bio; -+}; -+ -+struct bch_write_op { -+ struct closure cl; -+ struct bch_fs *c; -+ void (*end_io)(struct bch_write_op *); -+ u64 start_time; -+ -+ unsigned written; /* sectors */ -+ u16 flags; -+ s16 error; /* dio write path expects it to hold -ERESTARTSYS... 
*/ -+ -+ unsigned compression_opt:8; -+ unsigned csum_type:4; -+ unsigned nr_replicas:4; -+ unsigned nr_replicas_required:4; -+ unsigned watermark:3; -+ unsigned incompressible:1; -+ unsigned stripe_waited:1; -+ -+ struct bch_devs_list devs_have; -+ u16 target; -+ u16 nonce; -+ struct bch_io_opts opts; -+ -+ u32 subvol; -+ struct bpos pos; -+ struct bversion version; -+ -+ /* For BCH_WRITE_DATA_ENCODED: */ -+ struct bch_extent_crc_unpacked crc; -+ -+ struct write_point_specifier write_point; -+ -+ struct write_point *wp; -+ struct list_head wp_list; -+ -+ struct disk_reservation res; -+ -+ struct open_buckets open_buckets; -+ -+ u64 new_i_size; -+ s64 i_sectors_delta; -+ -+ struct bch_devs_mask failed; -+ -+ struct keylist insert_keys; -+ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; -+ -+ /* -+ * Bitmask of devices that have had nocow writes issued to them since -+ * last flush: -+ */ -+ struct bch_devs_mask *devs_need_flush; -+ -+ /* Must be last: */ -+ struct bch_write_bio wbio; -+}; -+ -+#endif /* _BCACHEFS_IO_TYPES_H */ -diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c -new file mode 100644 -index 000000000..055920c26 ---- /dev/null -+++ b/fs/bcachefs/journal.c -@@ -0,0 +1,1438 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcachefs journalling code, for btree insertions -+ * -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "error.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "journal_sb.h" -+#include "journal_seq_blacklist.h" -+#include "trace.h" -+ -+static const char * const bch2_journal_errors[] = { -+#define x(n) #n, -+ JOURNAL_ERRORS() -+#undef x -+ NULL -+}; -+ -+static inline bool journal_seq_unwritten(struct journal *j, u64 seq) -+{ -+ return seq > j->seq_ondisk; -+} -+ -+static bool __journal_entry_is_open(union journal_res_state state) -+{ -+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -+} -+ -+static inline unsigned nr_unwritten_journal_entries(struct journal *j) -+{ -+ return atomic64_read(&j->seq) - j->seq_ondisk; -+} -+ -+static bool journal_entry_is_open(struct journal *j) -+{ -+ return __journal_entry_is_open(j->reservations); -+} -+ -+static inline struct journal_buf * -+journal_seq_to_buf(struct journal *j, u64 seq) -+{ -+ struct journal_buf *buf = NULL; -+ -+ EBUG_ON(seq > journal_cur_seq(j)); -+ -+ if (journal_seq_unwritten(j, seq)) { -+ buf = j->buf + (seq & JOURNAL_BUF_MASK); -+ EBUG_ON(le64_to_cpu(buf->data->seq) != seq); -+ } -+ return buf; -+} -+ -+static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(p->list); i++) -+ INIT_LIST_HEAD(&p->list[i]); -+ INIT_LIST_HEAD(&p->flushed); -+ atomic_set(&p->count, count); -+ p->devs.nr = 0; -+} -+ -+/* -+ * Detect stuck journal conditions and trigger shutdown. Technically the journal -+ * can end up stuck for a variety of reasons, such as a blocked I/O, journal -+ * reservation lockup, etc. Since this is a fatal error with potentially -+ * unpredictable characteristics, we want to be fairly conservative before we -+ * decide to shut things down. -+ * -+ * Consider the journal stuck when it appears full with no ability to commit -+ * btree transactions, to discard journal buckets, nor acquire priority -+ * (reserved watermark) reservation. 
-+ */ -+static inline bool -+journal_error_check_stuck(struct journal *j, int error, unsigned flags) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ bool stuck = false; -+ struct printbuf buf = PRINTBUF; -+ -+ if (!(error == JOURNAL_ERR_journal_full || -+ error == JOURNAL_ERR_journal_pin_full) || -+ nr_unwritten_journal_entries(j) || -+ (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) -+ return stuck; -+ -+ spin_lock(&j->lock); -+ -+ if (j->can_discard) { -+ spin_unlock(&j->lock); -+ return stuck; -+ } -+ -+ stuck = true; -+ -+ /* -+ * The journal shutdown path will set ->err_seq, but do it here first to -+ * serialize against concurrent failures and avoid duplicate error -+ * reports. -+ */ -+ if (j->err_seq) { -+ spin_unlock(&j->lock); -+ return stuck; -+ } -+ j->err_seq = journal_cur_seq(j); -+ spin_unlock(&j->lock); -+ -+ bch_err(c, "Journal stuck! Hava a pre-reservation but journal full (error %s)", -+ bch2_journal_errors[error]); -+ bch2_journal_debug_to_text(&buf, j); -+ bch_err(c, "%s", buf.buf); -+ -+ printbuf_reset(&buf); -+ bch2_journal_pins_to_text(&buf, j); -+ bch_err(c, "Journal pins:\n%s", buf.buf); -+ printbuf_exit(&buf); -+ -+ bch2_fatal_error(c); -+ dump_stack(); -+ -+ return stuck; -+} -+ -+/* journal entry close/open: */ -+ -+void __bch2_journal_buf_put(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ -+ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); -+} -+ -+/* -+ * Returns true if journal entry is now closed: -+ * -+ * We don't close a journal_buf until the next journal_buf is finished writing, -+ * and can be opened again - this also initializes the next journal_buf: -+ */ -+static void __journal_entry_close(struct journal *j, unsigned closed_val) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *buf = journal_cur_buf(j); -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ unsigned sectors; -+ -+ BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && -+ closed_val != JOURNAL_ENTRY_ERROR_VAL); -+ -+ lockdep_assert_held(&j->lock); -+ -+ do { -+ old.v = new.v = v; -+ new.cur_entry_offset = closed_val; -+ -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || -+ old.cur_entry_offset == new.cur_entry_offset) -+ return; -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ if (!__journal_entry_is_open(old)) -+ return; -+ -+ /* Close out old buffer: */ -+ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); -+ -+ sectors = vstruct_blocks_plus(buf->data, c->block_bits, -+ buf->u64s_reserved) << c->block_bits; -+ BUG_ON(sectors > buf->sectors); -+ buf->sectors = sectors; -+ -+ /* -+ * We have to set last_seq here, _before_ opening a new journal entry: -+ * -+ * A threads may replace an old pin with a new pin on their current -+ * journal reservation - the expectation being that the journal will -+ * contain either what the old pin protected or what the new pin -+ * protects. -+ * -+ * After the old pin is dropped journal_last_seq() won't include the old -+ * pin, so we can only write the updated last_seq on the entry that -+ * contains whatever the new pin protects. -+ * -+ * Restated, we can _not_ update last_seq for a given entry if there -+ * could be a newer entry open with reservations/pins that have been -+ * taken against it. 
-+ * -+ * Hence, we want update/set last_seq on the current journal entry right -+ * before we open a new one: -+ */ -+ buf->last_seq = journal_last_seq(j); -+ buf->data->last_seq = cpu_to_le64(buf->last_seq); -+ BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); -+ -+ __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); -+ -+ cancel_delayed_work(&j->write_work); -+ -+ bch2_journal_space_available(j); -+ -+ bch2_journal_buf_put(j, old.idx); -+} -+ -+void bch2_journal_halt(struct journal *j) -+{ -+ spin_lock(&j->lock); -+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); -+ if (!j->err_seq) -+ j->err_seq = journal_cur_seq(j); -+ journal_wake(j); -+ spin_unlock(&j->lock); -+} -+ -+static bool journal_entry_want_write(struct journal *j) -+{ -+ bool ret = !journal_entry_is_open(j) || -+ journal_cur_seq(j) == journal_last_unwritten_seq(j); -+ -+ /* Don't close it yet if we already have a write in flight: */ -+ if (ret) -+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); -+ else if (nr_unwritten_journal_entries(j)) { -+ struct journal_buf *buf = journal_cur_buf(j); -+ -+ if (!buf->flush_time) { -+ buf->flush_time = local_clock() ?: 1; -+ buf->expires = jiffies; -+ } -+ } -+ -+ return ret; -+} -+ -+static bool journal_entry_close(struct journal *j) -+{ -+ bool ret; -+ -+ spin_lock(&j->lock); -+ ret = journal_entry_want_write(j); -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+/* -+ * should _only_ called from journal_res_get() - when we actually want a -+ * journal reservation - journal entry is open means journal is dirty: -+ */ -+static int journal_entry_open(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *buf = j->buf + -+ ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); -+ union journal_res_state old, new; -+ int u64s; -+ u64 v; -+ -+ lockdep_assert_held(&j->lock); -+ BUG_ON(journal_entry_is_open(j)); -+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); -+ -+ if (j->blocked) -+ return JOURNAL_ERR_blocked; -+ -+ if (j->cur_entry_error) -+ return j->cur_entry_error; -+ -+ if (bch2_journal_error(j)) -+ return JOURNAL_ERR_insufficient_devices; /* -EROFS */ -+ -+ if (!fifo_free(&j->pin)) -+ return JOURNAL_ERR_journal_pin_full; -+ -+ if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) -+ return JOURNAL_ERR_max_in_flight; -+ -+ BUG_ON(!j->cur_entry_sectors); -+ -+ buf->expires = -+ (journal_cur_seq(j) == j->flushed_seq_ondisk -+ ? 
jiffies -+ : j->last_flush_write) + -+ msecs_to_jiffies(c->opts.journal_flush_delay); -+ -+ buf->u64s_reserved = j->entry_u64s_reserved; -+ buf->disk_sectors = j->cur_entry_sectors; -+ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); -+ -+ u64s = (int) (buf->sectors << 9) / sizeof(u64) - -+ journal_entry_overhead(j); -+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); -+ -+ if (u64s <= (ssize_t) j->early_journal_entries.nr) -+ return JOURNAL_ERR_journal_full; -+ -+ if (fifo_empty(&j->pin) && j->reclaim_thread) -+ wake_up_process(j->reclaim_thread); -+ -+ /* -+ * The fifo_push() needs to happen at the same time as j->seq is -+ * incremented for journal_last_seq() to be calculated correctly -+ */ -+ atomic64_inc(&j->seq); -+ journal_pin_list_init(fifo_push_ref(&j->pin), 1); -+ -+ BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); -+ -+ bkey_extent_init(&buf->key); -+ buf->noflush = false; -+ buf->must_flush = false; -+ buf->separate_flush = false; -+ buf->flush_time = 0; -+ -+ memset(buf->data, 0, sizeof(*buf->data)); -+ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); -+ buf->data->u64s = 0; -+ -+ if (j->early_journal_entries.nr) { -+ memcpy(buf->data->_data, j->early_journal_entries.data, -+ j->early_journal_entries.nr * sizeof(u64)); -+ le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr); -+ } -+ -+ /* -+ * Must be set before marking the journal entry as open: -+ */ -+ j->cur_entry_u64s = u64s; -+ -+ v = atomic64_read(&j->reservations.counter); -+ do { -+ old.v = new.v = v; -+ -+ BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); -+ -+ new.idx++; -+ BUG_ON(journal_state_count(new, new.idx)); -+ BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); -+ -+ journal_state_inc(&new); -+ -+ /* Handle any already added entries */ -+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ if (j->res_get_blocked_start) -+ bch2_time_stats_update(j->blocked_time, -+ j->res_get_blocked_start); -+ j->res_get_blocked_start = 0; -+ -+ mod_delayed_work(c->io_complete_wq, -+ &j->write_work, -+ msecs_to_jiffies(c->opts.journal_flush_delay)); -+ journal_wake(j); -+ -+ if (j->early_journal_entries.nr) -+ darray_exit(&j->early_journal_entries); -+ return 0; -+} -+ -+static bool journal_quiesced(struct journal *j) -+{ -+ bool ret = atomic64_read(&j->seq) == j->seq_ondisk; -+ -+ if (!ret) -+ journal_entry_close(j); -+ return ret; -+} -+ -+static void journal_quiesce(struct journal *j) -+{ -+ wait_event(j->wait, journal_quiesced(j)); -+} -+ -+static void journal_write_work(struct work_struct *work) -+{ -+ struct journal *j = container_of(work, struct journal, write_work.work); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ long delta; -+ -+ spin_lock(&j->lock); -+ if (!__journal_entry_is_open(j->reservations)) -+ goto unlock; -+ -+ delta = journal_cur_buf(j)->expires - jiffies; -+ -+ if (delta > 0) -+ mod_delayed_work(c->io_complete_wq, &j->write_work, delta); -+ else -+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); -+unlock: -+ spin_unlock(&j->lock); -+} -+ -+static int __journal_res_get(struct journal *j, struct journal_res *res, -+ unsigned flags) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *buf; -+ bool can_discard; -+ int ret; -+retry: -+ if (journal_res_get_fast(j, res, flags)) -+ return 0; -+ -+ if (bch2_journal_error(j)) -+ return -BCH_ERR_erofs_journal_err; -+ -+ spin_lock(&j->lock); -+ -+ 
/* check once more in case somebody else shut things down... */ -+ if (bch2_journal_error(j)) { -+ spin_unlock(&j->lock); -+ return -BCH_ERR_erofs_journal_err; -+ } -+ -+ /* -+ * Recheck after taking the lock, so we don't race with another thread -+ * that just did journal_entry_open() and call journal_entry_close() -+ * unnecessarily -+ */ -+ if (journal_res_get_fast(j, res, flags)) { -+ spin_unlock(&j->lock); -+ return 0; -+ } -+ -+ if ((flags & BCH_WATERMARK_MASK) < j->watermark) { -+ /* -+ * Don't want to close current journal entry, just need to -+ * invoke reclaim: -+ */ -+ ret = JOURNAL_ERR_journal_full; -+ goto unlock; -+ } -+ -+ /* -+ * If we couldn't get a reservation because the current buf filled up, -+ * and we had room for a bigger entry on disk, signal that we want to -+ * realloc the journal bufs: -+ */ -+ buf = journal_cur_buf(j); -+ if (journal_entry_is_open(j) && -+ buf->buf_size >> 9 < buf->disk_sectors && -+ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) -+ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); -+ -+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); -+ ret = journal_entry_open(j); -+ -+ if (ret == JOURNAL_ERR_max_in_flight) -+ trace_and_count(c, journal_entry_full, c); -+unlock: -+ if ((ret && ret != JOURNAL_ERR_insufficient_devices) && -+ !j->res_get_blocked_start) { -+ j->res_get_blocked_start = local_clock() ?: 1; -+ trace_and_count(c, journal_full, c); -+ } -+ -+ can_discard = j->can_discard; -+ spin_unlock(&j->lock); -+ -+ if (!ret) -+ goto retry; -+ if (journal_error_check_stuck(j, ret, flags)) -+ ret = -BCH_ERR_journal_res_get_blocked; -+ -+ /* -+ * Journal is full - can't rely on reclaim from work item due to -+ * freezing: -+ */ -+ if ((ret == JOURNAL_ERR_journal_full || -+ ret == JOURNAL_ERR_journal_pin_full) && -+ !(flags & JOURNAL_RES_GET_NONBLOCK)) { -+ if (can_discard) { -+ bch2_journal_do_discards(j); -+ goto retry; -+ } -+ -+ if (mutex_trylock(&j->reclaim_lock)) { -+ bch2_journal_reclaim(j); -+ mutex_unlock(&j->reclaim_lock); -+ } -+ } -+ -+ return ret == JOURNAL_ERR_insufficient_devices -+ ? -BCH_ERR_erofs_journal_err -+ : -BCH_ERR_journal_res_get_blocked; -+} -+ -+/* -+ * Essentially the entry function to the journaling code. When bcachefs is doing -+ * a btree insert, it calls this function to get the current journal write. -+ * Journal write is the structure used set up journal writes. The calling -+ * function will then add its keys to the structure, queuing them for the next -+ * write. -+ * -+ * To ensure forward progress, the current task must not be holding any -+ * btree node write locks. 
-+ */ -+int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, -+ unsigned flags) -+{ -+ int ret; -+ -+ closure_wait_event(&j->async_wait, -+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || -+ (flags & JOURNAL_RES_GET_NONBLOCK)); -+ return ret; -+} -+ -+/* journal_preres: */ -+ -+static bool journal_preres_available(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); -+ -+ if (!ret && mutex_trylock(&j->reclaim_lock)) { -+ bch2_journal_reclaim(j); -+ mutex_unlock(&j->reclaim_lock); -+ } -+ -+ return ret; -+} -+ -+int __bch2_journal_preres_get(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ int ret; -+ -+ closure_wait_event(&j->preres_wait, -+ (ret = bch2_journal_error(j)) || -+ journal_preres_available(j, res, new_u64s, flags)); -+ return ret; -+} -+ -+/* journal_entry_res: */ -+ -+void bch2_journal_entry_res_resize(struct journal *j, -+ struct journal_entry_res *res, -+ unsigned new_u64s) -+{ -+ union journal_res_state state; -+ int d = new_u64s - res->u64s; -+ -+ spin_lock(&j->lock); -+ -+ j->entry_u64s_reserved += d; -+ if (d <= 0) -+ goto out; -+ -+ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); -+ smp_mb(); -+ state = READ_ONCE(j->reservations); -+ -+ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && -+ state.cur_entry_offset > j->cur_entry_u64s) { -+ j->cur_entry_u64s += d; -+ /* -+ * Not enough room in current journal entry, have to flush it: -+ */ -+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); -+ } else { -+ journal_cur_buf(j)->u64s_reserved += d; -+ } -+out: -+ spin_unlock(&j->lock); -+ res->u64s += d; -+} -+ -+/* journal flushing: */ -+ -+/** -+ * bch2_journal_flush_seq_async - wait for a journal entry to be written -+ * -+ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if -+ * necessary -+ */ -+int bch2_journal_flush_seq_async(struct journal *j, u64 seq, -+ struct closure *parent) -+{ -+ struct journal_buf *buf; -+ int ret = 0; -+ -+ if (seq <= j->flushed_seq_ondisk) -+ return 1; -+ -+ spin_lock(&j->lock); -+ -+ if (WARN_ONCE(seq > journal_cur_seq(j), -+ "requested to flush journal seq %llu, but currently at %llu", -+ seq, journal_cur_seq(j))) -+ goto out; -+ -+ /* Recheck under lock: */ -+ if (j->err_seq && seq >= j->err_seq) { -+ ret = -EIO; -+ goto out; -+ } -+ -+ if (seq <= j->flushed_seq_ondisk) { -+ ret = 1; -+ goto out; -+ } -+ -+ /* if seq was written, but not flushed - flush a newer one instead */ -+ seq = max(seq, journal_last_unwritten_seq(j)); -+ -+recheck_need_open: -+ if (seq > journal_cur_seq(j)) { -+ struct journal_res res = { 0 }; -+ -+ if (journal_entry_is_open(j)) -+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); -+ -+ spin_unlock(&j->lock); -+ -+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); -+ if (ret) -+ return ret; -+ -+ seq = res.seq; -+ buf = j->buf + (seq & JOURNAL_BUF_MASK); -+ buf->must_flush = true; -+ -+ if (!buf->flush_time) { -+ buf->flush_time = local_clock() ?: 1; -+ buf->expires = jiffies; -+ } -+ -+ if (parent && !closure_wait(&buf->wait, parent)) -+ BUG(); -+ -+ bch2_journal_res_put(j, &res); -+ -+ spin_lock(&j->lock); -+ goto want_write; -+ } -+ -+ /* -+ * if write was kicked off without a flush, flush the next sequence -+ * number instead -+ */ -+ buf = journal_seq_to_buf(j, seq); -+ if (buf->noflush) { -+ seq++; -+ goto recheck_need_open; -+ } -+ -+ 
buf->must_flush = true; -+ -+ if (parent && !closure_wait(&buf->wait, parent)) -+ BUG(); -+want_write: -+ if (seq == journal_cur_seq(j)) -+ journal_entry_want_write(j); -+out: -+ spin_unlock(&j->lock); -+ return ret; -+} -+ -+int bch2_journal_flush_seq(struct journal *j, u64 seq) -+{ -+ u64 start_time = local_clock(); -+ int ret, ret2; -+ -+ /* -+ * Don't update time_stats when @seq is already flushed: -+ */ -+ if (seq <= j->flushed_seq_ondisk) -+ return 0; -+ -+ ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); -+ -+ if (!ret) -+ bch2_time_stats_update(j->flush_seq_time, start_time); -+ -+ return ret ?: ret2 < 0 ? ret2 : 0; -+} -+ -+/* -+ * bch2_journal_flush_async - if there is an open journal entry, or a journal -+ * still being written, write it and wait for the write to complete -+ */ -+void bch2_journal_flush_async(struct journal *j, struct closure *parent) -+{ -+ bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); -+} -+ -+int bch2_journal_flush(struct journal *j) -+{ -+ return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); -+} -+ -+/* -+ * bch2_journal_noflush_seq - tell the journal not to issue any flushes before -+ * @seq -+ */ -+bool bch2_journal_noflush_seq(struct journal *j, u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ u64 unwritten_seq; -+ bool ret = false; -+ -+ if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) -+ return false; -+ -+ if (seq <= c->journal.flushed_seq_ondisk) -+ return false; -+ -+ spin_lock(&j->lock); -+ if (seq <= c->journal.flushed_seq_ondisk) -+ goto out; -+ -+ for (unwritten_seq = journal_last_unwritten_seq(j); -+ unwritten_seq < seq; -+ unwritten_seq++) { -+ struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); -+ -+ /* journal write is already in flight, and was a flush write: */ -+ if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) -+ goto out; -+ -+ buf->noflush = true; -+ } -+ -+ ret = true; -+out: -+ spin_unlock(&j->lock); -+ return ret; -+} -+ -+int bch2_journal_meta(struct journal *j) -+{ -+ struct journal_buf *buf; -+ struct journal_res res; -+ int ret; -+ -+ memset(&res, 0, sizeof(res)); -+ -+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); -+ if (ret) -+ return ret; -+ -+ buf = j->buf + (res.seq & JOURNAL_BUF_MASK); -+ buf->must_flush = true; -+ -+ if (!buf->flush_time) { -+ buf->flush_time = local_clock() ?: 1; -+ buf->expires = jiffies; -+ } -+ -+ bch2_journal_res_put(j, &res); -+ -+ return bch2_journal_flush_seq(j, res.seq); -+} -+ -+/* block/unlock the journal: */ -+ -+void bch2_journal_unblock(struct journal *j) -+{ -+ spin_lock(&j->lock); -+ j->blocked--; -+ spin_unlock(&j->lock); -+ -+ journal_wake(j); -+} -+ -+void bch2_journal_block(struct journal *j) -+{ -+ spin_lock(&j->lock); -+ j->blocked++; -+ spin_unlock(&j->lock); -+ -+ journal_quiesce(j); -+} -+ -+/* allocate journal on a device: */ -+ -+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, -+ bool new_fs, struct closure *cl) -+{ -+ struct bch_fs *c = ca->fs; -+ struct journal_device *ja = &ca->journal; -+ u64 *new_bucket_seq = NULL, *new_buckets = NULL; -+ struct open_bucket **ob = NULL; -+ long *bu = NULL; -+ unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr; -+ int ret = 0; -+ -+ BUG_ON(nr <= ja->nr); -+ -+ bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); -+ ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); -+ new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); -+ new_bucket_seq = kcalloc(nr, sizeof(u64), 
GFP_KERNEL); -+ if (!bu || !ob || !new_buckets || !new_bucket_seq) { -+ ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; -+ goto err_free; -+ } -+ -+ for (nr_got = 0; nr_got < nr_want; nr_got++) { -+ if (new_fs) { -+ bu[nr_got] = bch2_bucket_alloc_new_fs(ca); -+ if (bu[nr_got] < 0) { -+ ret = -BCH_ERR_ENOSPC_bucket_alloc; -+ break; -+ } -+ } else { -+ ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl); -+ ret = PTR_ERR_OR_ZERO(ob[nr_got]); -+ if (ret) -+ break; -+ -+ ret = bch2_trans_run(c, -+ bch2_trans_mark_metadata_bucket(&trans, ca, -+ ob[nr_got]->bucket, BCH_DATA_journal, -+ ca->mi.bucket_size)); -+ if (ret) { -+ bch2_open_bucket_put(c, ob[nr_got]); -+ bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret)); -+ break; -+ } -+ -+ bu[nr_got] = ob[nr_got]->bucket; -+ } -+ } -+ -+ if (!nr_got) -+ goto err_free; -+ -+ /* Don't return an error if we successfully allocated some buckets: */ -+ ret = 0; -+ -+ if (c) { -+ bch2_journal_flush_all_pins(&c->journal); -+ bch2_journal_block(&c->journal); -+ mutex_lock(&c->sb_lock); -+ } -+ -+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); -+ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); -+ -+ BUG_ON(ja->discard_idx > ja->nr); -+ -+ pos = ja->discard_idx ?: ja->nr; -+ -+ memmove(new_buckets + pos + nr_got, -+ new_buckets + pos, -+ sizeof(new_buckets[0]) * (ja->nr - pos)); -+ memmove(new_bucket_seq + pos + nr_got, -+ new_bucket_seq + pos, -+ sizeof(new_bucket_seq[0]) * (ja->nr - pos)); -+ -+ for (i = 0; i < nr_got; i++) { -+ new_buckets[pos + i] = bu[i]; -+ new_bucket_seq[pos + i] = 0; -+ } -+ -+ nr = ja->nr + nr_got; -+ -+ ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr); -+ if (ret) -+ goto err_unblock; -+ -+ if (!new_fs) -+ bch2_write_super(c); -+ -+ /* Commit: */ -+ if (c) -+ spin_lock(&c->journal.lock); -+ -+ swap(new_buckets, ja->buckets); -+ swap(new_bucket_seq, ja->bucket_seq); -+ ja->nr = nr; -+ -+ if (pos <= ja->discard_idx) -+ ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr; -+ if (pos <= ja->dirty_idx_ondisk) -+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr; -+ if (pos <= ja->dirty_idx) -+ ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr; -+ if (pos <= ja->cur_idx) -+ ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr; -+ -+ if (c) -+ spin_unlock(&c->journal.lock); -+err_unblock: -+ if (c) { -+ bch2_journal_unblock(&c->journal); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ if (ret && !new_fs) -+ for (i = 0; i < nr_got; i++) -+ bch2_trans_run(c, -+ bch2_trans_mark_metadata_bucket(&trans, ca, -+ bu[i], BCH_DATA_free, 0)); -+err_free: -+ if (!new_fs) -+ for (i = 0; i < nr_got; i++) -+ bch2_open_bucket_put(c, ob[i]); -+ -+ kfree(new_bucket_seq); -+ kfree(new_buckets); -+ kfree(ob); -+ kfree(bu); -+ return ret; -+} -+ -+/* -+ * Allocate more journal space at runtime - not currently making use if it, but -+ * the code works: -+ */ -+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, -+ unsigned nr) -+{ -+ struct journal_device *ja = &ca->journal; -+ struct closure cl; -+ int ret = 0; -+ -+ closure_init_stack(&cl); -+ -+ down_write(&c->state_lock); -+ -+ /* don't handle reducing nr of buckets yet: */ -+ if (nr < ja->nr) -+ goto unlock; -+ -+ while (ja->nr < nr) { -+ struct disk_reservation disk_res = { 0, 0 }; -+ -+ /* -+ * note: journal buckets aren't really counted as _sectors_ used yet, so -+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c -+ * when space used goes up without a reservation - but we do need the -+ * reservation to 
ensure we'll actually be able to allocate: -+ * -+ * XXX: that's not right, disk reservations only ensure a -+ * filesystem-wide allocation will succeed, this is a device -+ * specific allocation - we can hang here: -+ */ -+ -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ bucket_to_sector(ca, nr - ja->nr), 1, 0); -+ if (ret) -+ break; -+ -+ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ closure_sync(&cl); -+ -+ if (ret && ret != -BCH_ERR_bucket_alloc_blocked) -+ break; -+ } -+ -+ if (ret) -+ bch_err_fn(c, ret); -+unlock: -+ up_write(&c->state_lock); -+ return ret; -+} -+ -+int bch2_dev_journal_alloc(struct bch_dev *ca) -+{ -+ unsigned nr; -+ int ret; -+ -+ if (dynamic_fault("bcachefs:add:journal_alloc")) { -+ ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; -+ goto err; -+ } -+ -+ /* 1/128th of the device by default: */ -+ nr = ca->mi.nbuckets >> 7; -+ -+ /* -+ * clamp journal size to 8192 buckets or 8GB (in sectors), whichever -+ * is smaller: -+ */ -+ nr = clamp_t(unsigned, nr, -+ BCH_JOURNAL_BUCKETS_MIN, -+ min(1 << 13, -+ (1 << 24) / ca->mi.bucket_size)); -+ -+ ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); -+err: -+ if (ret) -+ bch_err_fn(ca, ret); -+ return ret; -+} -+ -+/* startup/shutdown: */ -+ -+static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) -+{ -+ bool ret = false; -+ u64 seq; -+ -+ spin_lock(&j->lock); -+ for (seq = journal_last_unwritten_seq(j); -+ seq <= journal_cur_seq(j) && !ret; -+ seq++) { -+ struct journal_buf *buf = journal_seq_to_buf(j, seq); -+ -+ if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) -+ ret = true; -+ } -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) -+{ -+ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); -+} -+ -+void bch2_fs_journal_stop(struct journal *j) -+{ -+ bch2_journal_reclaim_stop(j); -+ bch2_journal_flush_all_pins(j); -+ -+ wait_event(j->wait, journal_entry_close(j)); -+ -+ /* -+ * Always write a new journal entry, to make sure the clock hands are up -+ * to date (and match the superblock) -+ */ -+ bch2_journal_meta(j); -+ -+ journal_quiesce(j); -+ -+ BUG_ON(!bch2_journal_error(j) && -+ test_bit(JOURNAL_REPLAY_DONE, &j->flags) && -+ j->last_empty_seq != journal_cur_seq(j)); -+ -+ cancel_delayed_work_sync(&j->write_work); -+} -+ -+int bch2_fs_journal_start(struct journal *j, u64 cur_seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_entry_pin_list *p; -+ struct journal_replay *i, **_i; -+ struct genradix_iter iter; -+ bool had_entries = false; -+ unsigned ptr; -+ u64 last_seq = cur_seq, nr, seq; -+ -+ genradix_for_each_reverse(&c->journal_entries, iter, _i) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ last_seq = le64_to_cpu(i->j.last_seq); -+ break; -+ } -+ -+ nr = cur_seq - last_seq; -+ -+ if (nr + 1 > j->pin.size) { -+ free_fifo(&j->pin); -+ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); -+ if (!j->pin.data) { -+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); -+ return -BCH_ERR_ENOMEM_journal_pin_fifo; -+ } -+ } -+ -+ j->replay_journal_seq = last_seq; -+ j->replay_journal_seq_end = cur_seq; -+ j->last_seq_ondisk = last_seq; -+ j->flushed_seq_ondisk = cur_seq - 1; -+ j->seq_ondisk = cur_seq - 1; -+ j->pin.front = last_seq; -+ j->pin.back = cur_seq; -+ atomic64_set(&j->seq, cur_seq - 1); -+ -+ fifo_for_each_entry_ptr(p, &j->pin, seq) -+ 
journal_pin_list_init(p, 1); -+ -+ genradix_for_each(&c->journal_entries, iter, _i) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ seq = le64_to_cpu(i->j.seq); -+ BUG_ON(seq >= cur_seq); -+ -+ if (seq < last_seq) -+ continue; -+ -+ if (journal_entry_empty(&i->j)) -+ j->last_empty_seq = le64_to_cpu(i->j.seq); -+ -+ p = journal_seq_pin(j, seq); -+ -+ p->devs.nr = 0; -+ for (ptr = 0; ptr < i->nr_ptrs; ptr++) -+ bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); -+ -+ had_entries = true; -+ } -+ -+ if (!had_entries) -+ j->last_empty_seq = cur_seq; -+ -+ spin_lock(&j->lock); -+ -+ set_bit(JOURNAL_STARTED, &j->flags); -+ j->last_flush_write = jiffies; -+ -+ j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); -+ j->reservations.unwritten_idx++; -+ -+ c->last_bucket_seq_cleanup = journal_cur_seq(j); -+ -+ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ -+ return bch2_journal_reclaim_start(j); -+} -+ -+/* init/exit: */ -+ -+void bch2_dev_journal_exit(struct bch_dev *ca) -+{ -+ kfree(ca->journal.bio); -+ kfree(ca->journal.buckets); -+ kfree(ca->journal.bucket_seq); -+ -+ ca->journal.bio = NULL; -+ ca->journal.buckets = NULL; -+ ca->journal.bucket_seq = NULL; -+} -+ -+int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) -+{ -+ struct journal_device *ja = &ca->journal; -+ struct bch_sb_field_journal *journal_buckets = -+ bch2_sb_get_journal(sb); -+ struct bch_sb_field_journal_v2 *journal_buckets_v2 = -+ bch2_sb_get_journal_v2(sb); -+ unsigned i, nr_bvecs; -+ -+ ja->nr = 0; -+ -+ if (journal_buckets_v2) { -+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); -+ -+ for (i = 0; i < nr; i++) -+ ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); -+ } else if (journal_buckets) { -+ ja->nr = bch2_nr_journal_buckets(journal_buckets); -+ } -+ -+ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); -+ if (!ja->bucket_seq) -+ return -BCH_ERR_ENOMEM_dev_journal_init; -+ -+ nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); -+ -+ ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); -+ if (!ca->journal.bio) -+ return -BCH_ERR_ENOMEM_dev_journal_init; -+ -+ bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); -+ -+ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); -+ if (!ja->buckets) -+ return -BCH_ERR_ENOMEM_dev_journal_init; -+ -+ if (journal_buckets_v2) { -+ unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); -+ unsigned j, dst = 0; -+ -+ for (i = 0; i < nr; i++) -+ for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) -+ ja->buckets[dst++] = -+ le64_to_cpu(journal_buckets_v2->d[i].start) + j; -+ } else if (journal_buckets) { -+ for (i = 0; i < ja->nr; i++) -+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); -+ } -+ -+ return 0; -+} -+ -+void bch2_fs_journal_exit(struct journal *j) -+{ -+ unsigned i; -+ -+ darray_exit(&j->early_journal_entries); -+ -+ for (i = 0; i < ARRAY_SIZE(j->buf); i++) -+ kvpfree(j->buf[i].data, j->buf[i].buf_size); -+ free_fifo(&j->pin); -+} -+ -+int bch2_fs_journal_init(struct journal *j) -+{ -+ static struct lock_class_key res_key; -+ unsigned i; -+ -+ spin_lock_init(&j->lock); -+ spin_lock_init(&j->err_lock); -+ init_waitqueue_head(&j->wait); -+ INIT_DELAYED_WORK(&j->write_work, journal_write_work); -+ init_waitqueue_head(&j->reclaim_wait); -+ init_waitqueue_head(&j->pin_flush_wait); -+ mutex_init(&j->reclaim_lock); -+ mutex_init(&j->discard_lock); -+ -+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0); -+ -+ 
atomic64_set(&j->reservations.counter, -+ ((union journal_res_state) -+ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); -+ -+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) -+ return -BCH_ERR_ENOMEM_journal_pin_fifo; -+ -+ for (i = 0; i < ARRAY_SIZE(j->buf); i++) { -+ j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; -+ j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); -+ if (!j->buf[i].data) -+ return -BCH_ERR_ENOMEM_journal_buf; -+ } -+ -+ j->pin.front = j->pin.back = 1; -+ return 0; -+} -+ -+/* debug: */ -+ -+void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ union journal_res_state s; -+ struct bch_dev *ca; -+ unsigned long now = jiffies; -+ u64 seq; -+ unsigned i; -+ -+ if (!out->nr_tabstops) -+ printbuf_tabstop_push(out, 24); -+ out->atomic++; -+ -+ rcu_read_lock(); -+ s = READ_ONCE(j->reservations); -+ -+ prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); -+ prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); -+ prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); -+ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); -+ prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); -+ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); -+ prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); -+ prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); -+ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); -+ prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); -+ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); -+ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); -+ prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); -+ prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); -+ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) -+ ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); -+ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); -+ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); -+ prt_printf(out, "current entry:\t\t"); -+ -+ switch (s.cur_entry_offset) { -+ case JOURNAL_ENTRY_ERROR_VAL: -+ prt_printf(out, "error"); -+ break; -+ case JOURNAL_ENTRY_CLOSED_VAL: -+ prt_printf(out, "closed"); -+ break; -+ default: -+ prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); -+ break; -+ } -+ -+ prt_newline(out); -+ -+ for (seq = journal_cur_seq(j); -+ seq >= journal_last_unwritten_seq(j); -+ --seq) { -+ i = seq & JOURNAL_BUF_MASK; -+ -+ prt_printf(out, "unwritten entry:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", seq); -+ prt_newline(out); -+ printbuf_indent_add(out, 2); -+ -+ prt_printf(out, "refcount:"); -+ prt_tab(out); -+ prt_printf(out, "%u", journal_state_count(s, i)); -+ prt_newline(out); -+ -+ prt_printf(out, "sectors:"); -+ prt_tab(out); -+ prt_printf(out, "%u", j->buf[i].sectors); -+ prt_newline(out); -+ -+ prt_printf(out, "expires"); -+ prt_tab(out); -+ prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); -+ prt_newline(out); -+ -+ printbuf_indent_sub(out, 2); -+ } -+ -+ prt_printf(out, -+ "replay done:\t\t%i\n", -+ test_bit(JOURNAL_REPLAY_DONE, &j->flags)); -+ -+ prt_printf(out, "space:\n"); -+ prt_printf(out, "\tdiscarded\t%u:%u\n", -+ j->space[journal_space_discarded].next_entry, -+ j->space[journal_space_discarded].total); -+ prt_printf(out, "\tclean ondisk\t%u:%u\n", -+ j->space[journal_space_clean_ondisk].next_entry, -+ j->space[journal_space_clean_ondisk].total); -+ prt_printf(out, "\tclean\t\t%u:%u\n", -+ j->space[journal_space_clean].next_entry, -+ j->space[journal_space_clean].total); -+ prt_printf(out, "\ttotal\t\t%u:%u\n", -+ j->space[journal_space_total].next_entry, -+ j->space[journal_space_total].total); -+ -+ for_each_member_device_rcu(ca, c, i, -+ &c->rw_devs[BCH_DATA_journal]) { -+ struct journal_device *ja = &ca->journal; -+ -+ if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) -+ continue; -+ -+ if (!ja->nr) -+ continue; -+ -+ prt_printf(out, "dev %u:\n", i); -+ prt_printf(out, "\tnr\t\t%u\n", ja->nr); -+ prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); -+ prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); -+ prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); -+ prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); -+ prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); -+ prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); -+ } -+ -+ rcu_read_unlock(); -+ -+ --out->atomic; -+} -+ -+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) -+{ -+ spin_lock(&j->lock); -+ __bch2_journal_debug_to_text(out, j); -+ spin_unlock(&j->lock); -+} -+ -+bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) -+{ -+ struct journal_entry_pin_list *pin_list; -+ struct journal_entry_pin *pin; -+ unsigned i; -+ -+ spin_lock(&j->lock); -+ *seq = max(*seq, j->pin.front); -+ -+ if (*seq >= j->pin.back) { -+ spin_unlock(&j->lock); -+ return true; -+ } -+ -+ out->atomic++; -+ -+ pin_list = journal_seq_pin(j, *seq); -+ -+ prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); -+ prt_newline(out); -+ printbuf_indent_add(out, 2); -+ -+ 
for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) -+ list_for_each_entry(pin, &pin_list->list[i], list) { -+ prt_printf(out, "\t%px %ps", pin, pin->flush); -+ prt_newline(out); -+ } -+ -+ if (!list_empty(&pin_list->flushed)) { -+ prt_printf(out, "flushed:"); -+ prt_newline(out); -+ } -+ -+ list_for_each_entry(pin, &pin_list->flushed, list) { -+ prt_printf(out, "\t%px %ps", pin, pin->flush); -+ prt_newline(out); -+ } -+ -+ printbuf_indent_sub(out, 2); -+ -+ --out->atomic; -+ spin_unlock(&j->lock); -+ -+ return false; -+} -+ -+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -+{ -+ u64 seq = 0; -+ -+ while (!bch2_journal_seq_pins_to_text(out, j, &seq)) -+ seq++; -+} -diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h -new file mode 100644 -index 000000000..008a2e25a ---- /dev/null -+++ b/fs/bcachefs/journal.h -@@ -0,0 +1,526 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_H -+#define _BCACHEFS_JOURNAL_H -+ -+/* -+ * THE JOURNAL: -+ * -+ * The primary purpose of the journal is to log updates (insertions) to the -+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk. -+ * -+ * Without the journal, the b-tree is always internally consistent on -+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal -+ * but did handle unclean shutdowns by doing all index updates synchronously -+ * (with coalescing). -+ * -+ * Updates to interior nodes still happen synchronously and without the journal -+ * (for simplicity) - this may change eventually but updates to interior nodes -+ * are rare enough it's not a huge priority. -+ * -+ * This means the journal is relatively separate from the b-tree; it consists of -+ * just a list of keys and journal replay consists of just redoing those -+ * insertions in same order that they appear in the journal. -+ * -+ * PERSISTENCE: -+ * -+ * For synchronous updates (where we're waiting on the index update to hit -+ * disk), the journal entry will be written out immediately (or as soon as -+ * possible, if the write for the previous journal entry was still in flight). -+ * -+ * Synchronous updates are specified by passing a closure (@flush_cl) to -+ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter -+ * down to the journalling code. That closure will wait on the journal write to -+ * complete (via closure_wait()). -+ * -+ * If the index update wasn't synchronous, the journal entry will be -+ * written out after 10 ms have elapsed, by default (the delay_ms field -+ * in struct journal). -+ * -+ * JOURNAL ENTRIES: -+ * -+ * A journal entry is variable size (struct jset), it's got a fixed length -+ * header and then a variable number of struct jset_entry entries. -+ * -+ * Journal entries are identified by monotonically increasing 64 bit sequence -+ * numbers - jset->seq; other places in the code refer to this sequence number. -+ * -+ * A jset_entry entry contains one or more bkeys (which is what gets inserted -+ * into the b-tree). We need a container to indicate which b-tree the key is -+ * for; also, the roots of the various b-trees are stored in jset_entry entries -+ * (one for each b-tree) - this lets us add new b-tree types without changing -+ * the on disk format. -+ * -+ * We also keep some things in the journal header that are logically part of the -+ * superblock - all the things that are frequently updated. 
This is for future -+ * bcache on raw flash support; the superblock (which will become another -+ * journal) can't be moved or wear leveled, so it contains just enough -+ * information to find the main journal, and the superblock only has to be -+ * rewritten when we want to move/wear level the main journal. -+ * -+ * JOURNAL LAYOUT ON DISK: -+ * -+ * The journal is written to a ringbuffer of buckets (which is kept in the -+ * superblock); the individual buckets are not necessarily contiguous on disk -+ * which means that journal entries are not allowed to span buckets, but also -+ * that we can resize the journal at runtime if desired (unimplemented). -+ * -+ * The journal buckets exist in the same pool as all the other buckets that are -+ * managed by the allocator and garbage collection - garbage collection marks -+ * the journal buckets as metadata buckets. -+ * -+ * OPEN/DIRTY JOURNAL ENTRIES: -+ * -+ * Open/dirty journal entries are journal entries that contain b-tree updates -+ * that have not yet been written out to the b-tree on disk. We have to track -+ * which journal entries are dirty, and we also have to avoid wrapping around -+ * the journal and overwriting old but still dirty journal entries with new -+ * journal entries. -+ * -+ * On disk, this is represented with the "last_seq" field of struct jset; -+ * last_seq is the first sequence number that journal replay has to replay. -+ * -+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in -+ * journal_device->seq) of for each journal bucket, the highest sequence number -+ * any journal entry it contains. Then, by comparing that against last_seq we -+ * can determine whether that journal bucket contains dirty journal entries or -+ * not. -+ * -+ * To track which journal entries are dirty, we maintain a fifo of refcounts -+ * (where each entry corresponds to a specific sequence number) - when a ref -+ * goes to 0, that journal entry is no longer dirty. -+ * -+ * Journalling of index updates is done at the same time as the b-tree itself is -+ * being modified (see btree_insert_key()); when we add the key to the journal -+ * the pending b-tree write takes a ref on the journal entry the key was added -+ * to. If a pending b-tree write would need to take refs on multiple dirty -+ * journal entries, it only keeps the ref on the oldest one (since a newer -+ * journal entry will still be replayed if an older entry was dirty). -+ * -+ * JOURNAL FILLING UP: -+ * -+ * There are two ways the journal could fill up; either we could run out of -+ * space to write to, or we could have too many open journal entries and run out -+ * of room in the fifo of refcounts. Since those refcounts are decremented -+ * without any locking we can't safely resize that fifo, so we handle it the -+ * same way. -+ * -+ * If the journal fills up, we start flushing dirty btree nodes until we can -+ * allocate space for a journal write again - preferentially flushing btree -+ * nodes that are pinning the oldest journal entries first. 
-+ */ -+ -+#include -+ -+#include "journal_types.h" -+ -+struct bch_fs; -+ -+static inline void journal_wake(struct journal *j) -+{ -+ wake_up(&j->wait); -+ closure_wake_up(&j->async_wait); -+ closure_wake_up(&j->preres_wait); -+} -+ -+static inline struct journal_buf *journal_cur_buf(struct journal *j) -+{ -+ return j->buf + j->reservations.idx; -+} -+ -+/* Sequence number of oldest dirty journal entry */ -+ -+static inline u64 journal_last_seq(struct journal *j) -+{ -+ return j->pin.front; -+} -+ -+static inline u64 journal_cur_seq(struct journal *j) -+{ -+ EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); -+ -+ return j->pin.back - 1; -+} -+ -+static inline u64 journal_last_unwritten_seq(struct journal *j) -+{ -+ return j->seq_ondisk + 1; -+} -+ -+static inline int journal_state_count(union journal_res_state s, int idx) -+{ -+ switch (idx) { -+ case 0: return s.buf0_count; -+ case 1: return s.buf1_count; -+ case 2: return s.buf2_count; -+ case 3: return s.buf3_count; -+ } -+ BUG(); -+} -+ -+static inline void journal_state_inc(union journal_res_state *s) -+{ -+ s->buf0_count += s->idx == 0; -+ s->buf1_count += s->idx == 1; -+ s->buf2_count += s->idx == 2; -+ s->buf3_count += s->idx == 3; -+} -+ -+/* -+ * Amount of space that will be taken up by some keys in the journal (i.e. -+ * including the jset header) -+ */ -+static inline unsigned jset_u64s(unsigned u64s) -+{ -+ return u64s + sizeof(struct jset_entry) / sizeof(u64); -+} -+ -+static inline int journal_entry_overhead(struct journal *j) -+{ -+ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; -+} -+ -+static inline struct jset_entry * -+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) -+{ -+ struct jset *jset = buf->data; -+ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); -+ -+ memset(entry, 0, sizeof(*entry)); -+ entry->u64s = cpu_to_le16(u64s); -+ -+ le32_add_cpu(&jset->u64s, jset_u64s(u64s)); -+ -+ return entry; -+} -+ -+static inline struct jset_entry * -+journal_res_entry(struct journal *j, struct journal_res *res) -+{ -+ return vstruct_idx(j->buf[res->idx].data, res->offset); -+} -+ -+static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, -+ enum btree_id id, unsigned level, -+ unsigned u64s) -+{ -+ entry->u64s = cpu_to_le16(u64s); -+ entry->btree_id = id; -+ entry->level = level; -+ entry->type = type; -+ entry->pad[0] = 0; -+ entry->pad[1] = 0; -+ entry->pad[2] = 0; -+ return jset_u64s(u64s); -+} -+ -+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, -+ enum btree_id id, unsigned level, -+ const void *data, unsigned u64s) -+{ -+ unsigned ret = journal_entry_init(entry, type, id, level, u64s); -+ -+ memcpy_u64s_small(entry->_data, data, u64s); -+ return ret; -+} -+ -+static inline struct jset_entry * -+bch2_journal_add_entry(struct journal *j, struct journal_res *res, -+ unsigned type, enum btree_id id, -+ unsigned level, unsigned u64s) -+{ -+ struct jset_entry *entry = journal_res_entry(j, res); -+ unsigned actual = journal_entry_init(entry, type, id, level, u64s); -+ -+ EBUG_ON(!res->ref); -+ EBUG_ON(actual > res->u64s); -+ -+ res->offset += actual; -+ res->u64s -= actual; -+ return entry; -+} -+ -+static inline bool journal_entry_empty(struct jset *j) -+{ -+ struct jset_entry *i; -+ -+ if (j->seq != j->last_seq) -+ return false; -+ -+ vstruct_for_each(j, i) -+ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) -+ return false; -+ return true; -+} -+ -+void __bch2_journal_buf_put(struct journal 
*); -+ -+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) -+{ -+ union journal_res_state s; -+ -+ s.v = atomic64_sub_return(((union journal_res_state) { -+ .buf0_count = idx == 0, -+ .buf1_count = idx == 1, -+ .buf2_count = idx == 2, -+ .buf3_count = idx == 3, -+ }).v, &j->reservations.counter); -+ -+ if (!journal_state_count(s, idx) && idx == s.unwritten_idx) -+ __bch2_journal_buf_put(j); -+} -+ -+/* -+ * This function releases the journal write structure so other threads can -+ * then proceed to add their keys as well. -+ */ -+static inline void bch2_journal_res_put(struct journal *j, -+ struct journal_res *res) -+{ -+ if (!res->ref) -+ return; -+ -+ lock_release(&j->res_map, _THIS_IP_); -+ -+ while (res->u64s) -+ bch2_journal_add_entry(j, res, -+ BCH_JSET_ENTRY_btree_keys, -+ 0, 0, 0); -+ -+ bch2_journal_buf_put(j, res->idx); -+ -+ res->ref = 0; -+} -+ -+int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, -+ unsigned); -+ -+/* First bits for BCH_WATERMARK: */ -+enum journal_res_flags { -+ __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS, -+ __JOURNAL_RES_GET_CHECK, -+}; -+ -+#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK) -+#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK) -+ -+static inline int journal_res_get_fast(struct journal *j, -+ struct journal_res *res, -+ unsigned flags) -+{ -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ -+ do { -+ old.v = new.v = v; -+ -+ /* -+ * Check if there is still room in the current journal -+ * entry: -+ */ -+ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) -+ return 0; -+ -+ EBUG_ON(!journal_state_count(new, new.idx)); -+ -+ if ((flags & BCH_WATERMARK_MASK) < j->watermark) -+ return 0; -+ -+ new.cur_entry_offset += res->u64s; -+ journal_state_inc(&new); -+ -+ /* -+ * If the refcount would overflow, we have to wait: -+ * XXX - tracepoint this: -+ */ -+ if (!journal_state_count(new, new.idx)) -+ return 0; -+ -+ if (flags & JOURNAL_RES_GET_CHECK) -+ return 1; -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ res->ref = true; -+ res->idx = old.idx; -+ res->offset = old.cur_entry_offset; -+ res->seq = le64_to_cpu(j->buf[old.idx].data->seq); -+ return 1; -+} -+ -+static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, -+ unsigned u64s, unsigned flags) -+{ -+ int ret; -+ -+ EBUG_ON(res->ref); -+ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); -+ -+ res->u64s = u64s; -+ -+ if (journal_res_get_fast(j, res, flags)) -+ goto out; -+ -+ ret = bch2_journal_res_get_slowpath(j, res, flags); -+ if (ret) -+ return ret; -+out: -+ if (!(flags & JOURNAL_RES_GET_CHECK)) { -+ lock_acquire_shared(&j->res_map, 0, -+ (flags & JOURNAL_RES_GET_NONBLOCK) != 0, -+ NULL, _THIS_IP_); -+ EBUG_ON(!res->ref); -+ } -+ return 0; -+} -+ -+/* journal_preres: */ -+ -+static inline void journal_set_watermark(struct journal *j) -+{ -+ union journal_preres_state s = READ_ONCE(j->prereserved); -+ unsigned watermark = BCH_WATERMARK_stripe; -+ -+ if (fifo_free(&j->pin) < j->pin.size / 4) -+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); -+ if (fifo_free(&j->pin) < j->pin.size / 8) -+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); -+ -+ if (s.reserved > s.remaining) -+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); -+ if (!s.remaining) -+ watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); -+ -+ if (watermark == j->watermark) -+ return; -+ -+ 
swap(watermark, j->watermark); -+ if (watermark > j->watermark) -+ journal_wake(j); -+} -+ -+static inline void bch2_journal_preres_put(struct journal *j, -+ struct journal_preres *res) -+{ -+ union journal_preres_state s = { .reserved = res->u64s }; -+ -+ if (!res->u64s) -+ return; -+ -+ s.v = atomic64_sub_return(s.v, &j->prereserved.counter); -+ res->u64s = 0; -+ -+ if (unlikely(s.waiting)) { -+ clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), -+ (unsigned long *) &j->prereserved.v); -+ closure_wake_up(&j->preres_wait); -+ } -+ -+ if (s.reserved <= s.remaining && j->watermark) -+ journal_set_watermark(j); -+} -+ -+int __bch2_journal_preres_get(struct journal *, -+ struct journal_preres *, unsigned, unsigned); -+ -+static inline int bch2_journal_preres_get_fast(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags, -+ bool set_waiting) -+{ -+ int d = new_u64s - res->u64s; -+ union journal_preres_state old, new; -+ u64 v = atomic64_read(&j->prereserved.counter); -+ enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; -+ int ret; -+ -+ do { -+ old.v = new.v = v; -+ ret = 0; -+ -+ if (watermark == BCH_WATERMARK_reclaim || -+ new.reserved + d < new.remaining) { -+ new.reserved += d; -+ ret = 1; -+ } else if (set_waiting && !new.waiting) -+ new.waiting = true; -+ else -+ return 0; -+ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, -+ old.v, new.v)) != old.v); -+ -+ if (ret) -+ res->u64s += d; -+ return ret; -+} -+ -+static inline int bch2_journal_preres_get(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ if (new_u64s <= res->u64s) -+ return 0; -+ -+ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) -+ return 0; -+ -+ if (flags & JOURNAL_RES_GET_NONBLOCK) -+ return -BCH_ERR_journal_preres_get_blocked; -+ -+ return __bch2_journal_preres_get(j, res, new_u64s, flags); -+} -+ -+/* journal_entry_res: */ -+ -+void bch2_journal_entry_res_resize(struct journal *, -+ struct journal_entry_res *, -+ unsigned); -+ -+int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); -+void bch2_journal_flush_async(struct journal *, struct closure *); -+ -+int bch2_journal_flush_seq(struct journal *, u64); -+int bch2_journal_flush(struct journal *); -+bool bch2_journal_noflush_seq(struct journal *, u64); -+int bch2_journal_meta(struct journal *); -+ -+void bch2_journal_halt(struct journal *); -+ -+static inline int bch2_journal_error(struct journal *j) -+{ -+ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL -+ ? 
-EIO : 0; -+} -+ -+struct bch_dev; -+ -+static inline void bch2_journal_set_replay_done(struct journal *j) -+{ -+ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); -+ set_bit(JOURNAL_REPLAY_DONE, &j->flags); -+} -+ -+void bch2_journal_unblock(struct journal *); -+void bch2_journal_block(struct journal *); -+ -+void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); -+void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -+void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -+bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); -+ -+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, -+ unsigned nr); -+int bch2_dev_journal_alloc(struct bch_dev *); -+ -+void bch2_dev_journal_stop(struct journal *, struct bch_dev *); -+ -+void bch2_fs_journal_stop(struct journal *); -+int bch2_fs_journal_start(struct journal *, u64); -+ -+void bch2_dev_journal_exit(struct bch_dev *); -+int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); -+void bch2_fs_journal_exit(struct journal *); -+int bch2_fs_journal_init(struct journal *); -+ -+#endif /* _BCACHEFS_JOURNAL_H */ -diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c -new file mode 100644 -index 000000000..34740dca4 ---- /dev/null -+++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1888 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "btree_io.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "disk_groups.h" -+#include "error.h" -+#include "io.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "replicas.h" -+#include "sb-clean.h" -+#include "trace.h" -+ -+static struct nonce journal_nonce(const struct jset *jset) -+{ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = ((__le32 *) &jset->seq)[0], -+ [2] = ((__le32 *) &jset->seq)[1], -+ [3] = BCH_NONCE_JOURNAL, -+ }}; -+} -+ -+static bool jset_csum_good(struct bch_fs *c, struct jset *j) -+{ -+ return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && -+ !bch2_crc_cmp(j->csum, -+ csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); -+} -+ -+static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) -+{ -+ return (seq - c->journal_entries_base_seq) & (~0U >> 1); -+} -+ -+static void __journal_replay_free(struct bch_fs *c, -+ struct journal_replay *i) -+{ -+ struct journal_replay **p = -+ genradix_ptr(&c->journal_entries, -+ journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); -+ -+ BUG_ON(*p != i); -+ *p = NULL; -+ kvpfree(i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&i->j)); -+} -+ -+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) -+{ -+ i->ignore = true; -+ -+ if (!c->opts.read_entire_journal) -+ __journal_replay_free(c, i); -+} -+ -+struct journal_list { -+ struct closure cl; -+ u64 last_seq; -+ struct mutex lock; -+ int ret; -+}; -+ -+#define JOURNAL_ENTRY_ADD_OK 0 -+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 -+ -+/* -+ * Given a journal entry we just read, add it to the list of journal entries to -+ * be replayed: -+ */ -+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, -+ struct journal_ptr entry_ptr, -+ struct journal_list *jlist, struct jset *j) -+{ -+ struct genradix_iter iter; -+ struct journal_replay **_i, *i, *dup; -+ struct journal_ptr *ptr; -+ size_t bytes = vstruct_bytes(j); -+ u64 last_seq = !JSET_NO_FLUSH(j) ? 
le64_to_cpu(j->last_seq) : 0; -+ int ret = JOURNAL_ENTRY_ADD_OK; -+ -+ /* Is this entry older than the range we need? */ -+ if (!c->opts.read_entire_journal && -+ le64_to_cpu(j->seq) < jlist->last_seq) -+ return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; -+ -+ /* -+ * genradixes are indexed by a ulong, not a u64, so we can't index them -+ * by sequence number directly: Assume instead that they will all fall -+ * within the range of +-2billion of the filrst one we find. -+ */ -+ if (!c->journal_entries_base_seq) -+ c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); -+ -+ /* Drop entries we don't need anymore */ -+ if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { -+ genradix_for_each_from(&c->journal_entries, iter, _i, -+ journal_entry_radix_idx(c, jlist->last_seq)) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ if (le64_to_cpu(i->j.seq) >= last_seq) -+ break; -+ journal_replay_free(c, i); -+ } -+ } -+ -+ jlist->last_seq = max(jlist->last_seq, last_seq); -+ -+ _i = genradix_ptr_alloc(&c->journal_entries, -+ journal_entry_radix_idx(c, le64_to_cpu(j->seq)), -+ GFP_KERNEL); -+ if (!_i) -+ return -BCH_ERR_ENOMEM_journal_entry_add; -+ -+ /* -+ * Duplicate journal entries? If so we want the one that didn't have a -+ * checksum error: -+ */ -+ dup = *_i; -+ if (dup) { -+ if (bytes == vstruct_bytes(&dup->j) && -+ !memcmp(j, &dup->j, bytes)) { -+ i = dup; -+ goto found; -+ } -+ -+ if (!entry_ptr.csum_good) { -+ i = dup; -+ goto found; -+ } -+ -+ if (!dup->csum_good) -+ goto replace; -+ -+ fsck_err(c, "found duplicate but non identical journal entries (seq %llu)", -+ le64_to_cpu(j->seq)); -+ i = dup; -+ goto found; -+ } -+replace: -+ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); -+ if (!i) -+ return -BCH_ERR_ENOMEM_journal_entry_add; -+ -+ i->nr_ptrs = 0; -+ i->csum_good = entry_ptr.csum_good; -+ i->ignore = false; -+ unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); -+ i->ptrs[i->nr_ptrs++] = entry_ptr; -+ -+ if (dup) { -+ if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { -+ bch_err(c, "found too many copies of journal entry %llu", -+ le64_to_cpu(i->j.seq)); -+ dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; -+ } -+ -+ /* The first ptr should represent the jset we kept: */ -+ memcpy(i->ptrs + i->nr_ptrs, -+ dup->ptrs, -+ sizeof(dup->ptrs[0]) * dup->nr_ptrs); -+ i->nr_ptrs += dup->nr_ptrs; -+ __journal_replay_free(c, dup); -+ } -+ -+ *_i = i; -+ return 0; -+found: -+ for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { -+ if (ptr->dev == ca->dev_idx) { -+ bch_err(c, "duplicate journal entry %llu on same device", -+ le64_to_cpu(i->j.seq)); -+ goto out; -+ } -+ } -+ -+ if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { -+ bch_err(c, "found too many copies of journal entry %llu", -+ le64_to_cpu(i->j.seq)); -+ goto out; -+ } -+ -+ i->ptrs[i->nr_ptrs++] = entry_ptr; -+out: -+fsck_err: -+ return ret; -+} -+ -+/* this fills in a range with empty jset_entries: */ -+static void journal_entry_null_range(void *start, void *end) -+{ -+ struct jset_entry *entry; -+ -+ for (entry = start; entry != end; entry = vstruct_next(entry)) -+ memset(entry, 0, sizeof(*entry)); -+} -+ -+#define JOURNAL_ENTRY_REREAD 5 -+#define JOURNAL_ENTRY_NONE 6 -+#define JOURNAL_ENTRY_BAD 7 -+ -+static void journal_entry_err_msg(struct printbuf *out, -+ u32 version, -+ struct jset *jset, -+ struct jset_entry *entry) -+{ -+ prt_str(out, "invalid journal entry, version="); -+ bch2_version_to_text(out, version); -+ -+ if (entry) { -+ prt_str(out, " type="); -+ prt_str(out, 
bch2_jset_entry_types[entry->type]); -+ } -+ -+ if (!jset) { -+ prt_printf(out, " in superblock"); -+ } else { -+ -+ prt_printf(out, " seq=%llu", le64_to_cpu(jset->seq)); -+ -+ if (entry) -+ prt_printf(out, " offset=%zi/%u", -+ (u64 *) entry - jset->_data, -+ le32_to_cpu(jset->u64s)); -+ } -+ -+ prt_str(out, ": "); -+} -+ -+#define journal_entry_err(c, version, jset, entry, msg, ...) \ -+({ \ -+ struct printbuf buf = PRINTBUF; \ -+ \ -+ journal_entry_err_msg(&buf, version, jset, entry); \ -+ prt_printf(&buf, msg, ##__VA_ARGS__); \ -+ \ -+ switch (flags & BKEY_INVALID_WRITE) { \ -+ case READ: \ -+ mustfix_fsck_err(c, "%s", buf.buf); \ -+ break; \ -+ case WRITE: \ -+ bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\ -+ if (bch2_fs_inconsistent(c)) { \ -+ ret = -BCH_ERR_fsck_errors_not_fixed; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ } \ -+ \ -+ printbuf_exit(&buf); \ -+ true; \ -+}) -+ -+#define journal_entry_err_on(cond, c, version, jset, entry, msg, ...) \ -+ ((cond) ? journal_entry_err(c, version, jset, entry, msg, ##__VA_ARGS__) : false) -+ -+#define FSCK_DELETED_KEY 5 -+ -+static int journal_validate_key(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned level, enum btree_id btree_id, -+ struct bkey_i *k, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ int write = flags & BKEY_INVALID_WRITE; -+ void *next = vstruct_next(entry); -+ struct printbuf buf = PRINTBUF; -+ int ret = 0; -+ -+ if (journal_entry_err_on(!k->k.u64s, c, version, jset, entry, "k->u64s 0")) { -+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return FSCK_DELETED_KEY; -+ } -+ -+ if (journal_entry_err_on((void *) bkey_next(k) > -+ (void *) vstruct_next(entry), -+ c, version, jset, entry, -+ "extends past end of journal entry")) { -+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return FSCK_DELETED_KEY; -+ } -+ -+ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, -+ c, version, jset, entry, -+ "bad format %u", k->k.format)) { -+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); -+ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return FSCK_DELETED_KEY; -+ } -+ -+ if (!write) -+ bch2_bkey_compat(level, btree_id, version, big_endian, -+ write, NULL, bkey_to_packed(k)); -+ -+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), -+ __btree_node_type(level, btree_id), write, &buf)) { -+ printbuf_reset(&buf); -+ journal_entry_err_msg(&buf, version, jset, entry); -+ prt_newline(&buf); -+ printbuf_indent_add(&buf, 2); -+ -+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); -+ prt_newline(&buf); -+ bch2_bkey_invalid(c, bkey_i_to_s_c(k), -+ __btree_node_type(level, btree_id), write, &buf); -+ -+ mustfix_fsck_err(c, "%s", buf.buf); -+ -+ le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); -+ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); -+ journal_entry_null_range(vstruct_next(entry), next); -+ -+ printbuf_exit(&buf); -+ return FSCK_DELETED_KEY; -+ } -+ -+ if (write) -+ bch2_bkey_compat(level, btree_id, version, big_endian, -+ write, NULL, bkey_to_packed(k)); -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+static int journal_entry_btree_keys_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ struct bkey_i *k = entry->start; -+ -+ while (k != 
vstruct_last(entry)) { -+ int ret = journal_validate_key(c, jset, entry, -+ entry->level, -+ entry->btree_id, -+ k, version, big_endian, -+ flags|BKEY_INVALID_JOURNAL); -+ if (ret == FSCK_DELETED_KEY) -+ continue; -+ -+ k = bkey_next(k); -+ } -+ -+ return 0; -+} -+ -+static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ struct bkey_i *k; -+ bool first = true; -+ -+ jset_entry_for_each_key(entry, k) { -+ if (!first) { -+ prt_newline(out); -+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); -+ } -+ prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); -+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); -+ first = false; -+ } -+} -+ -+static int journal_entry_btree_root_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ struct bkey_i *k = entry->start; -+ int ret = 0; -+ -+ if (journal_entry_err_on(!entry->u64s || -+ le16_to_cpu(entry->u64s) != k->k.u64s, -+ c, version, jset, entry, -+ "invalid btree root journal entry: wrong number of keys")) { -+ void *next = vstruct_next(entry); -+ /* -+ * we don't want to null out this jset_entry, -+ * just the contents, so that later we can tell -+ * we were _supposed_ to have a btree root -+ */ -+ entry->u64s = 0; -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, -+ version, big_endian, flags); -+fsck_err: -+ return ret; -+} -+ -+static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ journal_entry_btree_keys_to_text(out, c, entry); -+} -+ -+static int journal_entry_prio_ptrs_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ /* obsolete, don't care: */ -+ return 0; -+} -+ -+static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+} -+ -+static int journal_entry_blacklist_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ int ret = 0; -+ -+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, -+ c, version, jset, entry, -+ "invalid journal seq blacklist entry: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ } -+fsck_err: -+ return ret; -+} -+ -+static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ struct jset_entry_blacklist *bl = -+ container_of(entry, struct jset_entry_blacklist, entry); -+ -+ prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); -+} -+ -+static int journal_entry_blacklist_v2_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ struct jset_entry_blacklist_v2 *bl_entry; -+ int ret = 0; -+ -+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, -+ c, version, jset, entry, -+ "invalid journal seq blacklist entry: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ goto out; -+ } -+ -+ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); -+ -+ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > -+ le64_to_cpu(bl_entry->end), -+ c, version, jset, entry, -+ "invalid 
journal seq blacklist entry: start > end")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ } -+out: -+fsck_err: -+ return ret; -+} -+ -+static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ struct jset_entry_blacklist_v2 *bl = -+ container_of(entry, struct jset_entry_blacklist_v2, entry); -+ -+ prt_printf(out, "start=%llu end=%llu", -+ le64_to_cpu(bl->start), -+ le64_to_cpu(bl->end)); -+} -+ -+static int journal_entry_usage_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ int ret = 0; -+ -+ if (journal_entry_err_on(bytes < sizeof(*u), -+ c, version, jset, entry, -+ "invalid journal entry usage: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+fsck_err: -+ return ret; -+} -+ -+static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ prt_printf(out, "type=%s v=%llu", -+ bch2_fs_usage_types[u->entry.btree_id], -+ le64_to_cpu(u->v)); -+} -+ -+static int journal_entry_data_usage_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ int ret = 0; -+ -+ if (journal_entry_err_on(bytes < sizeof(*u) || -+ bytes < sizeof(*u) + u->r.nr_devs, -+ c, version, jset, entry, -+ "invalid journal entry usage: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+fsck_err: -+ return ret; -+} -+ -+static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ -+ bch2_replicas_entry_to_text(out, &u->r); -+ prt_printf(out, "=%llu", le64_to_cpu(u->v)); -+} -+ -+static int journal_entry_clock_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ struct jset_entry_clock *clock = -+ container_of(entry, struct jset_entry_clock, entry); -+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ int ret = 0; -+ -+ if (journal_entry_err_on(bytes != sizeof(*clock), -+ c, version, jset, entry, "bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+ if (journal_entry_err_on(clock->rw > 1, -+ c, version, jset, entry, "bad rw")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+fsck_err: -+ return ret; -+} -+ -+static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ struct jset_entry_clock *clock = -+ container_of(entry, struct jset_entry_clock, entry); -+ -+ prt_printf(out, "%s=%llu", clock->rw ? 
"write" : "read", le64_to_cpu(clock->time)); -+} -+ -+static int journal_entry_dev_usage_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ struct jset_entry_dev_usage *u = -+ container_of(entry, struct jset_entry_dev_usage, entry); -+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ unsigned expected = sizeof(*u); -+ unsigned dev; -+ int ret = 0; -+ -+ if (journal_entry_err_on(bytes < expected, -+ c, version, jset, entry, "bad size (%u < %u)", -+ bytes, expected)) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+ dev = le32_to_cpu(u->dev); -+ -+ if (journal_entry_err_on(!bch2_dev_exists2(c, dev), -+ c, version, jset, entry, "bad dev")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+ if (journal_entry_err_on(u->pad, -+ c, version, jset, entry, "bad pad")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+fsck_err: -+ return ret; -+} -+ -+static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ struct jset_entry_dev_usage *u = -+ container_of(entry, struct jset_entry_dev_usage, entry); -+ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); -+ -+ prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); -+ -+ for (i = 0; i < nr_types; i++) { -+ if (i < BCH_DATA_NR) -+ prt_printf(out, " %s", bch2_data_types[i]); -+ else -+ prt_printf(out, " (unknown data type %u)", i); -+ prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", -+ le64_to_cpu(u->d[i].buckets), -+ le64_to_cpu(u->d[i].sectors), -+ le64_to_cpu(u->d[i].fragmented)); -+ } -+ -+ prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); -+} -+ -+static int journal_entry_log_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ return 0; -+} -+ -+static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); -+ unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); -+ -+ prt_printf(out, "%.*s", bytes, l->d); -+} -+ -+static int journal_entry_overwrite_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ return journal_entry_btree_keys_validate(c, jset, entry, -+ version, big_endian, READ); -+} -+ -+static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ journal_entry_btree_keys_to_text(out, c, entry); -+} -+ -+struct jset_entry_ops { -+ int (*validate)(struct bch_fs *, struct jset *, -+ struct jset_entry *, unsigned, int, -+ enum bkey_invalid_flags); -+ void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); -+}; -+ -+static const struct jset_entry_ops bch2_jset_entry_ops[] = { -+#define x(f, nr) \ -+ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ -+ .validate = journal_entry_##f##_validate, \ -+ .to_text = journal_entry_##f##_to_text, \ -+ }, -+ BCH_JSET_ENTRY_TYPES() -+#undef x -+}; -+ -+int bch2_journal_entry_validate(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ unsigned version, int big_endian, -+ enum bkey_invalid_flags flags) -+{ -+ return entry->type < 
BCH_JSET_ENTRY_NR -+ ? bch2_jset_entry_ops[entry->type].validate(c, jset, entry, -+ version, big_endian, flags) -+ : 0; -+} -+ -+void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ if (entry->type < BCH_JSET_ENTRY_NR) { -+ prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); -+ bch2_jset_entry_ops[entry->type].to_text(out, c, entry); -+ } else { -+ prt_printf(out, "(unknown type %u)", entry->type); -+ } -+} -+ -+static int jset_validate_entries(struct bch_fs *c, struct jset *jset, -+ enum bkey_invalid_flags flags) -+{ -+ struct jset_entry *entry; -+ unsigned version = le32_to_cpu(jset->version); -+ int ret = 0; -+ -+ vstruct_for_each(jset, entry) { -+ if (journal_entry_err_on(vstruct_next(entry) > vstruct_last(jset), -+ c, version, jset, entry, -+ "journal entry extends past end of jset")) { -+ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); -+ break; -+ } -+ -+ ret = bch2_journal_entry_validate(c, jset, entry, -+ version, JSET_BIG_ENDIAN(jset), flags); -+ if (ret) -+ break; -+ } -+fsck_err: -+ return ret; -+} -+ -+static int jset_validate(struct bch_fs *c, -+ struct bch_dev *ca, -+ struct jset *jset, u64 sector, -+ enum bkey_invalid_flags flags) -+{ -+ unsigned version; -+ int ret = 0; -+ -+ if (le64_to_cpu(jset->magic) != jset_magic(c)) -+ return JOURNAL_ENTRY_NONE; -+ -+ version = le32_to_cpu(jset->version); -+ if (journal_entry_err_on(!bch2_version_compatible(version), -+ c, version, jset, NULL, -+ "%s sector %llu seq %llu: incompatible journal entry version %u.%u", -+ ca ? ca->name : c->name, -+ sector, le64_to_cpu(jset->seq), -+ BCH_VERSION_MAJOR(version), -+ BCH_VERSION_MINOR(version))) { -+ /* don't try to continue: */ -+ return -EINVAL; -+ } -+ -+ if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), -+ c, version, jset, NULL, -+ "%s sector %llu seq %llu: journal entry with unknown csum type %llu", -+ ca ? ca->name : c->name, -+ sector, le64_to_cpu(jset->seq), -+ JSET_CSUM_TYPE(jset))) -+ ret = JOURNAL_ENTRY_BAD; -+ -+ /* last_seq is ignored when JSET_NO_FLUSH is true */ -+ if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && -+ le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), -+ c, version, jset, NULL, -+ "invalid journal entry: last_seq > seq (%llu > %llu)", -+ le64_to_cpu(jset->last_seq), -+ le64_to_cpu(jset->seq))) { -+ jset->last_seq = jset->seq; -+ return JOURNAL_ENTRY_BAD; -+ } -+ -+ ret = jset_validate_entries(c, jset, flags); -+fsck_err: -+ return ret; -+} -+ -+static int jset_validate_early(struct bch_fs *c, -+ struct bch_dev *ca, -+ struct jset *jset, u64 sector, -+ unsigned bucket_sectors_left, -+ unsigned sectors_read) -+{ -+ size_t bytes = vstruct_bytes(jset); -+ unsigned version; -+ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; -+ int ret = 0; -+ -+ if (le64_to_cpu(jset->magic) != jset_magic(c)) -+ return JOURNAL_ENTRY_NONE; -+ -+ version = le32_to_cpu(jset->version); -+ if (journal_entry_err_on(!bch2_version_compatible(version), -+ c, version, jset, NULL, -+ "%s sector %llu seq %llu: unknown journal entry version %u.%u", -+ ca ? 
ca->name : c->name, -+ sector, le64_to_cpu(jset->seq), -+ BCH_VERSION_MAJOR(version), -+ BCH_VERSION_MINOR(version))) { -+ /* don't try to continue: */ -+ return -EINVAL; -+ } -+ -+ if (bytes > (sectors_read << 9) && -+ sectors_read < bucket_sectors_left) -+ return JOURNAL_ENTRY_REREAD; -+ -+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, -+ c, version, jset, NULL, -+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", -+ ca ? ca->name : c->name, -+ sector, le64_to_cpu(jset->seq), bytes)) -+ le32_add_cpu(&jset->u64s, -+ -((bytes - (bucket_sectors_left << 9)) / 8)); -+fsck_err: -+ return ret; -+} -+ -+struct journal_read_buf { -+ void *data; -+ size_t size; -+}; -+ -+static int journal_read_buf_realloc(struct journal_read_buf *b, -+ size_t new_size) -+{ -+ void *n; -+ -+ /* the bios are sized for this many pages, max: */ -+ if (new_size > JOURNAL_ENTRY_SIZE_MAX) -+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc; -+ -+ new_size = roundup_pow_of_two(new_size); -+ n = kvpmalloc(new_size, GFP_KERNEL); -+ if (!n) -+ return -BCH_ERR_ENOMEM_journal_read_buf_realloc; -+ -+ kvpfree(b->data, b->size); -+ b->data = n; -+ b->size = new_size; -+ return 0; -+} -+ -+static int journal_read_bucket(struct bch_dev *ca, -+ struct journal_read_buf *buf, -+ struct journal_list *jlist, -+ unsigned bucket) -+{ -+ struct bch_fs *c = ca->fs; -+ struct journal_device *ja = &ca->journal; -+ struct jset *j = NULL; -+ unsigned sectors, sectors_read = 0; -+ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), -+ end = offset + ca->mi.bucket_size; -+ bool saw_bad = false, csum_good; -+ int ret = 0; -+ -+ pr_debug("reading %u", bucket); -+ -+ while (offset < end) { -+ if (!sectors_read) { -+ struct bio *bio; -+ unsigned nr_bvecs; -+reread: -+ sectors_read = min_t(unsigned, -+ end - offset, buf->size >> 9); -+ nr_bvecs = buf_pages(buf->data, sectors_read << 9); -+ -+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); -+ bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); -+ -+ bio->bi_iter.bi_sector = offset; -+ bch2_bio_map(bio, buf->data, sectors_read << 9); -+ -+ ret = submit_bio_wait(bio); -+ kfree(bio); -+ -+ if (bch2_dev_io_err_on(ret, ca, -+ "journal read error: sector %llu", -+ offset) || -+ bch2_meta_read_fault("journal")) { -+ /* -+ * We don't error out of the recovery process -+ * here, since the relevant journal entry may be -+ * found on a different device, and missing or -+ * no journal entries will be handled later -+ */ -+ return 0; -+ } -+ -+ j = buf->data; -+ } -+ -+ ret = jset_validate_early(c, ca, j, offset, -+ end - offset, sectors_read); -+ switch (ret) { -+ case 0: -+ sectors = vstruct_sectors(j, c->block_bits); -+ break; -+ case JOURNAL_ENTRY_REREAD: -+ if (vstruct_bytes(j) > buf->size) { -+ ret = journal_read_buf_realloc(buf, -+ vstruct_bytes(j)); -+ if (ret) -+ return ret; -+ } -+ goto reread; -+ case JOURNAL_ENTRY_NONE: -+ if (!saw_bad) -+ return 0; -+ /* -+ * On checksum error we don't really trust the size -+ * field of the journal entry we read, so try reading -+ * again at next block boundary: -+ */ -+ sectors = block_sectors(c); -+ goto next_block; -+ default: -+ return ret; -+ } -+ -+ /* -+ * This happens sometimes if we don't have discards on - -+ * when we've partially overwritten a bucket with new -+ * journal entries. 
We don't need the rest of the -+ * bucket: -+ */ -+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) -+ return 0; -+ -+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq); -+ -+ csum_good = jset_csum_good(c, j); -+ if (!csum_good) -+ saw_bad = true; -+ -+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), -+ j->encrypted_start, -+ vstruct_end(j) - (void *) j->encrypted_start); -+ bch2_fs_fatal_err_on(ret, c, -+ "error decrypting journal entry: %i", ret); -+ -+ mutex_lock(&jlist->lock); -+ ret = journal_entry_add(c, ca, (struct journal_ptr) { -+ .csum_good = csum_good, -+ .dev = ca->dev_idx, -+ .bucket = bucket, -+ .bucket_offset = offset - -+ bucket_to_sector(ca, ja->buckets[bucket]), -+ .sector = offset, -+ }, jlist, j); -+ mutex_unlock(&jlist->lock); -+ -+ switch (ret) { -+ case JOURNAL_ENTRY_ADD_OK: -+ break; -+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: -+ break; -+ default: -+ return ret; -+ } -+next_block: -+ pr_debug("next"); -+ offset += sectors; -+ sectors_read -= sectors; -+ j = ((void *) j) + (sectors << 9); -+ } -+ -+ return 0; -+} -+ -+static void bch2_journal_read_device(struct closure *cl) -+{ -+ struct journal_device *ja = -+ container_of(cl, struct journal_device, read); -+ struct bch_dev *ca = container_of(ja, struct bch_dev, journal); -+ struct bch_fs *c = ca->fs; -+ struct journal_list *jlist = -+ container_of(cl->parent, struct journal_list, cl); -+ struct journal_replay *r, **_r; -+ struct genradix_iter iter; -+ struct journal_read_buf buf = { NULL, 0 }; -+ unsigned i; -+ int ret = 0; -+ -+ if (!ja->nr) -+ goto out; -+ -+ ret = journal_read_buf_realloc(&buf, PAGE_SIZE); -+ if (ret) -+ goto err; -+ -+ pr_debug("%u journal buckets", ja->nr); -+ -+ for (i = 0; i < ja->nr; i++) { -+ ret = journal_read_bucket(ca, &buf, jlist, i); -+ if (ret) -+ goto err; -+ } -+ -+ ja->sectors_free = ca->mi.bucket_size; -+ -+ mutex_lock(&jlist->lock); -+ genradix_for_each_reverse(&c->journal_entries, iter, _r) { -+ r = *_r; -+ -+ if (!r) -+ continue; -+ -+ for (i = 0; i < r->nr_ptrs; i++) { -+ if (r->ptrs[i].dev == ca->dev_idx) { -+ unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + -+ vstruct_sectors(&r->j, c->block_bits); -+ -+ ja->cur_idx = r->ptrs[i].bucket; -+ ja->sectors_free = ca->mi.bucket_size - wrote; -+ goto found; -+ } -+ } -+ } -+found: -+ mutex_unlock(&jlist->lock); -+ -+ if (ja->bucket_seq[ja->cur_idx] && -+ ja->sectors_free == ca->mi.bucket_size) { -+ bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); -+ bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); -+ for (i = 0; i < 3; i++) { -+ unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; -+ -+ bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); -+ } -+ ja->sectors_free = 0; -+ } -+ -+ /* -+ * Set dirty_idx to indicate the entire journal is full and needs to be -+ * reclaimed - journal reclaim will immediately reclaim whatever isn't -+ * pinned when it first runs: -+ */ -+ ja->discard_idx = ja->dirty_idx_ondisk = -+ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; -+out: -+ bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); -+ kvpfree(buf.data, buf.size); -+ percpu_ref_put(&ca->io_ref); -+ closure_return(cl); -+ return; -+err: -+ mutex_lock(&jlist->lock); -+ jlist->ret = ret; -+ mutex_unlock(&jlist->lock); -+ goto out; -+} -+ -+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, -+ struct journal_replay *j) -+{ -+ unsigned i; -+ -+ for (i = 0; i < j->nr_ptrs; i++) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); -+ u64 offset; -+ -+ 
div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); -+ -+ if (i) -+ prt_printf(out, " "); -+ prt_printf(out, "%u:%u:%u (sector %llu)", -+ j->ptrs[i].dev, -+ j->ptrs[i].bucket, -+ j->ptrs[i].bucket_offset, -+ j->ptrs[i].sector); -+ } -+} -+ -+int bch2_journal_read(struct bch_fs *c, -+ u64 *last_seq, -+ u64 *blacklist_seq, -+ u64 *start_seq) -+{ -+ struct journal_list jlist; -+ struct journal_replay *i, **_i, *prev = NULL; -+ struct genradix_iter radix_iter; -+ struct bch_dev *ca; -+ unsigned iter; -+ struct printbuf buf = PRINTBUF; -+ bool degraded = false, last_write_torn = false; -+ u64 seq; -+ int ret = 0; -+ -+ closure_init_stack(&jlist.cl); -+ mutex_init(&jlist.lock); -+ jlist.last_seq = 0; -+ jlist.ret = 0; -+ -+ for_each_member_device(ca, c, iter) { -+ if (!c->opts.fsck && -+ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) -+ continue; -+ -+ if ((ca->mi.state == BCH_MEMBER_STATE_rw || -+ ca->mi.state == BCH_MEMBER_STATE_ro) && -+ percpu_ref_tryget(&ca->io_ref)) -+ closure_call(&ca->journal.read, -+ bch2_journal_read_device, -+ system_unbound_wq, -+ &jlist.cl); -+ else -+ degraded = true; -+ } -+ -+ closure_sync(&jlist.cl); -+ -+ if (jlist.ret) -+ return jlist.ret; -+ -+ *last_seq = 0; -+ *start_seq = 0; -+ *blacklist_seq = 0; -+ -+ /* -+ * Find most recent flush entry, and ignore newer non flush entries - -+ * those entries will be blacklisted: -+ */ -+ genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { -+ enum bkey_invalid_flags flags = BKEY_INVALID_JOURNAL; -+ -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ if (!*start_seq) -+ *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; -+ -+ if (JSET_NO_FLUSH(&i->j)) { -+ i->ignore = true; -+ continue; -+ } -+ -+ if (!last_write_torn && !i->csum_good) { -+ last_write_torn = true; -+ i->ignore = true; -+ continue; -+ } -+ -+ if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), -+ c, le32_to_cpu(i->j.version), &i->j, NULL, -+ "invalid journal entry: last_seq > seq (%llu > %llu)", -+ le64_to_cpu(i->j.last_seq), -+ le64_to_cpu(i->j.seq))) -+ i->j.last_seq = i->j.seq; -+ -+ *last_seq = le64_to_cpu(i->j.last_seq); -+ *blacklist_seq = le64_to_cpu(i->j.seq) + 1; -+ break; -+ } -+ -+ if (!*start_seq) { -+ bch_info(c, "journal read done, but no entries found"); -+ return 0; -+ } -+ -+ if (!*last_seq) { -+ fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); -+ return 0; -+ } -+ -+ bch_info(c, "journal read done, replaying entries %llu-%llu", -+ *last_seq, *blacklist_seq - 1); -+ -+ if (*start_seq != *blacklist_seq) -+ bch_info(c, "dropped unflushed entries %llu-%llu", -+ *blacklist_seq, *start_seq - 1); -+ -+ /* Drop blacklisted entries and entries older than last_seq: */ -+ genradix_for_each(&c->journal_entries, radix_iter, _i) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ seq = le64_to_cpu(i->j.seq); -+ if (seq < *last_seq) { -+ journal_replay_free(c, i); -+ continue; -+ } -+ -+ if (bch2_journal_seq_is_blacklisted(c, seq, true)) { -+ fsck_err_on(!JSET_NO_FLUSH(&i->j), c, -+ "found blacklisted journal entry %llu", seq); -+ i->ignore = true; -+ } -+ } -+ -+ /* Check for missing entries: */ -+ seq = *last_seq; -+ genradix_for_each(&c->journal_entries, radix_iter, _i) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ BUG_ON(seq > le64_to_cpu(i->j.seq)); -+ -+ while (seq < le64_to_cpu(i->j.seq)) { -+ u64 missing_start, missing_end; -+ struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; -+ -+ while (seq < le64_to_cpu(i->j.seq) && -+ 
bch2_journal_seq_is_blacklisted(c, seq, false)) -+ seq++; -+ -+ if (seq == le64_to_cpu(i->j.seq)) -+ break; -+ -+ missing_start = seq; -+ -+ while (seq < le64_to_cpu(i->j.seq) && -+ !bch2_journal_seq_is_blacklisted(c, seq, false)) -+ seq++; -+ -+ if (prev) { -+ bch2_journal_ptrs_to_text(&buf1, c, prev); -+ prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); -+ } else -+ prt_printf(&buf1, "(none)"); -+ bch2_journal_ptrs_to_text(&buf2, c, i); -+ -+ missing_end = seq - 1; -+ fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" -+ " prev at %s\n" -+ " next at %s", -+ missing_start, missing_end, -+ *last_seq, *blacklist_seq - 1, -+ buf1.buf, buf2.buf); -+ -+ printbuf_exit(&buf1); -+ printbuf_exit(&buf2); -+ } -+ -+ prev = i; -+ seq++; -+ } -+ -+ genradix_for_each(&c->journal_entries, radix_iter, _i) { -+ struct bch_replicas_padded replicas = { -+ .e.data_type = BCH_DATA_journal, -+ .e.nr_required = 1, -+ }; -+ unsigned ptr; -+ -+ i = *_i; -+ if (!i || i->ignore) -+ continue; -+ -+ for (ptr = 0; ptr < i->nr_ptrs; ptr++) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); -+ -+ if (!i->ptrs[ptr].csum_good) -+ bch_err_dev_offset(ca, i->ptrs[ptr].sector, -+ "invalid journal checksum, seq %llu%s", -+ le64_to_cpu(i->j.seq), -+ i->csum_good ? " (had good copy on another device)" : ""); -+ } -+ -+ ret = jset_validate(c, -+ bch_dev_bkey_exists(c, i->ptrs[0].dev), -+ &i->j, -+ i->ptrs[0].sector, -+ READ); -+ if (ret) -+ goto err; -+ -+ for (ptr = 0; ptr < i->nr_ptrs; ptr++) -+ replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; -+ -+ bch2_replicas_entry_sort(&replicas.e); -+ -+ printbuf_reset(&buf); -+ bch2_replicas_entry_to_text(&buf, &replicas.e); -+ -+ if (!degraded && -+ !bch2_replicas_marked(c, &replicas.e) && -+ (le64_to_cpu(i->j.seq) == *last_seq || -+ fsck_err(c, "superblock not marked as containing replicas for journal entry %llu\n %s", -+ le64_to_cpu(i->j.seq), buf.buf))) { -+ ret = bch2_mark_replicas(c, &replicas.e); -+ if (ret) -+ goto err; -+ } -+ } -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+/* journal write: */ -+ -+static void __journal_write_alloc(struct journal *j, -+ struct journal_buf *w, -+ struct dev_alloc_list *devs_sorted, -+ unsigned sectors, -+ unsigned *replicas, -+ unsigned replicas_want) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_device *ja; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (*replicas >= replicas_want) -+ return; -+ -+ for (i = 0; i < devs_sorted->nr; i++) { -+ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); -+ if (!ca) -+ continue; -+ -+ ja = &ca->journal; -+ -+ /* -+ * Check that we can use this device, and aren't already using -+ * it: -+ */ -+ if (!ca->mi.durability || -+ ca->mi.state != BCH_MEMBER_STATE_rw || -+ !ja->nr || -+ bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || -+ sectors > ja->sectors_free) -+ continue; -+ -+ bch2_dev_stripe_increment(ca, &j->wp.stripe); -+ -+ bch2_bkey_append_ptr(&w->key, -+ (struct bch_extent_ptr) { -+ .offset = bucket_to_sector(ca, -+ ja->buckets[ja->cur_idx]) + -+ ca->mi.bucket_size - -+ ja->sectors_free, -+ .dev = ca->dev_idx, -+ }); -+ -+ ja->sectors_free -= sectors; -+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); -+ -+ *replicas += ca->mi.durability; -+ -+ if (*replicas >= replicas_want) -+ break; -+ } -+} -+ -+/** -+ * journal_next_bucket - move on to the next journal bucket if possible -+ */ -+static int journal_write_alloc(struct journal *j, struct journal_buf 
*w, -+ unsigned sectors) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_devs_mask devs; -+ struct journal_device *ja; -+ struct bch_dev *ca; -+ struct dev_alloc_list devs_sorted; -+ unsigned target = c->opts.metadata_target ?: -+ c->opts.foreground_target; -+ unsigned i, replicas = 0, replicas_want = -+ READ_ONCE(c->opts.metadata_replicas); -+ -+ rcu_read_lock(); -+retry: -+ devs = target_rw_devs(c, BCH_DATA_journal, target); -+ -+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); -+ -+ __journal_write_alloc(j, w, &devs_sorted, -+ sectors, &replicas, replicas_want); -+ -+ if (replicas >= replicas_want) -+ goto done; -+ -+ for (i = 0; i < devs_sorted.nr; i++) { -+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); -+ if (!ca) -+ continue; -+ -+ ja = &ca->journal; -+ -+ if (sectors > ja->sectors_free && -+ sectors <= ca->mi.bucket_size && -+ bch2_journal_dev_buckets_available(j, ja, -+ journal_space_discarded)) { -+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -+ ja->sectors_free = ca->mi.bucket_size; -+ -+ /* -+ * ja->bucket_seq[ja->cur_idx] must always have -+ * something sensible: -+ */ -+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); -+ } -+ } -+ -+ __journal_write_alloc(j, w, &devs_sorted, -+ sectors, &replicas, replicas_want); -+ -+ if (replicas < replicas_want && target) { -+ /* Retry from all devices: */ -+ target = 0; -+ goto retry; -+ } -+done: -+ rcu_read_unlock(); -+ -+ BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); -+ -+ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; -+} -+ -+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -+{ -+ /* we aren't holding j->lock: */ -+ unsigned new_size = READ_ONCE(j->buf_size_want); -+ void *new_buf; -+ -+ if (buf->buf_size >= new_size) -+ return; -+ -+ new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); -+ if (!new_buf) -+ return; -+ -+ memcpy(new_buf, buf->data, buf->buf_size); -+ -+ spin_lock(&j->lock); -+ swap(buf->data, new_buf); -+ swap(buf->buf_size, new_size); -+ spin_unlock(&j->lock); -+ -+ kvpfree(new_buf, new_size); -+} -+ -+static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) -+{ -+ return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); -+} -+ -+static void journal_write_done(struct closure *cl) -+{ -+ struct journal *j = container_of(cl, struct journal, io); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *w = journal_last_unwritten_buf(j); -+ struct bch_replicas_padded replicas; -+ union journal_res_state old, new; -+ u64 v, seq; -+ int err = 0; -+ -+ bch2_time_stats_update(!JSET_NO_FLUSH(w->data) -+ ? 
j->flush_write_time -+ : j->noflush_write_time, j->write_start_time); -+ -+ if (!w->devs_written.nr) { -+ bch_err(c, "unable to write journal to sufficient devices"); -+ err = -EIO; -+ } else { -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, -+ w->devs_written); -+ if (bch2_mark_replicas(c, &replicas.e)) -+ err = -EIO; -+ } -+ -+ if (err) -+ bch2_fatal_error(c); -+ -+ spin_lock(&j->lock); -+ seq = le64_to_cpu(w->data->seq); -+ -+ if (seq >= j->pin.front) -+ journal_seq_pin(j, seq)->devs = w->devs_written; -+ -+ if (!err) { -+ if (!JSET_NO_FLUSH(w->data)) { -+ j->flushed_seq_ondisk = seq; -+ j->last_seq_ondisk = w->last_seq; -+ -+ bch2_do_discards(c); -+ closure_wake_up(&c->freelist_wait); -+ -+ bch2_reset_alloc_cursors(c); -+ } -+ } else if (!j->err_seq || seq < j->err_seq) -+ j->err_seq = seq; -+ -+ j->seq_ondisk = seq; -+ -+ /* -+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard -+ * more buckets: -+ * -+ * Must come before signaling write completion, for -+ * bch2_fs_journal_stop(): -+ */ -+ if (j->watermark != BCH_WATERMARK_stripe) -+ journal_reclaim_kick(&c->journal); -+ -+ /* also must come before signalling write completion: */ -+ closure_debug_destroy(cl); -+ -+ v = atomic64_read(&j->reservations.counter); -+ do { -+ old.v = new.v = v; -+ BUG_ON(journal_state_count(new, new.unwritten_idx)); -+ -+ new.unwritten_idx++; -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ bch2_journal_space_available(j); -+ -+ closure_wake_up(&w->wait); -+ journal_wake(j); -+ -+ if (!journal_state_count(new, new.unwritten_idx) && -+ journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { -+ closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); -+ } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && -+ new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { -+ struct journal_buf *buf = journal_cur_buf(j); -+ long delta = buf->expires - jiffies; -+ -+ /* -+ * We don't close a journal entry to write it while there's -+ * previous entries still in flight - the current journal entry -+ * might want to be written now: -+ */ -+ -+ mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); -+ } -+ -+ spin_unlock(&j->lock); -+} -+ -+static void journal_write_endio(struct bio *bio) -+{ -+ struct bch_dev *ca = bio->bi_private; -+ struct journal *j = &ca->fs->journal; -+ struct journal_buf *w = journal_last_unwritten_buf(j); -+ unsigned long flags; -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s", -+ le64_to_cpu(w->data->seq), -+ bch2_blk_status_to_str(bio->bi_status)) || -+ bch2_meta_write_fault("journal")) { -+ spin_lock_irqsave(&j->err_lock, flags); -+ bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); -+ spin_unlock_irqrestore(&j->err_lock, flags); -+ } -+ -+ closure_put(&j->io); -+ percpu_ref_put(&ca->io_ref); -+} -+ -+static void do_journal_write(struct closure *cl) -+{ -+ struct journal *j = container_of(cl, struct journal, io); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ struct journal_buf *w = journal_last_unwritten_buf(j); -+ struct bch_extent_ptr *ptr; -+ struct bio *bio; -+ unsigned sectors = vstruct_sectors(w->data, c->block_bits); -+ -+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ if (!percpu_ref_tryget(&ca->io_ref)) { -+ /* XXX: fix this */ -+ bch_err(c, "missing device for journal write\n"); -+ continue; -+ } -+ -+ 
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], -+ sectors); -+ -+ bio = ca->journal.bio; -+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); -+ bio->bi_iter.bi_sector = ptr->offset; -+ bio->bi_end_io = journal_write_endio; -+ bio->bi_private = ca; -+ -+ BUG_ON(bio->bi_iter.bi_sector == ca->prev_journal_sector); -+ ca->prev_journal_sector = bio->bi_iter.bi_sector; -+ -+ if (!JSET_NO_FLUSH(w->data)) -+ bio->bi_opf |= REQ_FUA; -+ if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) -+ bio->bi_opf |= REQ_PREFLUSH; -+ -+ bch2_bio_map(bio, w->data, sectors << 9); -+ -+ trace_and_count(c, journal_write, bio); -+ closure_bio_submit(bio, cl); -+ -+ ca->journal.bucket_seq[ca->journal.cur_idx] = -+ le64_to_cpu(w->data->seq); -+ } -+ -+ continue_at(cl, journal_write_done, c->io_complete_wq); -+} -+ -+static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) -+{ -+ struct jset_entry *i, *next, *prev = NULL; -+ -+ /* -+ * Simple compaction, dropping empty jset_entries (from journal -+ * reservations that weren't fully used) and merging jset_entries that -+ * can be. -+ * -+ * If we wanted to be really fancy here, we could sort all the keys in -+ * the jset and drop keys that were overwritten - probably not worth it: -+ */ -+ vstruct_for_each_safe(jset, i, next) { -+ unsigned u64s = le16_to_cpu(i->u64s); -+ -+ /* Empty entry: */ -+ if (!u64s) -+ continue; -+ -+ if (i->type == BCH_JSET_ENTRY_btree_root) -+ bch2_journal_entry_to_btree_root(c, i); -+ -+ /* Can we merge with previous entry? */ -+ if (prev && -+ i->btree_id == prev->btree_id && -+ i->level == prev->level && -+ i->type == prev->type && -+ i->type == BCH_JSET_ENTRY_btree_keys && -+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { -+ memmove_u64s_down(vstruct_next(prev), -+ i->_data, -+ u64s); -+ le16_add_cpu(&prev->u64s, u64s); -+ continue; -+ } -+ -+ /* Couldn't merge, move i into new position (after prev): */ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ if (i != prev) -+ memmove_u64s_down(prev, i, jset_u64s(u64s)); -+ } -+ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -+} -+ -+void bch2_journal_write(struct closure *cl) -+{ -+ struct journal *j = container_of(cl, struct journal, io); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ struct journal_buf *w = journal_last_unwritten_buf(j); -+ struct bch_replicas_padded replicas; -+ struct jset_entry *start, *end; -+ struct jset *jset; -+ struct bio *bio; -+ struct printbuf journal_debug_buf = PRINTBUF; -+ bool validate_before_checksum = false; -+ unsigned i, sectors, bytes, u64s, nr_rw_members = 0; -+ int ret; -+ -+ BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); -+ -+ journal_buf_realloc(j, w); -+ jset = w->data; -+ -+ j->write_start_time = local_clock(); -+ -+ spin_lock(&j->lock); -+ -+ /* -+ * If the journal is in an error state - we did an emergency shutdown - -+ * we prefer to continue doing journal writes. We just mark them as -+ * noflush so they'll never be used, but they'll still be visible by the -+ * list_journal tool - this helps in debugging. -+ * -+ * There's a caveat: the first journal write after marking the -+ * superblock dirty must always be a flush write, because on startup -+ * from a clean shutdown we didn't necessarily read the journal and the -+ * new journal write might overwrite whatever was in the journal -+ * previously - we can't leave the journal without any flush writes in -+ * it. 
-+ * -+ * So if we're in an error state, and we're still starting up, we don't -+ * write anything at all. -+ */ -+ if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) && -+ (bch2_journal_error(j) || -+ w->noflush || -+ (!w->must_flush && -+ (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && -+ test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { -+ w->noflush = true; -+ SET_JSET_NO_FLUSH(jset, true); -+ jset->last_seq = 0; -+ w->last_seq = 0; -+ -+ j->nr_noflush_writes++; -+ } else if (!bch2_journal_error(j)) { -+ j->last_flush_write = jiffies; -+ j->nr_flush_writes++; -+ clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); -+ } else { -+ spin_unlock(&j->lock); -+ goto err; -+ } -+ spin_unlock(&j->lock); -+ -+ /* -+ * New btree roots are set by journalling them; when the journal entry -+ * gets written we have to propagate them to c->btree_roots -+ * -+ * But, every journal entry we write has to contain all the btree roots -+ * (at least for now); so after we copy btree roots to c->btree_roots we -+ * have to get any missing btree roots and add them to this journal -+ * entry: -+ */ -+ -+ bch2_journal_entries_postprocess(c, jset); -+ -+ start = end = vstruct_last(jset); -+ -+ end = bch2_btree_roots_to_journal_entries(c, jset->start, end); -+ -+ bch2_journal_super_entries_add_common(c, &end, -+ le64_to_cpu(jset->seq)); -+ u64s = (u64 *) end - (u64 *) start; -+ BUG_ON(u64s > j->entry_u64s_reserved); -+ -+ le32_add_cpu(&jset->u64s, u64s); -+ -+ sectors = vstruct_sectors(jset, c->block_bits); -+ bytes = vstruct_bytes(jset); -+ -+ if (sectors > w->sectors) { -+ bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", -+ vstruct_bytes(jset), w->sectors << 9, -+ u64s, w->u64s_reserved, j->entry_u64s_reserved); -+ goto err; -+ } -+ -+ jset->magic = cpu_to_le64(jset_magic(c)); -+ jset->version = cpu_to_le32(c->sb.version); -+ -+ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); -+ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); -+ -+ if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) -+ j->last_empty_seq = le64_to_cpu(jset->seq); -+ -+ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) -+ validate_before_checksum = true; -+ -+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) -+ validate_before_checksum = true; -+ -+ if (validate_before_checksum && -+ jset_validate(c, NULL, jset, 0, WRITE)) -+ goto err; -+ -+ ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), -+ jset->encrypted_start, -+ vstruct_end(jset) - (void *) jset->encrypted_start); -+ if (bch2_fs_fatal_err_on(ret, c, -+ "error decrypting journal entry: %i", ret)) -+ goto err; -+ -+ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), -+ journal_nonce(jset), jset); -+ -+ if (!validate_before_checksum && -+ jset_validate(c, NULL, jset, 0, WRITE)) -+ goto err; -+ -+ memset((void *) jset + bytes, 0, (sectors << 9) - bytes); -+ -+retry_alloc: -+ spin_lock(&j->lock); -+ ret = journal_write_alloc(j, w, sectors); -+ -+ if (ret && j->can_discard) { -+ spin_unlock(&j->lock); -+ bch2_journal_do_discards(j); -+ goto retry_alloc; -+ } -+ -+ if (ret) -+ __bch2_journal_debug_to_text(&journal_debug_buf, j); -+ -+ /* -+ * write is allocated, no longer need to account for it in -+ * bch2_journal_space_available(): -+ */ -+ w->sectors = 0; -+ -+ /* -+ * journal entry has been compacted and allocated, recalculate space -+ * available: -+ */ -+ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ -+ if (ret) { -+ bch_err(c, "Unable to 
allocate journal write:\n%s", -+ journal_debug_buf.buf); -+ printbuf_exit(&journal_debug_buf); -+ goto err; -+ } -+ -+ w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); -+ -+ if (c->opts.nochanges) -+ goto no_io; -+ -+ for_each_rw_member(ca, c, i) -+ nr_rw_members++; -+ -+ if (nr_rw_members > 1) -+ w->separate_flush = true; -+ -+ /* -+ * Mark journal replicas before we submit the write to guarantee -+ * recovery will find the journal entries after a crash. -+ */ -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, -+ w->devs_written); -+ ret = bch2_mark_replicas(c, &replicas.e); -+ if (ret) -+ goto err; -+ -+ if (!JSET_NO_FLUSH(jset) && w->separate_flush) { -+ for_each_rw_member(ca, c, i) { -+ percpu_ref_get(&ca->io_ref); -+ -+ bio = ca->journal.bio; -+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); -+ bio->bi_end_io = journal_write_endio; -+ bio->bi_private = ca; -+ closure_bio_submit(bio, cl); -+ } -+ } -+ -+ continue_at(cl, do_journal_write, c->io_complete_wq); -+ return; -+no_io: -+ continue_at(cl, journal_write_done, c->io_complete_wq); -+ return; -+err: -+ bch2_fatal_error(c); -+ continue_at(cl, journal_write_done, c->io_complete_wq); -+} -diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h -new file mode 100644 -index 000000000..a88d097b1 ---- /dev/null -+++ b/fs/bcachefs/journal_io.h -@@ -0,0 +1,65 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_IO_H -+#define _BCACHEFS_JOURNAL_IO_H -+ -+/* -+ * Only used for holding the journal entries we read in btree_journal_read() -+ * during cache_registration -+ */ -+struct journal_replay { -+ struct journal_ptr { -+ bool csum_good; -+ u8 dev; -+ u32 bucket; -+ u32 bucket_offset; -+ u64 sector; -+ } ptrs[BCH_REPLICAS_MAX]; -+ unsigned nr_ptrs; -+ -+ bool csum_good; -+ bool ignore; -+ /* must be last: */ -+ struct jset j; -+}; -+ -+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, -+ struct jset_entry *entry, unsigned type) -+{ -+ while (entry < vstruct_last(jset)) { -+ if (entry->type == type) -+ return entry; -+ -+ entry = vstruct_next(entry); -+ } -+ -+ return NULL; -+} -+ -+#define for_each_jset_entry_type(entry, jset, type) \ -+ for (entry = (jset)->start; \ -+ (entry = __jset_entry_type_next(jset, entry, type)); \ -+ entry = vstruct_next(entry)) -+ -+#define jset_entry_for_each_key(_e, _k) \ -+ for (_k = (_e)->start; \ -+ _k < vstruct_last(_e); \ -+ _k = bkey_next(_k)) -+ -+#define for_each_jset_key(k, entry, jset) \ -+ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\ -+ jset_entry_for_each_key(entry, k) -+ -+int bch2_journal_entry_validate(struct bch_fs *, struct jset *, -+ struct jset_entry *, unsigned, int, -+ enum bkey_invalid_flags); -+void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, -+ struct jset_entry *); -+ -+void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, -+ struct journal_replay *); -+ -+int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); -+ -+void bch2_journal_write(struct closure *); -+ -+#endif /* _BCACHEFS_JOURNAL_IO_H */ -diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c -new file mode 100644 -index 000000000..10e1860da ---- /dev/null -+++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,874 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_key_cache.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "errcode.h" -+#include "error.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" 
-+#include "replicas.h" -+#include "sb-members.h" -+#include "trace.h" -+ -+#include -+#include -+ -+/* Free space calculations: */ -+ -+static unsigned journal_space_from(struct journal_device *ja, -+ enum journal_space_from from) -+{ -+ switch (from) { -+ case journal_space_discarded: -+ return ja->discard_idx; -+ case journal_space_clean_ondisk: -+ return ja->dirty_idx_ondisk; -+ case journal_space_clean: -+ return ja->dirty_idx; -+ default: -+ BUG(); -+ } -+} -+ -+unsigned bch2_journal_dev_buckets_available(struct journal *j, -+ struct journal_device *ja, -+ enum journal_space_from from) -+{ -+ unsigned available = (journal_space_from(ja, from) - -+ ja->cur_idx - 1 + ja->nr) % ja->nr; -+ -+ /* -+ * Don't use the last bucket unless writing the new last_seq -+ * will make another bucket available: -+ */ -+ if (available && ja->dirty_idx_ondisk == ja->dirty_idx) -+ --available; -+ -+ return available; -+} -+ -+static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) -+{ -+ union journal_preres_state old, new; -+ u64 v = atomic64_read(&j->prereserved.counter); -+ -+ do { -+ old.v = new.v = v; -+ new.remaining = u64s_remaining; -+ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, -+ old.v, new.v)) != old.v); -+} -+ -+static struct journal_space -+journal_dev_space_available(struct journal *j, struct bch_dev *ca, -+ enum journal_space_from from) -+{ -+ struct journal_device *ja = &ca->journal; -+ unsigned sectors, buckets, unwritten; -+ u64 seq; -+ -+ if (from == journal_space_total) -+ return (struct journal_space) { -+ .next_entry = ca->mi.bucket_size, -+ .total = ca->mi.bucket_size * ja->nr, -+ }; -+ -+ buckets = bch2_journal_dev_buckets_available(j, ja, from); -+ sectors = ja->sectors_free; -+ -+ /* -+ * We that we don't allocate the space for a journal entry -+ * until we write it out - thus, account for it here: -+ */ -+ for (seq = journal_last_unwritten_seq(j); -+ seq <= journal_cur_seq(j); -+ seq++) { -+ unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; -+ -+ if (!unwritten) -+ continue; -+ -+ /* entry won't fit on this device, skip: */ -+ if (unwritten > ca->mi.bucket_size) -+ continue; -+ -+ if (unwritten >= sectors) { -+ if (!buckets) { -+ sectors = 0; -+ break; -+ } -+ -+ buckets--; -+ sectors = ca->mi.bucket_size; -+ } -+ -+ sectors -= unwritten; -+ } -+ -+ if (sectors < ca->mi.bucket_size && buckets) { -+ buckets--; -+ sectors = ca->mi.bucket_size; -+ } -+ -+ return (struct journal_space) { -+ .next_entry = sectors, -+ .total = sectors + buckets * ca->mi.bucket_size, -+ }; -+} -+ -+static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, -+ enum journal_space_from from) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned i, pos, nr_devs = 0; -+ struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; -+ -+ BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, -+ &c->rw_devs[BCH_DATA_journal]) { -+ if (!ca->journal.nr) -+ continue; -+ -+ space = journal_dev_space_available(j, ca, from); -+ if (!space.next_entry) -+ continue; -+ -+ for (pos = 0; pos < nr_devs; pos++) -+ if (space.total > dev_space[pos].total) -+ break; -+ -+ array_insert_item(dev_space, nr_devs, pos, space); -+ } -+ rcu_read_unlock(); -+ -+ if (nr_devs < nr_devs_want) -+ return (struct journal_space) { 0, 0 }; -+ -+ /* -+ * We sorted largest to smallest, and we want the smallest out of the -+ * @nr_devs_want largest devices: -+ */ -+ return 
dev_space[nr_devs_want - 1]; -+} -+ -+void bch2_journal_space_available(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned clean, clean_ondisk, total; -+ s64 u64s_remaining = 0; -+ unsigned max_entry_size = min(j->buf[0].buf_size >> 9, -+ j->buf[1].buf_size >> 9); -+ unsigned i, nr_online = 0, nr_devs_want; -+ bool can_discard = false; -+ int ret = 0; -+ -+ lockdep_assert_held(&j->lock); -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, -+ &c->rw_devs[BCH_DATA_journal]) { -+ struct journal_device *ja = &ca->journal; -+ -+ if (!ja->nr) -+ continue; -+ -+ while (ja->dirty_idx != ja->cur_idx && -+ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) -+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; -+ -+ while (ja->dirty_idx_ondisk != ja->dirty_idx && -+ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) -+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; -+ -+ if (ja->discard_idx != ja->dirty_idx_ondisk) -+ can_discard = true; -+ -+ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); -+ nr_online++; -+ } -+ rcu_read_unlock(); -+ -+ j->can_discard = can_discard; -+ -+ if (nr_online < c->opts.metadata_replicas_required) { -+ ret = JOURNAL_ERR_insufficient_devices; -+ goto out; -+ } -+ -+ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); -+ -+ for (i = 0; i < journal_space_nr; i++) -+ j->space[i] = __journal_space_available(j, nr_devs_want, i); -+ -+ clean_ondisk = j->space[journal_space_clean_ondisk].total; -+ clean = j->space[journal_space_clean].total; -+ total = j->space[journal_space_total].total; -+ -+ if (!j->space[journal_space_discarded].next_entry) -+ ret = JOURNAL_ERR_journal_full; -+ -+ if ((j->space[journal_space_clean_ondisk].next_entry < -+ j->space[journal_space_clean_ondisk].total) && -+ (clean - clean_ondisk <= total / 8) && -+ (clean_ondisk * 2 > clean)) -+ set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); -+ else -+ clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); -+ -+ u64s_remaining = (u64) clean << 6; -+ u64s_remaining -= (u64) total << 3; -+ u64s_remaining = max(0LL, u64s_remaining); -+ u64s_remaining /= 4; -+ u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); -+out: -+ j->cur_entry_sectors = !ret ? 
j->space[journal_space_discarded].next_entry : 0; -+ j->cur_entry_error = ret; -+ journal_set_remaining(j, u64s_remaining); -+ journal_set_watermark(j); -+ -+ if (!ret) -+ journal_wake(j); -+} -+ -+/* Discards - last part of journal reclaim: */ -+ -+static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -+{ -+ bool ret; -+ -+ spin_lock(&j->lock); -+ ret = ja->discard_idx != ja->dirty_idx_ondisk; -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+/* -+ * Advance ja->discard_idx as long as it points to buckets that are no longer -+ * dirty, issuing discards if necessary: -+ */ -+void bch2_journal_do_discards(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned iter; -+ -+ mutex_lock(&j->discard_lock); -+ -+ for_each_rw_member(ca, c, iter) { -+ struct journal_device *ja = &ca->journal; -+ -+ while (should_discard_bucket(j, ja)) { -+ if (!c->opts.nochanges && -+ ca->mi.discard && -+ bdev_max_discard_sectors(ca->disk_sb.bdev)) -+ blkdev_issue_discard(ca->disk_sb.bdev, -+ bucket_to_sector(ca, -+ ja->buckets[ja->discard_idx]), -+ ca->mi.bucket_size, GFP_NOFS); -+ -+ spin_lock(&j->lock); -+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; -+ -+ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ } -+ } -+ -+ mutex_unlock(&j->discard_lock); -+} -+ -+/* -+ * Journal entry pinning - machinery for holding a reference on a given journal -+ * entry, holding it open to ensure it gets replayed during recovery: -+ */ -+ -+static void bch2_journal_reclaim_fast(struct journal *j) -+{ -+ struct journal_entry_pin_list temp; -+ bool popped = false; -+ -+ lockdep_assert_held(&j->lock); -+ -+ /* -+ * Unpin journal entries whose reference counts reached zero, meaning -+ * all btree nodes got written out -+ */ -+ while (!fifo_empty(&j->pin) && -+ !atomic_read(&fifo_peek_front(&j->pin).count)) { -+ fifo_pop(&j->pin, temp); -+ popped = true; -+ } -+ -+ if (popped) -+ bch2_journal_space_available(j); -+} -+ -+void __bch2_journal_pin_put(struct journal *j, u64 seq) -+{ -+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); -+ -+ if (atomic_dec_and_test(&pin_list->count)) -+ bch2_journal_reclaim_fast(j); -+} -+ -+void bch2_journal_pin_put(struct journal *j, u64 seq) -+{ -+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); -+ -+ if (atomic_dec_and_test(&pin_list->count)) { -+ spin_lock(&j->lock); -+ bch2_journal_reclaim_fast(j); -+ spin_unlock(&j->lock); -+ } -+} -+ -+static inline bool __journal_pin_drop(struct journal *j, -+ struct journal_entry_pin *pin) -+{ -+ struct journal_entry_pin_list *pin_list; -+ -+ if (!journal_pin_active(pin)) -+ return false; -+ -+ if (j->flush_in_progress == pin) -+ j->flush_in_progress_dropped = true; -+ -+ pin_list = journal_seq_pin(j, pin->seq); -+ pin->seq = 0; -+ list_del_init(&pin->list); -+ -+ /* -+ * Unpinning a journal entry may make journal_next_bucket() succeed, if -+ * writing a new last_seq will now make another bucket available: -+ */ -+ return atomic_dec_and_test(&pin_list->count) && -+ pin_list == &fifo_peek_front(&j->pin); -+} -+ -+void bch2_journal_pin_drop(struct journal *j, -+ struct journal_entry_pin *pin) -+{ -+ spin_lock(&j->lock); -+ if (__journal_pin_drop(j, pin)) -+ bch2_journal_reclaim_fast(j); -+ spin_unlock(&j->lock); -+} -+ -+static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) -+{ -+ if (fn == bch2_btree_node_flush0 || -+ fn == bch2_btree_node_flush1) -+ return JOURNAL_PIN_btree; -+ else if (fn == 
bch2_btree_key_cache_journal_flush) -+ return JOURNAL_PIN_key_cache; -+ else -+ return JOURNAL_PIN_other; -+} -+ -+void bch2_journal_pin_set(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ struct journal_entry_pin_list *pin_list; -+ bool reclaim; -+ -+ spin_lock(&j->lock); -+ -+ if (seq < journal_last_seq(j)) { -+ /* -+ * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on -+ * the src pin - with the pin dropped, the entry to pin might no -+ * longer to exist, but that means there's no longer anything to -+ * copy and we can bail out here: -+ */ -+ spin_unlock(&j->lock); -+ return; -+ } -+ -+ pin_list = journal_seq_pin(j, seq); -+ -+ reclaim = __journal_pin_drop(j, pin); -+ -+ atomic_inc(&pin_list->count); -+ pin->seq = seq; -+ pin->flush = flush_fn; -+ -+ if (flush_fn) -+ list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]); -+ else -+ list_add(&pin->list, &pin_list->flushed); -+ -+ if (reclaim) -+ bch2_journal_reclaim_fast(j); -+ spin_unlock(&j->lock); -+ -+ /* -+ * If the journal is currently full, we might want to call flush_fn -+ * immediately: -+ */ -+ journal_wake(j); -+} -+ -+/** -+ * bch2_journal_pin_flush: ensure journal pin callback is no longer running -+ */ -+void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) -+{ -+ BUG_ON(journal_pin_active(pin)); -+ -+ wait_event(j->pin_flush_wait, j->flush_in_progress != pin); -+} -+ -+/* -+ * Journal reclaim: flush references to open journal entries to reclaim space in -+ * the journal -+ * -+ * May be done by the journal code in the background as needed to free up space -+ * for more journal entries, or as part of doing a clean shutdown, or to migrate -+ * data off of a specific device: -+ */ -+ -+static struct journal_entry_pin * -+journal_get_next_pin(struct journal *j, -+ u64 seq_to_flush, -+ unsigned allowed_below_seq, -+ unsigned allowed_above_seq, -+ u64 *seq) -+{ -+ struct journal_entry_pin_list *pin_list; -+ struct journal_entry_pin *ret = NULL; -+ unsigned i; -+ -+ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) { -+ if (*seq > seq_to_flush && !allowed_above_seq) -+ break; -+ -+ for (i = 0; i < JOURNAL_PIN_NR; i++) -+ if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || -+ ((1U << i) & allowed_above_seq)) { -+ ret = list_first_entry_or_null(&pin_list->list[i], -+ struct journal_entry_pin, list); -+ if (ret) -+ return ret; -+ } -+ } -+ -+ return NULL; -+} -+ -+/* returns true if we did work */ -+static size_t journal_flush_pins(struct journal *j, -+ u64 seq_to_flush, -+ unsigned allowed_below_seq, -+ unsigned allowed_above_seq, -+ unsigned min_any, -+ unsigned min_key_cache) -+{ -+ struct journal_entry_pin *pin; -+ size_t nr_flushed = 0; -+ journal_pin_flush_fn flush_fn; -+ u64 seq; -+ int err; -+ -+ lockdep_assert_held(&j->reclaim_lock); -+ -+ while (1) { -+ unsigned allowed_above = allowed_above_seq; -+ unsigned allowed_below = allowed_below_seq; -+ -+ if (min_any) { -+ allowed_above |= ~0; -+ allowed_below |= ~0; -+ } -+ -+ if (min_key_cache) { -+ allowed_above |= 1U << JOURNAL_PIN_key_cache; -+ allowed_below |= 1U << JOURNAL_PIN_key_cache; -+ } -+ -+ cond_resched(); -+ -+ j->last_flushed = jiffies; -+ -+ spin_lock(&j->lock); -+ pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); -+ if (pin) { -+ BUG_ON(j->flush_in_progress); -+ j->flush_in_progress = pin; -+ j->flush_in_progress_dropped = false; -+ flush_fn = pin->flush; -+ } -+ spin_unlock(&j->lock); -+ -+ if (!pin) -+ break; -+ -+ 
if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) -+ min_key_cache--; -+ -+ if (min_any) -+ min_any--; -+ -+ err = flush_fn(j, pin, seq); -+ -+ spin_lock(&j->lock); -+ /* Pin might have been dropped or rearmed: */ -+ if (likely(!err && !j->flush_in_progress_dropped)) -+ list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); -+ j->flush_in_progress = NULL; -+ j->flush_in_progress_dropped = false; -+ spin_unlock(&j->lock); -+ -+ wake_up(&j->pin_flush_wait); -+ -+ if (err) -+ break; -+ -+ nr_flushed++; -+ } -+ -+ return nr_flushed; -+} -+ -+static u64 journal_seq_to_flush(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ u64 seq_to_flush = 0; -+ unsigned iter; -+ -+ spin_lock(&j->lock); -+ -+ for_each_rw_member(ca, c, iter) { -+ struct journal_device *ja = &ca->journal; -+ unsigned nr_buckets, bucket_to_flush; -+ -+ if (!ja->nr) -+ continue; -+ -+ /* Try to keep the journal at most half full: */ -+ nr_buckets = ja->nr / 2; -+ -+ /* And include pre-reservations: */ -+ nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, -+ (ca->mi.bucket_size << 6) - -+ journal_entry_overhead(j)); -+ -+ nr_buckets = min(nr_buckets, ja->nr); -+ -+ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; -+ seq_to_flush = max(seq_to_flush, -+ ja->bucket_seq[bucket_to_flush]); -+ } -+ -+ /* Also flush if the pin fifo is more than half full */ -+ seq_to_flush = max_t(s64, seq_to_flush, -+ (s64) journal_cur_seq(j) - -+ (j->pin.size >> 1)); -+ spin_unlock(&j->lock); -+ -+ return seq_to_flush; -+} -+ -+/** -+ * bch2_journal_reclaim - free up journal buckets -+ * -+ * Background journal reclaim writes out btree nodes. It should be run -+ * early enough so that we never completely run out of journal buckets. -+ * -+ * High watermarks for triggering background reclaim: -+ * - FIFO has fewer than 512 entries left -+ * - fewer than 25% journal buckets free -+ * -+ * Background reclaim runs until low watermarks are reached: -+ * - FIFO has more than 1024 entries left -+ * - more than 50% journal buckets free -+ * -+ * As long as a reclaim can complete in the time it takes to fill up -+ * 512 journal entries or 25% of all journal buckets, then -+ * journal_next_bucket() should not stall. 
-+ */ -+static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ u64 seq_to_flush; -+ size_t min_nr, min_key_cache, nr_flushed; -+ unsigned flags; -+ int ret = 0; -+ -+ /* -+ * We can't invoke memory reclaim while holding the reclaim_lock - -+ * journal reclaim is required to make progress for memory reclaim -+ * (cleaning the caches), so we can't get stuck in memory reclaim while -+ * we're holding the reclaim lock: -+ */ -+ lockdep_assert_held(&j->reclaim_lock); -+ flags = memalloc_noreclaim_save(); -+ -+ do { -+ if (kthread && kthread_should_stop()) -+ break; -+ -+ if (bch2_journal_error(j)) { -+ ret = -EIO; -+ break; -+ } -+ -+ bch2_journal_do_discards(j); -+ -+ seq_to_flush = journal_seq_to_flush(j); -+ min_nr = 0; -+ -+ /* -+ * If it's been longer than j->reclaim_delay_ms since we last flushed, -+ * make sure to flush at least one journal pin: -+ */ -+ if (time_after(jiffies, j->last_flushed + -+ msecs_to_jiffies(c->opts.journal_reclaim_delay))) -+ min_nr = 1; -+ -+ if (j->prereserved.reserved * 4 > j->prereserved.remaining) -+ min_nr = 1; -+ -+ if (fifo_free(&j->pin) <= 32) -+ min_nr = 1; -+ -+ if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) -+ min_nr = 1; -+ -+ min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); -+ -+ trace_and_count(c, journal_reclaim_start, c, -+ direct, kicked, -+ min_nr, min_key_cache, -+ j->prereserved.reserved, -+ j->prereserved.remaining, -+ atomic_read(&c->btree_cache.dirty), -+ c->btree_cache.used, -+ atomic_long_read(&c->btree_key_cache.nr_dirty), -+ atomic_long_read(&c->btree_key_cache.nr_keys)); -+ -+ nr_flushed = journal_flush_pins(j, seq_to_flush, -+ ~0, 0, -+ min_nr, min_key_cache); -+ -+ if (direct) -+ j->nr_direct_reclaim += nr_flushed; -+ else -+ j->nr_background_reclaim += nr_flushed; -+ trace_and_count(c, journal_reclaim_finish, c, nr_flushed); -+ -+ if (nr_flushed) -+ wake_up(&j->reclaim_wait); -+ } while ((min_nr || min_key_cache) && nr_flushed && !direct); -+ -+ memalloc_noreclaim_restore(flags); -+ -+ return ret; -+} -+ -+int bch2_journal_reclaim(struct journal *j) -+{ -+ return __bch2_journal_reclaim(j, true, true); -+} -+ -+static int bch2_journal_reclaim_thread(void *arg) -+{ -+ struct journal *j = arg; -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ unsigned long delay, now; -+ bool journal_empty; -+ int ret = 0; -+ -+ set_freezable(); -+ -+ j->last_flushed = jiffies; -+ -+ while (!ret && !kthread_should_stop()) { -+ bool kicked = j->reclaim_kicked; -+ -+ j->reclaim_kicked = false; -+ -+ mutex_lock(&j->reclaim_lock); -+ ret = __bch2_journal_reclaim(j, false, kicked); -+ mutex_unlock(&j->reclaim_lock); -+ -+ now = jiffies; -+ delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); -+ j->next_reclaim = j->last_flushed + delay; -+ -+ if (!time_in_range(j->next_reclaim, now, now + delay)) -+ j->next_reclaim = now + delay; -+ -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); -+ if (kthread_should_stop()) -+ break; -+ if (j->reclaim_kicked) -+ break; -+ -+ spin_lock(&j->lock); -+ journal_empty = fifo_empty(&j->pin); -+ spin_unlock(&j->lock); -+ -+ if (journal_empty) -+ schedule(); -+ else if (time_after(j->next_reclaim, jiffies)) -+ schedule_timeout(j->next_reclaim - jiffies); -+ else -+ break; -+ } -+ __set_current_state(TASK_RUNNING); -+ } -+ -+ return 0; -+} -+ -+void bch2_journal_reclaim_stop(struct journal *j) -+{ -+ 
struct task_struct *p = j->reclaim_thread; -+ -+ j->reclaim_thread = NULL; -+ -+ if (p) { -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+int bch2_journal_reclaim_start(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct task_struct *p; -+ int ret; -+ -+ if (j->reclaim_thread) -+ return 0; -+ -+ p = kthread_create(bch2_journal_reclaim_thread, j, -+ "bch-reclaim/%s", c->name); -+ ret = PTR_ERR_OR_ZERO(p); -+ if (ret) { -+ bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret)); -+ return ret; -+ } -+ -+ get_task_struct(p); -+ j->reclaim_thread = p; -+ wake_up_process(p); -+ return 0; -+} -+ -+static int journal_flush_done(struct journal *j, u64 seq_to_flush, -+ bool *did_work) -+{ -+ int ret; -+ -+ ret = bch2_journal_error(j); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&j->reclaim_lock); -+ -+ if (journal_flush_pins(j, seq_to_flush, -+ (1U << JOURNAL_PIN_key_cache)| -+ (1U << JOURNAL_PIN_other), 0, 0, 0) || -+ journal_flush_pins(j, seq_to_flush, -+ (1U << JOURNAL_PIN_btree), 0, 0, 0)) -+ *did_work = true; -+ -+ spin_lock(&j->lock); -+ /* -+ * If journal replay hasn't completed, the unreplayed journal entries -+ * hold refs on their corresponding sequence numbers -+ */ -+ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || -+ journal_last_seq(j) > seq_to_flush || -+ !fifo_used(&j->pin); -+ -+ spin_unlock(&j->lock); -+ mutex_unlock(&j->reclaim_lock); -+ -+ return ret; -+} -+ -+bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) -+{ -+ bool did_work = false; -+ -+ if (!test_bit(JOURNAL_STARTED, &j->flags)) -+ return false; -+ -+ closure_wait_event(&j->async_wait, -+ journal_flush_done(j, seq_to_flush, &did_work)); -+ -+ return did_work; -+} -+ -+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_entry_pin_list *p; -+ u64 iter, seq = 0; -+ int ret = 0; -+ -+ spin_lock(&j->lock); -+ fifo_for_each_entry_ptr(p, &j->pin, iter) -+ if (dev_idx >= 0 -+ ? bch2_dev_list_has_dev(p->devs, dev_idx) -+ : p->devs.nr < c->opts.metadata_replicas) -+ seq = iter; -+ spin_unlock(&j->lock); -+ -+ bch2_journal_flush_pins(j, seq); -+ -+ ret = bch2_journal_error(j); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&c->replicas_gc_lock); -+ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); -+ -+ /* -+ * Now that we've populated replicas_gc, write to the journal to mark -+ * active journal devices. This handles the case where the journal might -+ * be empty. Otherwise we could clear all journal replicas and -+ * temporarily put the fs into an unrecoverable state. Journal recovery -+ * expects to find devices marked for journal data on unclean mount. 
-+ */ -+ ret = bch2_journal_meta(&c->journal); -+ if (ret) -+ goto err; -+ -+ seq = 0; -+ spin_lock(&j->lock); -+ while (!ret) { -+ struct bch_replicas_padded replicas; -+ -+ seq = max(seq, journal_last_seq(j)); -+ if (seq >= j->pin.back) -+ break; -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, -+ journal_seq_pin(j, seq)->devs); -+ seq++; -+ -+ spin_unlock(&j->lock); -+ ret = bch2_mark_replicas(c, &replicas.e); -+ spin_lock(&j->lock); -+ } -+ spin_unlock(&j->lock); -+err: -+ ret = bch2_replicas_gc_end(c, ret); -+ mutex_unlock(&c->replicas_gc_lock); -+ -+ return ret; -+} -diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h -new file mode 100644 -index 000000000..0fd1af120 ---- /dev/null -+++ b/fs/bcachefs/journal_reclaim.h -@@ -0,0 +1,86 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H -+#define _BCACHEFS_JOURNAL_RECLAIM_H -+ -+#define JOURNAL_PIN (32 * 1024) -+ -+static inline void journal_reclaim_kick(struct journal *j) -+{ -+ struct task_struct *p = READ_ONCE(j->reclaim_thread); -+ -+ j->reclaim_kicked = true; -+ if (p) -+ wake_up_process(p); -+} -+ -+unsigned bch2_journal_dev_buckets_available(struct journal *, -+ struct journal_device *, -+ enum journal_space_from); -+void bch2_journal_space_available(struct journal *); -+ -+static inline bool journal_pin_active(struct journal_entry_pin *pin) -+{ -+ return pin->seq != 0; -+} -+ -+static inline struct journal_entry_pin_list * -+journal_seq_pin(struct journal *j, u64 seq) -+{ -+ EBUG_ON(seq < j->pin.front || seq >= j->pin.back); -+ -+ return &j->pin.data[seq & j->pin.mask]; -+} -+ -+void __bch2_journal_pin_put(struct journal *, u64); -+void bch2_journal_pin_put(struct journal *, u64); -+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -+ -+void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, -+ journal_pin_flush_fn); -+ -+static inline void bch2_journal_pin_add(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) -+ bch2_journal_pin_set(j, seq, pin, flush_fn); -+} -+ -+static inline void bch2_journal_pin_copy(struct journal *j, -+ struct journal_entry_pin *dst, -+ struct journal_entry_pin *src, -+ journal_pin_flush_fn flush_fn) -+{ -+ /* Guard against racing with journal_pin_drop(src): */ -+ u64 seq = READ_ONCE(src->seq); -+ -+ if (seq) -+ bch2_journal_pin_add(j, seq, dst, flush_fn); -+} -+ -+static inline void bch2_journal_pin_update(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) -+ bch2_journal_pin_set(j, seq, pin, flush_fn); -+} -+ -+void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); -+ -+void bch2_journal_do_discards(struct journal *); -+int bch2_journal_reclaim(struct journal *); -+ -+void bch2_journal_reclaim_stop(struct journal *); -+int bch2_journal_reclaim_start(struct journal *); -+ -+bool bch2_journal_flush_pins(struct journal *, u64); -+ -+static inline bool bch2_journal_flush_all_pins(struct journal *j) -+{ -+ return bch2_journal_flush_pins(j, U64_MAX); -+} -+ -+int bch2_journal_flush_device_pins(struct journal *, int); -+ -+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ -diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c -new file mode 100644 -index 000000000..cc41bff86 ---- /dev/null -+++ b/fs/bcachefs/journal_sb.c -@@ -0,0 +1,219 @@ -+// 
SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "journal_sb.h" -+#include "darray.h" -+ -+#include -+ -+/* BCH_SB_FIELD_journal: */ -+ -+static int u64_cmp(const void *_l, const void *_r) -+{ -+ const u64 *l = _l; -+ const u64 *r = _r; -+ -+ return cmp_int(*l, *r); -+} -+ -+static int bch2_sb_journal_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_journal *journal = field_to_type(f, journal); -+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; -+ int ret = -BCH_ERR_invalid_sb_journal; -+ unsigned nr; -+ unsigned i; -+ u64 *b; -+ -+ nr = bch2_nr_journal_buckets(journal); -+ if (!nr) -+ return 0; -+ -+ b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); -+ if (!b) -+ return -BCH_ERR_ENOMEM_sb_journal_validate; -+ -+ for (i = 0; i < nr; i++) -+ b[i] = le64_to_cpu(journal->buckets[i]); -+ -+ sort(b, nr, sizeof(u64), u64_cmp, NULL); -+ -+ if (!b[0]) { -+ prt_printf(err, "journal bucket at sector 0"); -+ goto err; -+ } -+ -+ if (b[0] < le16_to_cpu(m->first_bucket)) { -+ prt_printf(err, "journal bucket %llu before first bucket %u", -+ b[0], le16_to_cpu(m->first_bucket)); -+ goto err; -+ } -+ -+ if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { -+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", -+ b[nr - 1], le64_to_cpu(m->nbuckets)); -+ goto err; -+ } -+ -+ for (i = 0; i + 1 < nr; i++) -+ if (b[i] == b[i + 1]) { -+ prt_printf(err, "duplicate journal buckets %llu", b[i]); -+ goto err; -+ } -+ -+ ret = 0; -+err: -+ kfree(b); -+ return ret; -+} -+ -+static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal *journal = field_to_type(f, journal); -+ unsigned i, nr = bch2_nr_journal_buckets(journal); -+ -+ prt_printf(out, "Buckets: "); -+ for (i = 0; i < nr; i++) -+ prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i])); -+ prt_newline(out); -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_journal = { -+ .validate = bch2_sb_journal_validate, -+ .to_text = bch2_sb_journal_to_text, -+}; -+ -+struct u64_range { -+ u64 start; -+ u64 end; -+}; -+ -+static int u64_range_cmp(const void *_l, const void *_r) -+{ -+ const struct u64_range *l = _l; -+ const struct u64_range *r = _r; -+ -+ return cmp_int(l->start, r->start); -+} -+ -+static int bch2_sb_journal_v2_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); -+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; -+ int ret = -BCH_ERR_invalid_sb_journal; -+ unsigned nr; -+ unsigned i; -+ struct u64_range *b; -+ -+ nr = bch2_sb_field_journal_v2_nr_entries(journal); -+ if (!nr) -+ return 0; -+ -+ b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); -+ if (!b) -+ return -BCH_ERR_ENOMEM_sb_journal_v2_validate; -+ -+ for (i = 0; i < nr; i++) { -+ b[i].start = le64_to_cpu(journal->d[i].start); -+ b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); -+ } -+ -+ sort(b, nr, sizeof(*b), u64_range_cmp, NULL); -+ -+ if (!b[0].start) { -+ prt_printf(err, "journal bucket at sector 0"); -+ goto err; -+ } -+ -+ if (b[0].start < le16_to_cpu(m->first_bucket)) { -+ prt_printf(err, "journal bucket %llu before first bucket %u", -+ b[0].start, le16_to_cpu(m->first_bucket)); -+ goto err; -+ } -+ -+ if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { -+ prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", -+ b[nr - 1].end - 1, 
le64_to_cpu(m->nbuckets)); -+ goto err; -+ } -+ -+ for (i = 0; i + 1 < nr; i++) { -+ if (b[i].end > b[i + 1].start) { -+ prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", -+ b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); -+ goto err; -+ } -+ } -+ -+ ret = 0; -+err: -+ kfree(b); -+ return ret; -+} -+ -+static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); -+ unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); -+ -+ prt_printf(out, "Buckets: "); -+ for (i = 0; i < nr; i++) -+ prt_printf(out, " %llu-%llu", -+ le64_to_cpu(journal->d[i].start), -+ le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); -+ prt_newline(out); -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { -+ .validate = bch2_sb_journal_v2_validate, -+ .to_text = bch2_sb_journal_v2_to_text, -+}; -+ -+int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, -+ u64 *buckets, unsigned nr) -+{ -+ struct bch_sb_field_journal_v2 *j; -+ unsigned i, dst = 0, nr_compacted = 1; -+ -+ if (c) -+ lockdep_assert_held(&c->sb_lock); -+ -+ if (!nr) { -+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); -+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); -+ return 0; -+ } -+ -+ for (i = 0; i + 1 < nr; i++) -+ if (buckets[i] + 1 != buckets[i + 1]) -+ nr_compacted++; -+ -+ j = bch2_sb_resize_journal_v2(&ca->disk_sb, -+ (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); -+ if (!j) -+ return -BCH_ERR_ENOSPC_sb_journal; -+ -+ bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); -+ -+ j->d[dst].start = cpu_to_le64(buckets[0]); -+ j->d[dst].nr = cpu_to_le64(1); -+ -+ for (i = 1; i < nr; i++) { -+ if (buckets[i] == buckets[i - 1] + 1) { -+ le64_add_cpu(&j->d[dst].nr, 1); -+ } else { -+ dst++; -+ j->d[dst].start = cpu_to_le64(buckets[i]); -+ j->d[dst].nr = cpu_to_le64(1); -+ } -+ } -+ -+ BUG_ON(dst + 1 != nr_compacted); -+ return 0; -+} -diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h -new file mode 100644 -index 000000000..ba40a7e8d ---- /dev/null -+++ b/fs/bcachefs/journal_sb.h -@@ -0,0 +1,24 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#include "super-io.h" -+#include "vstructs.h" -+ -+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -+{ -+ return j -+ ? 
(__le64 *) vstruct_end(&j->field) - j->buckets -+ : 0; -+} -+ -+static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) -+{ -+ if (!j) -+ return 0; -+ -+ return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; -+} -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_journal; -+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; -+ -+int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned); -diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c -new file mode 100644 -index 000000000..d6b9f2cdf ---- /dev/null -+++ b/fs/bcachefs/journal_seq_blacklist.c -@@ -0,0 +1,322 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_iter.h" -+#include "eytzinger.h" -+#include "journal_seq_blacklist.h" -+#include "super-io.h" -+ -+/* -+ * journal_seq_blacklist machinery: -+ * -+ * To guarantee order of btree updates after a crash, we need to detect when a -+ * btree node entry (bset) is newer than the newest journal entry that was -+ * successfully written, and ignore it - effectively ignoring any btree updates -+ * that didn't make it into the journal. -+ * -+ * If we didn't do this, we might have two btree nodes, a and b, both with -+ * updates that weren't written to the journal yet: if b was updated after a, -+ * but b was flushed and not a - oops; on recovery we'll find that the updates -+ * to b happened, but not the updates to a that happened before it. -+ * -+ * Ignoring bsets that are newer than the newest journal entry is always safe, -+ * because everything they contain will also have been journalled - and must -+ * still be present in the journal on disk until a journal entry has been -+ * written _after_ that bset was written. -+ * -+ * To accomplish this, bsets record the newest journal sequence number they -+ * contain updates for; then, on startup, the btree code queries the journal -+ * code to ask "Is this sequence number newer than the newest journal entry? If -+ * so, ignore it." -+ * -+ * When this happens, we must blacklist that journal sequence number: the -+ * journal must not write any entries with that sequence number, and it must -+ * record that it was blacklisted so that a) on recovery we don't think we have -+ * missing journal entries and b) so that the btree code continues to ignore -+ * that bset, until that btree node is rewritten. 
-+ */ -+ -+static unsigned sb_blacklist_u64s(unsigned nr) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ -+ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); -+} -+ -+static struct bch_sb_field_journal_seq_blacklist * -+blacklist_entry_try_merge(struct bch_fs *c, -+ struct bch_sb_field_journal_seq_blacklist *bl, -+ unsigned i) -+{ -+ unsigned nr = blacklist_nr_entries(bl); -+ -+ if (le64_to_cpu(bl->start[i].end) >= -+ le64_to_cpu(bl->start[i + 1].start)) { -+ bl->start[i].end = bl->start[i + 1].end; -+ --nr; -+ memmove(&bl->start[i], -+ &bl->start[i + 1], -+ sizeof(bl->start[0]) * (nr - i)); -+ -+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ sb_blacklist_u64s(nr)); -+ BUG_ON(!bl); -+ } -+ -+ return bl; -+} -+ -+static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, -+ u64 start, u64 end) -+{ -+ return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); -+} -+ -+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ unsigned i, nr; -+ int ret = 0; -+ -+ mutex_lock(&c->sb_lock); -+ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ nr = blacklist_nr_entries(bl); -+ -+ for (i = 0; i < nr; i++) { -+ struct journal_seq_blacklist_entry *e = -+ bl->start + i; -+ -+ if (bl_entry_contig_or_overlaps(e, start, end)) { -+ e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); -+ e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); -+ -+ if (i + 1 < nr) -+ bl = blacklist_entry_try_merge(c, -+ bl, i); -+ if (i) -+ bl = blacklist_entry_try_merge(c, -+ bl, i - 1); -+ goto out_write_sb; -+ } -+ } -+ -+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ sb_blacklist_u64s(nr + 1)); -+ if (!bl) { -+ ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; -+ goto out; -+ } -+ -+ bl->start[nr].start = cpu_to_le64(start); -+ bl->start[nr].end = cpu_to_le64(end); -+out_write_sb: -+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); -+ -+ ret = bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ return ret ?: bch2_blacklist_table_initialize(c); -+} -+ -+static int journal_seq_blacklist_table_cmp(const void *_l, -+ const void *_r, size_t size) -+{ -+ const struct journal_seq_blacklist_table_entry *l = _l; -+ const struct journal_seq_blacklist_table_entry *r = _r; -+ -+ return cmp_int(l->start, r->start); -+} -+ -+bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, -+ bool dirty) -+{ -+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; -+ struct journal_seq_blacklist_table_entry search = { .start = seq }; -+ int idx; -+ -+ if (!t) -+ return false; -+ -+ idx = eytzinger0_find_le(t->entries, t->nr, -+ sizeof(t->entries[0]), -+ journal_seq_blacklist_table_cmp, -+ &search); -+ if (idx < 0) -+ return false; -+ -+ BUG_ON(t->entries[idx].start > seq); -+ -+ if (seq >= t->entries[idx].end) -+ return false; -+ -+ if (dirty) -+ t->entries[idx].dirty = true; -+ return true; -+} -+ -+int bch2_blacklist_table_initialize(struct bch_fs *c) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ struct journal_seq_blacklist_table *t; -+ unsigned i, nr = blacklist_nr_entries(bl); -+ -+ if (!bl) -+ return 0; -+ -+ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, -+ GFP_KERNEL); -+ if (!t) -+ return -BCH_ERR_ENOMEM_blacklist_table_init; -+ -+ t->nr = nr; -+ -+ for (i = 0; i < nr; i++) { -+ t->entries[i].start = 
le64_to_cpu(bl->start[i].start); -+ t->entries[i].end = le64_to_cpu(bl->start[i].end); -+ } -+ -+ eytzinger0_sort(t->entries, -+ t->nr, -+ sizeof(t->entries[0]), -+ journal_seq_blacklist_table_cmp, -+ NULL); -+ -+ kfree(c->journal_seq_blacklist_table); -+ c->journal_seq_blacklist_table = t; -+ return 0; -+} -+ -+static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ field_to_type(f, journal_seq_blacklist); -+ unsigned i, nr = blacklist_nr_entries(bl); -+ -+ for (i = 0; i < nr; i++) { -+ struct journal_seq_blacklist_entry *e = bl->start + i; -+ -+ if (le64_to_cpu(e->start) >= -+ le64_to_cpu(e->end)) { -+ prt_printf(err, "entry %u start >= end (%llu >= %llu)", -+ i, le64_to_cpu(e->start), le64_to_cpu(e->end)); -+ return -BCH_ERR_invalid_sb_journal_seq_blacklist; -+ } -+ -+ if (i + 1 < nr && -+ le64_to_cpu(e[0].end) > -+ le64_to_cpu(e[1].start)) { -+ prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", -+ i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); -+ return -BCH_ERR_invalid_sb_journal_seq_blacklist; -+ } -+ } -+ -+ return 0; -+} -+ -+static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ field_to_type(f, journal_seq_blacklist); -+ struct journal_seq_blacklist_entry *i; -+ unsigned nr = blacklist_nr_entries(bl); -+ -+ for (i = bl->start; i < bl->start + nr; i++) { -+ if (i != bl->start) -+ prt_printf(out, " "); -+ -+ prt_printf(out, "%llu-%llu", -+ le64_to_cpu(i->start), -+ le64_to_cpu(i->end)); -+ } -+ prt_newline(out); -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { -+ .validate = bch2_sb_journal_seq_blacklist_validate, -+ .to_text = bch2_sb_journal_seq_blacklist_to_text -+}; -+ -+void bch2_blacklist_entries_gc(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, -+ journal_seq_blacklist_gc_work); -+ struct journal_seq_blacklist_table *t; -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ struct journal_seq_blacklist_entry *src, *dst; -+ struct btree_trans trans; -+ unsigned i, nr, new_nr; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct btree_iter iter; -+ struct btree *b; -+ -+ bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, -+ 0, 0, BTREE_ITER_PREFETCH); -+retry: -+ bch2_trans_begin(&trans); -+ -+ b = bch2_btree_iter_peek_node(&iter); -+ -+ while (!(ret = PTR_ERR_OR_ZERO(b)) && -+ b && -+ !test_bit(BCH_FS_STOPPING, &c->flags)) -+ b = bch2_btree_iter_next_node(&iter); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ } -+ -+ bch2_trans_exit(&trans); -+ if (ret) -+ return; -+ -+ mutex_lock(&c->sb_lock); -+ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ if (!bl) -+ goto out; -+ -+ nr = blacklist_nr_entries(bl); -+ dst = bl->start; -+ -+ t = c->journal_seq_blacklist_table; -+ BUG_ON(nr != t->nr); -+ -+ for (src = bl->start, i = eytzinger0_first(t->nr); -+ src < bl->start + nr; -+ src++, i = eytzinger0_next(i, nr)) { -+ BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); -+ BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); -+ -+ if (t->entries[i].dirty) -+ *dst++ = *src; -+ } -+ -+ new_nr = dst - bl->start; -+ -+ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); -+ -+ if (new_nr != nr) { -+ 
bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ new_nr ? sb_blacklist_u64s(new_nr) : 0); -+ BUG_ON(new_nr && !bl); -+ -+ if (!new_nr) -+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); -+ -+ bch2_write_super(c); -+ } -+out: -+ mutex_unlock(&c->sb_lock); -+} -diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h -new file mode 100644 -index 000000000..afb886ec8 ---- /dev/null -+++ b/fs/bcachefs/journal_seq_blacklist.h -@@ -0,0 +1,22 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -+ -+static inline unsigned -+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) -+{ -+ return bl -+ ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / -+ sizeof(struct journal_seq_blacklist_entry)) -+ : 0; -+} -+ -+bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); -+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); -+int bch2_blacklist_table_initialize(struct bch_fs *); -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; -+ -+void bch2_blacklist_entries_gc(struct work_struct *); -+ -+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ -diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h -new file mode 100644 -index 000000000..42504e16a ---- /dev/null -+++ b/fs/bcachefs/journal_types.h -@@ -0,0 +1,345 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_TYPES_H -+#define _BCACHEFS_JOURNAL_TYPES_H -+ -+#include -+#include -+ -+#include "alloc_types.h" -+#include "super_types.h" -+#include "fifo.h" -+ -+#define JOURNAL_BUF_BITS 2 -+#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) -+#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) -+ -+/* -+ * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to -+ * the journal that are being staged or in flight. -+ */ -+struct journal_buf { -+ struct jset *data; -+ -+ __BKEY_PADDED(key, BCH_REPLICAS_MAX); -+ struct bch_devs_list devs_written; -+ -+ struct closure_waitlist wait; -+ u64 last_seq; /* copy of data->last_seq */ -+ long expires; -+ u64 flush_time; -+ -+ unsigned buf_size; /* size in bytes of @data */ -+ unsigned sectors; /* maximum size for current entry */ -+ unsigned disk_sectors; /* maximum size entry could have been, if -+ buf_size was bigger */ -+ unsigned u64s_reserved; -+ bool noflush; /* write has already been kicked off, and was noflush */ -+ bool must_flush; /* something wants a flush */ -+ bool separate_flush; -+}; -+ -+/* -+ * Something that makes a journal entry dirty - i.e. 
a btree node that has to be -+ * flushed: -+ */ -+ -+enum journal_pin_type { -+ JOURNAL_PIN_btree, -+ JOURNAL_PIN_key_cache, -+ JOURNAL_PIN_other, -+ JOURNAL_PIN_NR, -+}; -+ -+struct journal_entry_pin_list { -+ struct list_head list[JOURNAL_PIN_NR]; -+ struct list_head flushed; -+ atomic_t count; -+ struct bch_devs_list devs; -+}; -+ -+struct journal; -+struct journal_entry_pin; -+typedef int (*journal_pin_flush_fn)(struct journal *j, -+ struct journal_entry_pin *, u64); -+ -+struct journal_entry_pin { -+ struct list_head list; -+ journal_pin_flush_fn flush; -+ u64 seq; -+}; -+ -+struct journal_res { -+ bool ref; -+ u8 idx; -+ u16 u64s; -+ u32 offset; -+ u64 seq; -+}; -+ -+/* -+ * For reserving space in the journal prior to getting a reservation on a -+ * particular journal entry: -+ */ -+struct journal_preres { -+ unsigned u64s; -+}; -+ -+union journal_res_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ u64 cur_entry_offset:20, -+ idx:2, -+ unwritten_idx:2, -+ buf0_count:10, -+ buf1_count:10, -+ buf2_count:10, -+ buf3_count:10; -+ }; -+}; -+ -+union journal_preres_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ u64 waiting:1, -+ reserved:31, -+ remaining:32; -+ }; -+}; -+ -+/* bytes: */ -+#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ -+#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ -+ -+/* -+ * We stash some journal state as sentinal values in cur_entry_offset: -+ * note - cur_entry_offset is in units of u64s -+ */ -+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) -+ -+#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) -+#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) -+ -+struct journal_space { -+ /* Units of 512 bytes sectors: */ -+ unsigned next_entry; /* How big the next journal entry can be */ -+ unsigned total; -+}; -+ -+enum journal_space_from { -+ journal_space_discarded, -+ journal_space_clean_ondisk, -+ journal_space_clean, -+ journal_space_total, -+ journal_space_nr, -+}; -+ -+enum journal_flags { -+ JOURNAL_REPLAY_DONE, -+ JOURNAL_STARTED, -+ JOURNAL_MAY_SKIP_FLUSH, -+ JOURNAL_NEED_FLUSH_WRITE, -+}; -+ -+/* Reasons we may fail to get a journal reservation: */ -+#define JOURNAL_ERRORS() \ -+ x(ok) \ -+ x(blocked) \ -+ x(max_in_flight) \ -+ x(journal_full) \ -+ x(journal_pin_full) \ -+ x(journal_stuck) \ -+ x(insufficient_devices) -+ -+enum journal_errors { -+#define x(n) JOURNAL_ERR_##n, -+ JOURNAL_ERRORS() -+#undef x -+}; -+ -+typedef DARRAY(u64) darray_u64; -+ -+/* Embedded in struct bch_fs */ -+struct journal { -+ /* Fastpath stuff up front: */ -+ struct { -+ -+ union journal_res_state reservations; -+ enum bch_watermark watermark; -+ -+ union journal_preres_state prereserved; -+ -+ } __aligned(SMP_CACHE_BYTES); -+ -+ unsigned long flags; -+ -+ /* Max size of current journal entry */ -+ unsigned cur_entry_u64s; -+ unsigned cur_entry_sectors; -+ -+ /* Reserved space in journal entry to be used just prior to write */ -+ unsigned entry_u64s_reserved; -+ -+ -+ /* -+ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if -+ * insufficient devices: -+ */ -+ enum journal_errors cur_entry_error; -+ -+ unsigned buf_size_want; -+ /* -+ * We may queue up some things to be journalled (log messages) before -+ * the journal has actually started - stash them here: -+ */ -+ darray_u64 early_journal_entries; -+ -+ /* -+ * Two journal entries -- one is currently open for new entries, the -+ * other is possibly being written out. 
-+ */ -+ struct journal_buf buf[JOURNAL_BUF_NR]; -+ -+ spinlock_t lock; -+ -+ /* if nonzero, we may not open a new journal entry: */ -+ unsigned blocked; -+ -+ /* Used when waiting because the journal was full */ -+ wait_queue_head_t wait; -+ struct closure_waitlist async_wait; -+ struct closure_waitlist preres_wait; -+ -+ struct closure io; -+ struct delayed_work write_work; -+ -+ /* Sequence number of most recent journal entry (last entry in @pin) */ -+ atomic64_t seq; -+ -+ /* seq, last_seq from the most recent journal entry successfully written */ -+ u64 seq_ondisk; -+ u64 flushed_seq_ondisk; -+ u64 last_seq_ondisk; -+ u64 err_seq; -+ u64 last_empty_seq; -+ -+ /* -+ * FIFO of journal entries whose btree updates have not yet been -+ * written out. -+ * -+ * Each entry is a reference count. The position in the FIFO is the -+ * entry's sequence number relative to @seq. -+ * -+ * The journal entry itself holds a reference count, put when the -+ * journal entry is written out. Each btree node modified by the journal -+ * entry also holds a reference count, put when the btree node is -+ * written. -+ * -+ * When a reference count reaches zero, the journal entry is no longer -+ * needed. When all journal entries in the oldest journal bucket are no -+ * longer needed, the bucket can be discarded and reused. -+ */ -+ struct { -+ u64 front, back, size, mask; -+ struct journal_entry_pin_list *data; -+ } pin; -+ -+ struct journal_space space[journal_space_nr]; -+ -+ u64 replay_journal_seq; -+ u64 replay_journal_seq_end; -+ -+ struct write_point wp; -+ spinlock_t err_lock; -+ -+ struct mutex reclaim_lock; -+ /* -+ * Used for waiting until journal reclaim has freed up space in the -+ * journal: -+ */ -+ wait_queue_head_t reclaim_wait; -+ struct task_struct *reclaim_thread; -+ bool reclaim_kicked; -+ unsigned long next_reclaim; -+ u64 nr_direct_reclaim; -+ u64 nr_background_reclaim; -+ -+ unsigned long last_flushed; -+ struct journal_entry_pin *flush_in_progress; -+ bool flush_in_progress_dropped; -+ wait_queue_head_t pin_flush_wait; -+ -+ /* protects advancing ja->discard_idx: */ -+ struct mutex discard_lock; -+ bool can_discard; -+ -+ unsigned long last_flush_write; -+ -+ u64 res_get_blocked_start; -+ u64 write_start_time; -+ -+ u64 nr_flush_writes; -+ u64 nr_noflush_writes; -+ -+ struct bch2_time_stats *flush_write_time; -+ struct bch2_time_stats *noflush_write_time; -+ struct bch2_time_stats *blocked_time; -+ struct bch2_time_stats *flush_seq_time; -+ -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ struct lockdep_map res_map; -+#endif -+} __aligned(SMP_CACHE_BYTES); -+ -+/* -+ * Embedded in struct bch_dev. First three fields refer to the array of journal -+ * buckets, in bch_sb. -+ */ -+struct journal_device { -+ /* -+ * For each journal bucket, contains the max sequence number of the -+ * journal writes it contains - so we know when a bucket can be reused. 
-+ */ -+ u64 *bucket_seq; -+ -+ unsigned sectors_free; -+ -+ /* -+ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: -+ */ -+ unsigned discard_idx; /* Next bucket to discard */ -+ unsigned dirty_idx_ondisk; -+ unsigned dirty_idx; -+ unsigned cur_idx; /* Journal bucket we're currently writing to */ -+ unsigned nr; -+ -+ u64 *buckets; -+ -+ /* Bio for journal reads/writes to this device */ -+ struct bio *bio; -+ -+ /* for bch_journal_read_device */ -+ struct closure read; -+}; -+ -+/* -+ * journal_entry_res - reserve space in every journal entry: -+ */ -+struct journal_entry_res { -+ unsigned u64s; -+}; -+ -+#endif /* _BCACHEFS_JOURNAL_TYPES_H */ -diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c -new file mode 100644 -index 000000000..5699cd487 ---- /dev/null -+++ b/fs/bcachefs/keylist.c -@@ -0,0 +1,52 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey.h" -+#include "keylist.h" -+ -+int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, -+ size_t nr_inline_u64s, size_t new_u64s) -+{ -+ size_t oldsize = bch2_keylist_u64s(l); -+ size_t newsize = oldsize + new_u64s; -+ u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; -+ u64 *new_keys; -+ -+ newsize = roundup_pow_of_two(newsize); -+ -+ if (newsize <= nr_inline_u64s || -+ (old_buf && roundup_pow_of_two(oldsize) == newsize)) -+ return 0; -+ -+ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS); -+ if (!new_keys) -+ return -ENOMEM; -+ -+ if (!old_buf) -+ memcpy_u64s(new_keys, inline_u64s, oldsize); -+ -+ l->keys_p = new_keys; -+ l->top_p = new_keys + oldsize; -+ -+ return 0; -+} -+ -+void bch2_keylist_pop_front(struct keylist *l) -+{ -+ l->top_p -= bch2_keylist_front(l)->k.u64s; -+ -+ memmove_u64s_down(l->keys, -+ bkey_next(l->keys), -+ bch2_keylist_u64s(l)); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_verify_keylist_sorted(struct keylist *l) -+{ -+ struct bkey_i *k; -+ -+ for_each_keylist_key(l, k) -+ BUG_ON(bkey_next(k) != l->top && -+ bpos_ge(k->k.p, bkey_next(k)->k.p)); -+} -+#endif -diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h -new file mode 100644 -index 000000000..fe759c703 ---- /dev/null -+++ b/fs/bcachefs/keylist.h -@@ -0,0 +1,74 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_KEYLIST_H -+#define _BCACHEFS_KEYLIST_H -+ -+#include "keylist_types.h" -+ -+int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); -+void bch2_keylist_pop_front(struct keylist *); -+ -+static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) -+{ -+ l->top_p = l->keys_p = inline_keys; -+} -+ -+static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) -+{ -+ if (l->keys_p != inline_keys) -+ kfree(l->keys_p); -+} -+ -+static inline void bch2_keylist_push(struct keylist *l) -+{ -+ l->top = bkey_next(l->top); -+} -+ -+static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) -+{ -+ bkey_copy(l->top, k); -+ bch2_keylist_push(l); -+} -+ -+static inline bool bch2_keylist_empty(struct keylist *l) -+{ -+ return l->top == l->keys; -+} -+ -+static inline size_t bch2_keylist_u64s(struct keylist *l) -+{ -+ return l->top_p - l->keys_p; -+} -+ -+static inline size_t bch2_keylist_bytes(struct keylist *l) -+{ -+ return bch2_keylist_u64s(l) * sizeof(u64); -+} -+ -+static inline struct bkey_i *bch2_keylist_front(struct keylist *l) -+{ -+ return l->keys; -+} -+ -+#define for_each_keylist_key(_keylist, _k) \ -+ for (_k = (_keylist)->keys; \ -+ _k != (_keylist)->top; \ -+ _k = bkey_next(_k)) -+ -+static 
inline u64 keylist_sectors(struct keylist *keys) -+{ -+ struct bkey_i *k; -+ u64 ret = 0; -+ -+ for_each_keylist_key(keys, k) -+ ret += k->k.size; -+ -+ return ret; -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_verify_keylist_sorted(struct keylist *); -+#else -+static inline void bch2_verify_keylist_sorted(struct keylist *l) {} -+#endif -+ -+#endif /* _BCACHEFS_KEYLIST_H */ -diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h -new file mode 100644 -index 000000000..4b3ff7d8a ---- /dev/null -+++ b/fs/bcachefs/keylist_types.h -@@ -0,0 +1,16 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_KEYLIST_TYPES_H -+#define _BCACHEFS_KEYLIST_TYPES_H -+ -+struct keylist { -+ union { -+ struct bkey_i *keys; -+ u64 *keys_p; -+ }; -+ union { -+ struct bkey_i *top; -+ u64 *top_p; -+ }; -+}; -+ -+#endif /* _BCACHEFS_KEYLIST_TYPES_H */ -diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c -new file mode 100644 -index 000000000..3e8b8f2f3 ---- /dev/null -+++ b/fs/bcachefs/lru.c -@@ -0,0 +1,162 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "btree_write_buffer.h" -+#include "error.h" -+#include "lru.h" -+#include "recovery.h" -+ -+/* KEY_TYPE_lru is obsolete: */ -+int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ if (!lru_pos_time(k.k->p)) { -+ prt_printf(err, "lru entry at time=0"); -+ return -BCH_ERR_invalid_bkey; -+ -+ } -+ -+ return 0; -+} -+ -+void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct bch_lru *lru = bkey_s_c_to_lru(k).v; -+ -+ prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); -+} -+ -+void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru) -+{ -+ prt_printf(out, "%llu:%llu -> %llu:%llu", -+ lru_pos_id(lru), -+ lru_pos_time(lru), -+ u64_to_bucket(lru.offset).inode, -+ u64_to_bucket(lru.offset).offset); -+} -+ -+static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, -+ u64 dev_bucket, u64 time, bool set) -+{ -+ return time -+ ? 
bch2_btree_bit_mod(trans, BTREE_ID_lru, -+ lru_pos(lru_id, dev_bucket, time), set) -+ : 0; -+} -+ -+int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) -+{ -+ return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted); -+} -+ -+int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) -+{ -+ return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); -+} -+ -+int bch2_lru_change(struct btree_trans *trans, -+ u16 lru_id, u64 dev_bucket, -+ u64 old_time, u64 new_time) -+{ -+ if (old_time == new_time) -+ return 0; -+ -+ return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?: -+ bch2_lru_set(trans, lru_id, dev_bucket, new_time); -+} -+ -+static const char * const bch2_lru_types[] = { -+#define x(n) #n, -+ BCH_LRU_TYPES() -+#undef x -+ NULL -+}; -+ -+static int bch2_check_lru_key(struct btree_trans *trans, -+ struct btree_iter *lru_iter, -+ struct bkey_s_c lru_k, -+ struct bpos *last_flushed_pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a; -+ struct printbuf buf1 = PRINTBUF; -+ struct printbuf buf2 = PRINTBUF; -+ enum bch_lru_type type = lru_type(lru_k); -+ struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); -+ u64 idx; -+ int ret; -+ -+ if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, -+ "lru key points to nonexistent device:bucket %llu:%llu", -+ alloc_pos.inode, alloc_pos.offset)) -+ return bch2_btree_delete_at(trans, lru_iter, 0); -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ a = bch2_alloc_to_v4(k, &a_convert); -+ -+ switch (type) { -+ case BCH_LRU_read: -+ idx = alloc_lru_idx_read(*a); -+ break; -+ case BCH_LRU_fragmentation: -+ idx = a->fragmentation_lru; -+ break; -+ } -+ -+ if (lru_k.k->type != KEY_TYPE_set || -+ lru_pos_time(lru_k.k->p) != idx) { -+ if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) { -+ *last_flushed_pos = lru_k.k->p; -+ ret = bch2_btree_write_buffer_flush_sync(trans) ?: -+ -BCH_ERR_transaction_restart_write_buffer_flush; -+ goto out; -+ } -+ -+ if (c->opts.reconstruct_alloc || -+ fsck_err(c, "incorrect lru entry: lru %s time %llu\n" -+ " %s\n" -+ " for %s", -+ bch2_lru_types[type], -+ lru_pos_time(lru_k.k->p), -+ (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), -+ (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) -+ ret = bch2_btree_delete_at(trans, lru_iter, 0); -+ } -+out: -+err: -+fsck_err: -+ bch2_trans_iter_exit(trans, &iter); -+ printbuf_exit(&buf2); -+ printbuf_exit(&buf1); -+ return ret; -+} -+ -+int bch2_check_lrus(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bpos last_flushed_pos = POS_MIN; -+ int ret = 0; -+ -+ ret = bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, -+ bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos))); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+ -+} -diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h -new file mode 100644 -index 000000000..be66bf9ad ---- /dev/null -+++ b/fs/bcachefs/lru.h -@@ -0,0 +1,69 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_LRU_H -+#define _BCACHEFS_LRU_H -+ -+#define LRU_TIME_BITS 48 -+#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) -+ -+static inline u64 lru_pos_id(struct bpos pos) -+{ -+ return pos.inode >> LRU_TIME_BITS; -+} -+ -+static inline 
u64 lru_pos_time(struct bpos pos) -+{ -+ return pos.inode & ~(~0ULL << LRU_TIME_BITS); -+} -+ -+static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) -+{ -+ struct bpos pos = POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket); -+ -+ EBUG_ON(time > LRU_TIME_MAX); -+ EBUG_ON(lru_pos_id(pos) != lru_id); -+ EBUG_ON(lru_pos_time(pos) != time); -+ EBUG_ON(pos.offset != dev_bucket); -+ -+ return pos; -+} -+ -+#define BCH_LRU_TYPES() \ -+ x(read) \ -+ x(fragmentation) -+ -+enum bch_lru_type { -+#define x(n) BCH_LRU_##n, -+ BCH_LRU_TYPES() -+#undef x -+}; -+ -+#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) -+ -+static inline enum bch_lru_type lru_type(struct bkey_s_c l) -+{ -+ u16 lru_id = l.k->p.inode >> 48; -+ -+ if (lru_id == BCH_LRU_FRAGMENTATION_START) -+ return BCH_LRU_fragmentation; -+ return BCH_LRU_read; -+} -+ -+int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+void bch2_lru_pos_to_text(struct printbuf *, struct bpos); -+ -+#define bch2_bkey_ops_lru ((struct bkey_ops) { \ -+ .key_invalid = bch2_lru_invalid, \ -+ .val_to_text = bch2_lru_to_text, \ -+ .min_val_size = 8, \ -+}) -+ -+int bch2_lru_del(struct btree_trans *, u16, u64, u64); -+int bch2_lru_set(struct btree_trans *, u16, u64, u64); -+int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); -+ -+int bch2_check_lrus(struct bch_fs *); -+ -+#endif /* _BCACHEFS_LRU_H */ -diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c -new file mode 100644 -index 000000000..81c8cdbac ---- /dev/null -+++ b/fs/bcachefs/migrate.c -@@ -0,0 +1,182 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for moving data off a device. -+ */ -+ -+#include "bcachefs.h" -+#include "bkey_buf.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "errcode.h" -+#include "extents.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "migrate.h" -+#include "move.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, -+ unsigned dev_idx, int flags, bool metadata) -+{ -+ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; -+ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; -+ unsigned degraded = metadata ? 
BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; -+ unsigned nr_good; -+ -+ bch2_bkey_drop_device(k, dev_idx); -+ -+ nr_good = bch2_bkey_durability(c, k.s_c); -+ if ((!nr_good && !(flags & lost)) || -+ (nr_good < replicas && !(flags & degraded))) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ unsigned dev_idx, -+ int flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i *n; -+ int ret; -+ -+ if (!bch2_bkey_has_device_c(k, dev_idx)) -+ return 0; -+ -+ n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ return ret; -+ -+ ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); -+ if (ret) -+ return ret; -+ -+ /* -+ * If the new extent no longer has any pointers, bch2_extent_normalize() -+ * will do the appropriate thing with it (turning it into a -+ * KEY_TYPE_error key, or just a discard if it was a cached extent) -+ */ -+ bch2_extent_normalize(c, bkey_i_to_s(n)); -+ -+ /* -+ * Since we're not inserting through an extent iterator -+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), -+ * we aren't using the extent overwrite path to delete, we're -+ * just using the normal key deletion path: -+ */ -+ if (bkey_deleted(&n->k)) -+ n->k.size = 0; -+ return 0; -+} -+ -+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ enum btree_id id; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ if (!btree_type_has_ptrs(id)) -+ continue; -+ -+ ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, -+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL, -+ bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags)); -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct closure cl; -+ struct btree *b; -+ struct bkey_buf k; -+ unsigned id; -+ int ret; -+ -+ /* don't handle this yet: */ -+ if (flags & BCH_FORCE_IF_METADATA_LOST) -+ return -EINVAL; -+ -+ bch2_bkey_buf_init(&k); -+ bch2_trans_init(&trans, c, 0, 0); -+ closure_init_stack(&cl); -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, -+ BTREE_ITER_PREFETCH); -+retry: -+ ret = 0; -+ while (bch2_trans_begin(&trans), -+ (b = bch2_btree_iter_peek_node(&iter)) && -+ !(ret = PTR_ERR_OR_ZERO(b))) { -+ if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) -+ goto next; -+ -+ bch2_bkey_buf_copy(&k, c, &b->key); -+ -+ ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), -+ dev_idx, flags, true); -+ if (ret) { -+ bch_err(c, "Cannot drop device without losing data"); -+ break; -+ } -+ -+ ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, 0, false); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { -+ ret = 0; -+ continue; -+ } -+ -+ if (ret) { -+ bch_err(c, "Error updating btree node key: %s", -+ bch2_err_str(ret)); -+ break; -+ } -+next: -+ bch2_btree_iter_next_node(&iter); -+ } -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret) -+ goto err; -+ } -+ -+ bch2_btree_interior_updates_flush(c); -+ ret = 0; -+err: 
-+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&k, c); -+ -+ BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); -+ -+ return ret; -+} -+ -+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: -+ bch2_dev_metadata_drop(c, dev_idx, flags); -+} -diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h -new file mode 100644 -index 000000000..027efaa0d ---- /dev/null -+++ b/fs/bcachefs/migrate.h -@@ -0,0 +1,7 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MIGRATE_H -+#define _BCACHEFS_MIGRATE_H -+ -+int bch2_dev_data_drop(struct bch_fs *, unsigned, int); -+ -+#endif /* _BCACHEFS_MIGRATE_H */ -diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c -new file mode 100644 -index 000000000..fb76a1dac ---- /dev/null -+++ b/fs/bcachefs/move.c -@@ -0,0 +1,1162 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "backpointers.h" -+#include "bkey_buf.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_write_buffer.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "errcode.h" -+#include "error.h" -+#include "inode.h" -+#include "io.h" -+#include "journal_reclaim.h" -+#include "keylist.h" -+#include "move.h" -+#include "replicas.h" -+#include "super-io.h" -+#include "trace.h" -+ -+#include -+#include -+ -+static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (trace_move_extent_enabled()) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf, c, k); -+ trace_move_extent(c, buf.buf); -+ printbuf_exit(&buf); -+ } -+} -+ -+static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (trace_move_extent_read_enabled()) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf, c, k); -+ trace_move_extent_read(c, buf.buf); -+ printbuf_exit(&buf); -+ } -+} -+ -+static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (trace_move_extent_alloc_mem_fail_enabled()) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_bkey_val_to_text(&buf, c, k); -+ trace_move_extent_alloc_mem_fail(c, buf.buf); -+ printbuf_exit(&buf); -+ } -+} -+ -+static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) -+{ -+ mutex_lock(&c->data_progress_lock); -+ list_add(&stats->list, &c->data_progress_list); -+ mutex_unlock(&c->data_progress_lock); -+} -+ -+static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) -+{ -+ mutex_lock(&c->data_progress_lock); -+ list_del(&stats->list); -+ mutex_unlock(&c->data_progress_lock); -+} -+ -+struct moving_io { -+ struct list_head read_list; -+ struct list_head io_list; -+ struct move_bucket_in_flight *b; -+ struct closure cl; -+ bool read_completed; -+ -+ unsigned read_sectors; -+ unsigned write_sectors; -+ -+ struct bch_read_bio rbio; -+ -+ struct data_update write; -+ /* Must be last since it is variable size */ -+ struct bio_vec bi_inline_vecs[0]; -+}; -+ -+static void move_free(struct moving_io *io) -+{ -+ struct moving_context *ctxt = io->write.ctxt; -+ -+ if (io->b) -+ atomic_dec(&io->b->count); -+ -+ bch2_data_update_exit(&io->write); -+ -+ mutex_lock(&ctxt->lock); -+ list_del(&io->io_list); -+ wake_up(&ctxt->wait); -+ mutex_unlock(&ctxt->lock); -+ -+ kfree(io); -+} -+ -+static void move_write_done(struct bch_write_op *op) -+{ -+ struct moving_io *io = container_of(op, struct moving_io, write.op); 
-+ struct moving_context *ctxt = io->write.ctxt; -+ -+ if (io->write.op.error) -+ ctxt->write_error = true; -+ -+ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); -+ atomic_dec(&io->write.ctxt->write_ios); -+ move_free(io); -+ closure_put(&ctxt->cl); -+} -+ -+static void move_write(struct moving_io *io) -+{ -+ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { -+ move_free(io); -+ return; -+ } -+ -+ closure_get(&io->write.ctxt->cl); -+ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); -+ atomic_inc(&io->write.ctxt->write_ios); -+ -+ bch2_data_update_read_done(&io->write, io->rbio.pick.crc); -+} -+ -+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) -+{ -+ struct moving_io *io = -+ list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list); -+ -+ return io && io->read_completed ? io : NULL; -+} -+ -+static void move_read_endio(struct bio *bio) -+{ -+ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); -+ struct moving_context *ctxt = io->write.ctxt; -+ -+ atomic_sub(io->read_sectors, &ctxt->read_sectors); -+ atomic_dec(&ctxt->read_ios); -+ io->read_completed = true; -+ -+ wake_up(&ctxt->wait); -+ closure_put(&ctxt->cl); -+} -+ -+void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt, -+ struct btree_trans *trans) -+{ -+ struct moving_io *io; -+ -+ if (trans) -+ bch2_trans_unlock(trans); -+ -+ while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { -+ list_del(&io->read_list); -+ move_write(io); -+ } -+} -+ -+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, -+ struct btree_trans *trans) -+{ -+ unsigned sectors_pending = atomic_read(&ctxt->write_sectors); -+ -+ move_ctxt_wait_event(ctxt, trans, -+ !atomic_read(&ctxt->write_sectors) || -+ atomic_read(&ctxt->write_sectors) != sectors_pending); -+} -+ -+void bch2_moving_ctxt_exit(struct moving_context *ctxt) -+{ -+ struct bch_fs *c = ctxt->c; -+ -+ move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); -+ closure_sync(&ctxt->cl); -+ -+ EBUG_ON(atomic_read(&ctxt->write_sectors)); -+ EBUG_ON(atomic_read(&ctxt->write_ios)); -+ EBUG_ON(atomic_read(&ctxt->read_sectors)); -+ EBUG_ON(atomic_read(&ctxt->read_ios)); -+ -+ if (ctxt->stats) { -+ progress_list_del(c, ctxt->stats); -+ trace_move_data(c, -+ atomic64_read(&ctxt->stats->sectors_moved), -+ atomic64_read(&ctxt->stats->keys_moved)); -+ } -+ -+ mutex_lock(&c->moving_context_lock); -+ list_del(&ctxt->list); -+ mutex_unlock(&c->moving_context_lock); -+} -+ -+void bch2_moving_ctxt_init(struct moving_context *ctxt, -+ struct bch_fs *c, -+ struct bch_ratelimit *rate, -+ struct bch_move_stats *stats, -+ struct write_point_specifier wp, -+ bool wait_on_copygc) -+{ -+ memset(ctxt, 0, sizeof(*ctxt)); -+ -+ ctxt->c = c; -+ ctxt->fn = (void *) _RET_IP_; -+ ctxt->rate = rate; -+ ctxt->stats = stats; -+ ctxt->wp = wp; -+ ctxt->wait_on_copygc = wait_on_copygc; -+ -+ closure_init_stack(&ctxt->cl); -+ -+ mutex_init(&ctxt->lock); -+ INIT_LIST_HEAD(&ctxt->reads); -+ INIT_LIST_HEAD(&ctxt->ios); -+ init_waitqueue_head(&ctxt->wait); -+ -+ mutex_lock(&c->moving_context_lock); -+ list_add(&ctxt->list, &c->moving_context_list); -+ mutex_unlock(&c->moving_context_lock); -+ -+ if (stats) { -+ progress_list_add(c, stats); -+ stats->data_type = BCH_DATA_user; -+ } -+} -+ -+void bch2_move_stats_init(struct bch_move_stats *stats, char *name) -+{ -+ memset(stats, 0, sizeof(*stats)); -+ scnprintf(stats->name, sizeof(stats->name), "%s", name); -+} -+ -+static int bch2_extent_drop_ptrs(struct 
btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct data_update_opts data_opts) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i *n; -+ int ret; -+ -+ n = bch2_bkey_make_mut_noupdate(trans, k); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ return ret; -+ -+ while (data_opts.kill_ptrs) { -+ unsigned i = 0, drop = __fls(data_opts.kill_ptrs); -+ struct bch_extent_ptr *ptr; -+ -+ bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); -+ data_opts.kill_ptrs ^= 1U << drop; -+ } -+ -+ /* -+ * If the new extent no longer has any pointers, bch2_extent_normalize() -+ * will do the appropriate thing with it (turning it into a -+ * KEY_TYPE_error key, or just a discard if it was a cached extent) -+ */ -+ bch2_extent_normalize(c, bkey_i_to_s(n)); -+ -+ /* -+ * Since we're not inserting through an extent iterator -+ * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), -+ * we aren't using the extent overwrite path to delete, we're -+ * just using the normal key deletion path: -+ */ -+ if (bkey_deleted(&n->k)) -+ n->k.size = 0; -+ -+ return bch2_trans_relock(trans) ?: -+ bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: -+ bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); -+} -+ -+static int bch2_move_extent(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct moving_context *ctxt, -+ struct move_bucket_in_flight *bucket_in_flight, -+ struct bch_io_opts io_opts, -+ enum btree_id btree_id, -+ struct bkey_s_c k, -+ struct data_update_opts data_opts) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct moving_io *io; -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned sectors = k.k->size, pages; -+ int ret = -ENOMEM; -+ -+ trace_move_extent2(c, k); -+ -+ bch2_data_update_opts_normalize(k, &data_opts); -+ -+ if (!data_opts.rewrite_ptrs && -+ !data_opts.extra_replicas) { -+ if (data_opts.kill_ptrs) -+ return bch2_extent_drop_ptrs(trans, iter, k, data_opts); -+ return 0; -+ } -+ -+ /* -+ * Before memory allocations & taking nocow locks in -+ * bch2_data_update_init(): -+ */ -+ bch2_trans_unlock(trans); -+ -+ /* write path might have to decompress data: */ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); -+ -+ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); -+ io = kzalloc(sizeof(struct moving_io) + -+ sizeof(struct bio_vec) * pages, GFP_KERNEL); -+ if (!io) -+ goto err; -+ -+ INIT_LIST_HEAD(&io->io_list); -+ io->write.ctxt = ctxt; -+ io->read_sectors = k.k->size; -+ io->write_sectors = k.k->size; -+ -+ bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); -+ bio_set_prio(&io->write.op.wbio.bio, -+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); -+ -+ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, -+ GFP_KERNEL)) -+ goto err_free; -+ -+ io->rbio.c = c; -+ io->rbio.opts = io_opts; -+ bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); -+ io->rbio.bio.bi_vcnt = pages; -+ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); -+ io->rbio.bio.bi_iter.bi_size = sectors << 9; -+ -+ io->rbio.bio.bi_opf = REQ_OP_READ; -+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); -+ io->rbio.bio.bi_end_io = move_read_endio; -+ -+ ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp, -+ io_opts, data_opts, btree_id, k); -+ if (ret && ret != -BCH_ERR_unwritten_extent_update) -+ goto err_free_pages; -+ -+ if (ret == -BCH_ERR_unwritten_extent_update) 
{ -+ bch2_update_unwritten_extent(trans, &io->write); -+ move_free(io); -+ return 0; -+ } -+ -+ BUG_ON(ret); -+ -+ io->write.ctxt = ctxt; -+ io->write.op.end_io = move_write_done; -+ -+ if (ctxt->stats) { -+ atomic64_inc(&ctxt->stats->keys_moved); -+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved); -+ } -+ -+ if (bucket_in_flight) { -+ io->b = bucket_in_flight; -+ atomic_inc(&io->b->count); -+ } -+ -+ this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); -+ this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); -+ trace_move_extent_read2(c, k); -+ -+ mutex_lock(&ctxt->lock); -+ atomic_add(io->read_sectors, &ctxt->read_sectors); -+ atomic_inc(&ctxt->read_ios); -+ -+ list_add_tail(&io->read_list, &ctxt->reads); -+ list_add_tail(&io->io_list, &ctxt->ios); -+ mutex_unlock(&ctxt->lock); -+ -+ /* -+ * dropped by move_read_endio() - guards against use after free of -+ * ctxt when doing wakeup -+ */ -+ closure_get(&ctxt->cl); -+ bch2_read_extent(trans, &io->rbio, -+ bkey_start_pos(k.k), -+ btree_id, k, 0, -+ BCH_READ_NODECODE| -+ BCH_READ_LAST_FRAGMENT); -+ return 0; -+err_free_pages: -+ bio_free_pages(&io->write.op.wbio.bio); -+err_free: -+ kfree(io); -+err: -+ this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]); -+ trace_move_extent_alloc_mem_fail2(c, k); -+ return ret; -+} -+ -+static int lookup_inode(struct btree_trans *trans, struct bpos pos, -+ struct bch_inode_unpacked *inode) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, -+ BTREE_ITER_ALL_SNAPSHOTS); -+ k = bch2_btree_iter_peek(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!k.k || !bkey_eq(k.k->p, pos)) { -+ ret = -BCH_ERR_ENOENT_inode; -+ goto err; -+ } -+ -+ ret = bkey_is_inode(k.k) ? 0 : -EIO; -+ if (ret) -+ goto err; -+ -+ ret = bch2_inode_unpack(k, inode); -+ if (ret) -+ goto err; -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int move_ratelimit(struct btree_trans *trans, -+ struct moving_context *ctxt) -+{ -+ struct bch_fs *c = trans->c; -+ u64 delay; -+ -+ if (ctxt->wait_on_copygc) { -+ bch2_trans_unlock(trans); -+ wait_event_killable(c->copygc_running_wq, -+ !c->copygc_running || -+ kthread_should_stop()); -+ } -+ -+ do { -+ delay = ctxt->rate ? 
bch2_ratelimit_delay(ctxt->rate) : 0; -+ -+ if (delay) { -+ bch2_trans_unlock(trans); -+ set_current_state(TASK_INTERRUPTIBLE); -+ } -+ -+ if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { -+ __set_current_state(TASK_RUNNING); -+ return 1; -+ } -+ -+ if (delay) -+ schedule_timeout(delay); -+ -+ if (unlikely(freezing(current))) { -+ move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads)); -+ try_to_freeze(); -+ } -+ } while (delay); -+ -+ /* -+ * XXX: these limits really ought to be per device, SSDs and hard drives -+ * will want different limits -+ */ -+ move_ctxt_wait_event(ctxt, trans, -+ atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && -+ atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && -+ atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && -+ atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight); -+ -+ return 0; -+} -+ -+static int move_get_io_opts(struct btree_trans *trans, -+ struct bch_io_opts *io_opts, -+ struct bkey_s_c k, u64 *cur_inum) -+{ -+ struct bch_inode_unpacked inode; -+ int ret; -+ -+ if (*cur_inum == k.k->p.inode) -+ return 0; -+ -+ ret = lookup_inode(trans, -+ SPOS(0, k.k->p.inode, k.k->p.snapshot), -+ &inode); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ return ret; -+ -+ if (!ret) -+ bch2_inode_opts_get(io_opts, trans->c, &inode); -+ else -+ *io_opts = bch2_opts_to_inode_opts(trans->c->opts); -+ *cur_inum = k.k->p.inode; -+ return 0; -+} -+ -+static int __bch2_move_data(struct moving_context *ctxt, -+ struct bpos start, -+ struct bpos end, -+ move_pred_fn pred, void *arg, -+ enum btree_id btree_id) -+{ -+ struct bch_fs *c = ctxt->c; -+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); -+ struct bkey_buf sk; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct data_update_opts data_opts; -+ u64 cur_inum = U64_MAX; -+ int ret = 0, ret2; -+ -+ bch2_bkey_buf_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ if (ctxt->stats) { -+ ctxt->stats->data_type = BCH_DATA_user; -+ ctxt->stats->btree_id = btree_id; -+ ctxt->stats->pos = start; -+ } -+ -+ bch2_trans_iter_init(&trans, &iter, btree_id, start, -+ BTREE_ITER_PREFETCH| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ -+ if (ctxt->rate) -+ bch2_ratelimit_reset(ctxt->rate); -+ -+ while (!move_ratelimit(&trans, ctxt)) { -+ bch2_trans_begin(&trans); -+ -+ k = bch2_btree_iter_peek(&iter); -+ if (!k.k) -+ break; -+ -+ ret = bkey_err(k); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ break; -+ -+ if (bkey_ge(bkey_start_pos(k.k), end)) -+ break; -+ -+ if (ctxt->stats) -+ ctxt->stats->pos = iter.pos; -+ -+ if (!bkey_extent_is_direct_data(k.k)) -+ goto next_nondata; -+ -+ ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); -+ if (ret) -+ continue; -+ -+ memset(&data_opts, 0, sizeof(data_opts)); -+ if (!pred(c, arg, k, &io_opts, &data_opts)) -+ goto next; -+ -+ /* -+ * The iterator gets unlocked by __bch2_read_extent - need to -+ * save a copy of @k elsewhere: -+ */ -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL, -+ io_opts, btree_id, k, data_opts); -+ if (ret2) { -+ if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) -+ continue; -+ -+ if (ret2 == -ENOMEM) { -+ /* memory allocation failure, wait for some IO to finish */ -+ bch2_move_ctxt_wait_for_io(ctxt, &trans); -+ continue; -+ } -+ -+ /* XXX signal failure */ -+ goto next; -+ } -+ -+ if (ctxt->rate) -+ 
bch2_ratelimit_increment(ctxt->rate, k.k->size); -+next: -+ if (ctxt->stats) -+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen); -+next_nondata: -+ bch2_btree_iter_advance(&iter); -+ } -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&sk, c); -+ -+ return ret; -+} -+ -+int bch2_move_data(struct bch_fs *c, -+ enum btree_id start_btree_id, struct bpos start_pos, -+ enum btree_id end_btree_id, struct bpos end_pos, -+ struct bch_ratelimit *rate, -+ struct bch_move_stats *stats, -+ struct write_point_specifier wp, -+ bool wait_on_copygc, -+ move_pred_fn pred, void *arg) -+{ -+ struct moving_context ctxt; -+ enum btree_id id; -+ int ret; -+ -+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); -+ -+ for (id = start_btree_id; -+ id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); -+ id++) { -+ stats->btree_id = id; -+ -+ if (id != BTREE_ID_extents && -+ id != BTREE_ID_reflink) -+ continue; -+ -+ if (!bch2_btree_id_root(c, id)->b) -+ continue; -+ -+ ret = __bch2_move_data(&ctxt, -+ id == start_btree_id ? start_pos : POS_MIN, -+ id == end_btree_id ? end_pos : POS_MAX, -+ pred, arg, id); -+ if (ret) -+ break; -+ } -+ -+ bch2_moving_ctxt_exit(&ctxt); -+ -+ return ret; -+} -+ -+int __bch2_evacuate_bucket(struct btree_trans *trans, -+ struct moving_context *ctxt, -+ struct move_bucket_in_flight *bucket_in_flight, -+ struct bpos bucket, int gen, -+ struct data_update_opts _data_opts) -+{ -+ struct bch_fs *c = ctxt->c; -+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); -+ struct btree_iter iter; -+ struct bkey_buf sk; -+ struct bch_backpointer bp; -+ struct bch_alloc_v4 a_convert; -+ const struct bch_alloc_v4 *a; -+ struct bkey_s_c k; -+ struct data_update_opts data_opts; -+ unsigned dirty_sectors, bucket_size; -+ u64 fragmentation; -+ u64 cur_inum = U64_MAX; -+ struct bpos bp_pos = POS_MIN; -+ int ret = 0; -+ -+ trace_bucket_evacuate(c, &bucket); -+ -+ bch2_bkey_buf_init(&sk); -+ -+ /* -+ * We're not run in a context that handles transaction restarts: -+ */ -+ bch2_trans_begin(trans); -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, -+ bucket, BTREE_ITER_CACHED); -+ ret = lockrestart_do(trans, -+ bkey_err(k = bch2_btree_iter_peek_slot(&iter))); -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (ret) { -+ bch_err_msg(c, ret, "looking up alloc key"); -+ goto err; -+ } -+ -+ a = bch2_alloc_to_v4(k, &a_convert); -+ dirty_sectors = a->dirty_sectors; -+ bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; -+ fragmentation = a->fragmentation_lru; -+ -+ ret = bch2_btree_write_buffer_flush(trans); -+ if (ret) { -+ bch_err_msg(c, ret, "flushing btree write buffer"); -+ goto err; -+ } -+ -+ while (!(ret = move_ratelimit(trans, ctxt))) { -+ bch2_trans_begin(trans); -+ -+ ret = bch2_get_next_backpointer(trans, bucket, gen, -+ &bp_pos, &bp, -+ BTREE_ITER_CACHED); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ goto err; -+ if (bkey_eq(bp_pos, POS_MAX)) -+ break; -+ -+ if (!bp.level) { -+ const struct bch_extent_ptr *ptr; -+ struct bkey_s_c k; -+ unsigned i = 0; -+ -+ k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); -+ ret = bkey_err(k); -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ goto err; -+ if (!k.k) -+ goto next; -+ -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ ret = move_get_io_opts(trans, &io_opts, k, &cur_inum); -+ if (ret) { -+ bch2_trans_iter_exit(trans, &iter); -+ continue; -+ } -+ 
-+ data_opts = _data_opts; -+ data_opts.target = io_opts.background_target; -+ data_opts.rewrite_ptrs = 0; -+ -+ bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { -+ if (ptr->dev == bucket.inode) { -+ data_opts.rewrite_ptrs |= 1U << i; -+ if (ptr->cached) { -+ bch2_trans_iter_exit(trans, &iter); -+ goto next; -+ } -+ } -+ i++; -+ } -+ -+ ret = bch2_move_extent(trans, &iter, ctxt, -+ bucket_in_flight, -+ io_opts, bp.btree_id, k, data_opts); -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret == -ENOMEM) { -+ /* memory allocation failure, wait for some IO to finish */ -+ bch2_move_ctxt_wait_for_io(ctxt, trans); -+ continue; -+ } -+ if (ret) -+ goto err; -+ -+ if (ctxt->rate) -+ bch2_ratelimit_increment(ctxt->rate, k.k->size); -+ if (ctxt->stats) -+ atomic64_add(k.k->size, &ctxt->stats->sectors_seen); -+ } else { -+ struct btree *b; -+ -+ b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); -+ ret = PTR_ERR_OR_ZERO(b); -+ if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) -+ continue; -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ goto err; -+ if (!b) -+ goto next; -+ -+ ret = bch2_btree_node_rewrite(trans, &iter, b, 0); -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ goto err; -+ -+ if (ctxt->rate) -+ bch2_ratelimit_increment(ctxt->rate, -+ c->opts.btree_node_size >> 9); -+ if (ctxt->stats) { -+ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); -+ atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); -+ } -+ } -+next: -+ bp_pos = bpos_nosnap_successor(bp_pos); -+ } -+ -+ trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); -+err: -+ bch2_bkey_buf_exit(&sk, c); -+ return ret; -+} -+ -+int bch2_evacuate_bucket(struct bch_fs *c, -+ struct bpos bucket, int gen, -+ struct data_update_opts data_opts, -+ struct bch_ratelimit *rate, -+ struct bch_move_stats *stats, -+ struct write_point_specifier wp, -+ bool wait_on_copygc) -+{ -+ struct btree_trans trans; -+ struct moving_context ctxt; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); -+ ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts); -+ bch2_moving_ctxt_exit(&ctxt); -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+typedef bool (*move_btree_pred)(struct bch_fs *, void *, -+ struct btree *, struct bch_io_opts *, -+ struct data_update_opts *); -+ -+static int bch2_move_btree(struct bch_fs *c, -+ enum btree_id start_btree_id, struct bpos start_pos, -+ enum btree_id end_btree_id, struct bpos end_pos, -+ move_btree_pred pred, void *arg, -+ struct bch_move_stats *stats) -+{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct btree *b; -+ enum btree_id id; -+ struct data_update_opts data_opts; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ progress_list_add(c, stats); -+ -+ stats->data_type = BCH_DATA_btree; -+ -+ for (id = start_btree_id; -+ id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); -+ id++) { -+ stats->btree_id = id; -+ -+ if (!bch2_btree_id_root(c, id)->b) -+ continue; -+ -+ bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, -+ BTREE_ITER_PREFETCH); -+retry: -+ ret = 0; -+ while (bch2_trans_begin(&trans), -+ 
(b = bch2_btree_iter_peek_node(&iter)) && -+ !(ret = PTR_ERR_OR_ZERO(b))) { -+ if (kthread && kthread_should_stop()) -+ break; -+ -+ if ((cmp_int(id, end_btree_id) ?: -+ bpos_cmp(b->key.k.p, end_pos)) > 0) -+ break; -+ -+ stats->pos = iter.pos; -+ -+ if (!pred(c, arg, b, &io_opts, &data_opts)) -+ goto next; -+ -+ ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ continue; -+ if (ret) -+ break; -+next: -+ bch2_btree_iter_next_node(&iter); -+ } -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (kthread && kthread_should_stop()) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ -+ bch2_btree_interior_updates_flush(c); -+ -+ progress_list_del(c, stats); -+ return ret; -+} -+ -+static bool rereplicate_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_update_opts *data_opts) -+{ -+ unsigned nr_good = bch2_bkey_durability(c, k); -+ unsigned replicas = bkey_is_btree_ptr(k.k) -+ ? c->opts.metadata_replicas -+ : io_opts->data_replicas; -+ -+ if (!nr_good || nr_good >= replicas) -+ return false; -+ -+ data_opts->target = 0; -+ data_opts->extra_replicas = replicas - nr_good; -+ data_opts->btree_insert_flags = 0; -+ return true; -+} -+ -+static bool migrate_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_update_opts *data_opts) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ struct bch_ioctl_data *op = arg; -+ unsigned i = 0; -+ -+ data_opts->rewrite_ptrs = 0; -+ data_opts->target = 0; -+ data_opts->extra_replicas = 0; -+ data_opts->btree_insert_flags = 0; -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ if (ptr->dev == op->migrate.dev) -+ data_opts->rewrite_ptrs |= 1U << i; -+ i++; -+ } -+ -+ return data_opts->rewrite_ptrs != 0; -+} -+ -+static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, -+ struct btree *b, -+ struct bch_io_opts *io_opts, -+ struct data_update_opts *data_opts) -+{ -+ return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); -+} -+ -+static bool migrate_btree_pred(struct bch_fs *c, void *arg, -+ struct btree *b, -+ struct bch_io_opts *io_opts, -+ struct data_update_opts *data_opts) -+{ -+ return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); -+} -+ -+static bool bformat_needs_redo(struct bkey_format *f) -+{ -+ unsigned i; -+ -+ for (i = 0; i < f->nr_fields; i++) { -+ unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; -+ u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); -+ u64 field_offset = le64_to_cpu(f->field_offset[i]); -+ -+ if (f->bits_per_field[i] > unpacked_bits) -+ return true; -+ -+ if ((f->bits_per_field[i] == unpacked_bits) && field_offset) -+ return true; -+ -+ if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & -+ unpacked_mask) < -+ field_offset) -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, -+ struct btree *b, -+ struct bch_io_opts *io_opts, -+ struct data_update_opts *data_opts) -+{ -+ if (b->version_ondisk != c->sb.version || -+ btree_node_need_rewrite(b) || -+ bformat_needs_redo(&b->format)) { -+ data_opts->target = 0; -+ data_opts->extra_replicas = 0; -+ data_opts->btree_insert_flags = 0; -+ return true; -+ } -+ -+ return false; -+} -+ -+int bch2_scan_old_btree_nodes(struct bch_fs *c, struct 
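bformat_needs_redo() above flags btree node formats whose packed fields could overflow on unpack: a field needs a rewrite if its packed width exceeds the unpacked width, if the widths are equal but a field offset is applied, or if the offset plus the largest packable value wraps past the unpacked field's range. A hedged, standalone illustration of that per-field test (the real widths come from bch2_bkey_format_current; the numbers below are invented):

	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Same per-field test as bformat_needs_redo(); assumes packed_bits < 64. */
	static bool field_can_overflow(unsigned packed_bits, unsigned unpacked_bits,
				       uint64_t field_offset)
	{
		/* all-ones mask of the unpacked width; the double shift avoids
		 * undefined behaviour when unpacked_bits == 64 */
		uint64_t unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1));
		uint64_t max_packed = (1ULL << packed_bits) - 1;

		if (packed_bits > unpacked_bits)
			return true;
		if (packed_bits == unpacked_bits && field_offset)
			return true;
		/* offset + largest packable value wraps past the unpacked width */
		return ((field_offset + max_packed) & unpacked_mask) < field_offset;
	}

	int main(void)
	{
		/* 32-bit packed field near the top of a 64-bit unpacked field: fine */
		printf("%d\n", field_can_overflow(32, 64, (1ULL << 32) - 16));	/* 0 */
		/* 16-bit packed field whose offset pushes it past 32 bits: redo */
		printf("%d\n", field_can_overflow(16, 32, 0xFFFFFF00));		/* 1 */
		return 0;
	}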
bch_move_stats *stats) -+{ -+ int ret; -+ -+ ret = bch2_move_btree(c, -+ 0, POS_MIN, -+ BTREE_ID_NR, SPOS_MAX, -+ rewrite_old_nodes_pred, c, stats); -+ if (!ret) { -+ mutex_lock(&c->sb_lock); -+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); -+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); -+ c->disk_sb.sb->version_min = c->disk_sb.sb->version; -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+int bch2_data_job(struct bch_fs *c, -+ struct bch_move_stats *stats, -+ struct bch_ioctl_data op) -+{ -+ int ret = 0; -+ -+ switch (op.op) { -+ case BCH_DATA_OP_REREPLICATE: -+ bch2_move_stats_init(stats, "rereplicate"); -+ stats->data_type = BCH_DATA_journal; -+ ret = bch2_journal_flush_device_pins(&c->journal, -1); -+ -+ ret = bch2_move_btree(c, -+ op.start_btree, op.start_pos, -+ op.end_btree, op.end_pos, -+ rereplicate_btree_pred, c, stats) ?: ret; -+ ret = bch2_replicas_gc2(c) ?: ret; -+ -+ ret = bch2_move_data(c, -+ op.start_btree, op.start_pos, -+ op.end_btree, op.end_pos, -+ NULL, -+ stats, -+ writepoint_hashed((unsigned long) current), -+ true, -+ rereplicate_pred, c) ?: ret; -+ ret = bch2_replicas_gc2(c) ?: ret; -+ break; -+ case BCH_DATA_OP_MIGRATE: -+ if (op.migrate.dev >= c->sb.nr_devices) -+ return -EINVAL; -+ -+ bch2_move_stats_init(stats, "migrate"); -+ stats->data_type = BCH_DATA_journal; -+ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); -+ -+ ret = bch2_move_btree(c, -+ op.start_btree, op.start_pos, -+ op.end_btree, op.end_pos, -+ migrate_btree_pred, &op, stats) ?: ret; -+ ret = bch2_replicas_gc2(c) ?: ret; -+ -+ ret = bch2_move_data(c, -+ op.start_btree, op.start_pos, -+ op.end_btree, op.end_pos, -+ NULL, -+ stats, -+ writepoint_hashed((unsigned long) current), -+ true, -+ migrate_pred, &op) ?: ret; -+ ret = bch2_replicas_gc2(c) ?: ret; -+ break; -+ case BCH_DATA_OP_REWRITE_OLD_NODES: -+ bch2_move_stats_init(stats, "rewrite_old_nodes"); -+ ret = bch2_scan_old_btree_nodes(c, stats); -+ break; -+ default: -+ ret = -EINVAL; -+ } -+ -+ return ret; -+} -+ -+static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) -+{ -+ struct bch_move_stats *stats = ctxt->stats; -+ struct moving_io *io; -+ -+ prt_printf(out, "%s (%ps):", stats->name, ctxt->fn); -+ prt_newline(out); -+ -+ prt_printf(out, " data type %s btree_id %s position: ", -+ bch2_data_types[stats->data_type], -+ bch2_btree_ids[stats->btree_id]); -+ bch2_bpos_to_text(out, stats->pos); -+ prt_newline(out); -+ printbuf_indent_add(out, 2); -+ -+ prt_printf(out, "reads: ios %u/%u sectors %u/%u", -+ atomic_read(&ctxt->read_ios), -+ c->opts.move_ios_in_flight, -+ atomic_read(&ctxt->read_sectors), -+ c->opts.move_bytes_in_flight >> 9); -+ prt_newline(out); -+ -+ prt_printf(out, "writes: ios %u/%u sectors %u/%u", -+ atomic_read(&ctxt->write_ios), -+ c->opts.move_ios_in_flight, -+ atomic_read(&ctxt->write_sectors), -+ c->opts.move_bytes_in_flight >> 9); -+ prt_newline(out); -+ -+ printbuf_indent_add(out, 2); -+ -+ mutex_lock(&ctxt->lock); -+ list_for_each_entry(io, &ctxt->ios, io_list) -+ bch2_write_op_to_text(out, &io->write.op); -+ mutex_unlock(&ctxt->lock); -+ -+ printbuf_indent_sub(out, 4); -+} -+ -+void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct moving_context *ctxt; -+ -+ mutex_lock(&c->moving_context_lock); -+ list_for_each_entry(ctxt, &c->moving_context_list, list) -+ 
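bch2_data_job() above strings its phases together with the GNU "a ?: b" extension, which the kernel uses freely: ret = phase() ?: ret lets every phase run while ensuring a later success cannot clobber an earlier failure (a later failure does replace it). A small demonstration of the idiom with made-up phase functions; it needs gcc or clang for the ?: extension:

	#include <stdio.h>

	static int phase_a(void) { return 0; }	/* succeeds */
	static int phase_b(void) { return -5; }	/* fails, -EIO-style */
	static int phase_c(void) { return 0; }	/* succeeds */

	int main(void)
	{
		int ret = 0;

		/* "a ?: b" evaluates to a if a is non-zero, else b.  Every phase
		 * runs; a failing phase overwrites ret, a succeeding phase
		 * leaves any earlier error in place. */
		ret = phase_a() ?: ret;
		ret = phase_b() ?: ret;
		ret = phase_c() ?: ret;

		printf("final ret = %d\n", ret);	/* -5 */
		return 0;
	}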
bch2_moving_ctxt_to_text(out, c, ctxt); -+ mutex_unlock(&c->moving_context_lock); -+} -+ -+void bch2_fs_move_init(struct bch_fs *c) -+{ -+ INIT_LIST_HEAD(&c->moving_context_list); -+ mutex_init(&c->moving_context_lock); -+ -+ INIT_LIST_HEAD(&c->data_progress_list); -+ mutex_init(&c->data_progress_lock); -+} -diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h -new file mode 100644 -index 000000000..c3136abe8 ---- /dev/null -+++ b/fs/bcachefs/move.h -@@ -0,0 +1,95 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVE_H -+#define _BCACHEFS_MOVE_H -+ -+#include "btree_iter.h" -+#include "buckets.h" -+#include "data_update.h" -+#include "move_types.h" -+ -+struct bch_read_bio; -+ -+struct moving_context { -+ struct bch_fs *c; -+ struct list_head list; -+ void *fn; -+ -+ struct bch_ratelimit *rate; -+ struct bch_move_stats *stats; -+ struct write_point_specifier wp; -+ bool wait_on_copygc; -+ bool write_error; -+ -+ /* For waiting on outstanding reads and writes: */ -+ struct closure cl; -+ -+ struct mutex lock; -+ struct list_head reads; -+ struct list_head ios; -+ -+ /* in flight sectors: */ -+ atomic_t read_sectors; -+ atomic_t write_sectors; -+ atomic_t read_ios; -+ atomic_t write_ios; -+ -+ wait_queue_head_t wait; -+}; -+ -+#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ -+do { \ -+ bool cond_finished = false; \ -+ bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \ -+ \ -+ if (_cond) \ -+ break; \ -+ __wait_event((_ctxt)->wait, \ -+ bch2_moving_ctxt_next_pending_write(_ctxt) || \ -+ (cond_finished = (_cond))); \ -+ if (cond_finished) \ -+ break; \ -+} while (1) -+ -+typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, -+ struct bch_io_opts *, struct data_update_opts *); -+ -+void bch2_moving_ctxt_exit(struct moving_context *); -+void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, -+ struct bch_ratelimit *, struct bch_move_stats *, -+ struct write_point_specifier, bool); -+struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); -+void bch2_moving_ctxt_do_pending_writes(struct moving_context *, -+ struct btree_trans *); -+ -+int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); -+ -+int bch2_move_data(struct bch_fs *, -+ enum btree_id, struct bpos, -+ enum btree_id, struct bpos, -+ struct bch_ratelimit *, -+ struct bch_move_stats *, -+ struct write_point_specifier, -+ bool, -+ move_pred_fn, void *); -+ -+int __bch2_evacuate_bucket(struct btree_trans *, -+ struct moving_context *, -+ struct move_bucket_in_flight *, -+ struct bpos, int, -+ struct data_update_opts); -+int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, -+ struct data_update_opts, -+ struct bch_ratelimit *, -+ struct bch_move_stats *, -+ struct write_point_specifier, -+ bool); -+int bch2_data_job(struct bch_fs *, -+ struct bch_move_stats *, -+ struct bch_ioctl_data); -+ -+void bch2_move_stats_init(struct bch_move_stats *stats, char *name); -+void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_fs_move_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_MOVE_H */ -diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h -new file mode 100644 -index 000000000..baf1f8570 ---- /dev/null -+++ b/fs/bcachefs/move_types.h -@@ -0,0 +1,36 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVE_TYPES_H -+#define _BCACHEFS_MOVE_TYPES_H -+ -+struct bch_move_stats { -+ enum bch_data_type data_type; -+ enum btree_id btree_id; -+ struct bpos pos; -+ struct list_head list; -+ char 
name[32]; -+ -+ atomic64_t keys_moved; -+ atomic64_t keys_raced; -+ atomic64_t sectors_moved; -+ atomic64_t sectors_seen; -+ atomic64_t sectors_raced; -+}; -+ -+struct move_bucket_key { -+ struct bpos bucket; -+ u8 gen; -+}; -+ -+struct move_bucket { -+ struct move_bucket_key k; -+ unsigned sectors; -+}; -+ -+struct move_bucket_in_flight { -+ struct move_bucket_in_flight *next; -+ struct rhash_head hash; -+ struct move_bucket bucket; -+ atomic_t count; -+}; -+ -+#endif /* _BCACHEFS_MOVE_TYPES_H */ -diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c -new file mode 100644 -index 000000000..256431a6d ---- /dev/null -+++ b/fs/bcachefs/movinggc.c -@@ -0,0 +1,423 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Moving/copying garbage collector -+ * -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "btree_write_buffer.h" -+#include "buckets.h" -+#include "clock.h" -+#include "disk_groups.h" -+#include "errcode.h" -+#include "error.h" -+#include "extents.h" -+#include "eytzinger.h" -+#include "io.h" -+#include "keylist.h" -+#include "lru.h" -+#include "move.h" -+#include "movinggc.h" -+#include "super-io.h" -+#include "trace.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+struct buckets_in_flight { -+ struct rhashtable table; -+ struct move_bucket_in_flight *first; -+ struct move_bucket_in_flight *last; -+ size_t nr; -+ size_t sectors; -+}; -+ -+static const struct rhashtable_params bch_move_bucket_params = { -+ .head_offset = offsetof(struct move_bucket_in_flight, hash), -+ .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), -+ .key_len = sizeof(struct move_bucket_key), -+}; -+ -+static struct move_bucket_in_flight * -+move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) -+{ -+ struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL); -+ int ret; -+ -+ if (!new) -+ return ERR_PTR(-ENOMEM); -+ -+ new->bucket = b; -+ -+ ret = rhashtable_lookup_insert_fast(&list->table, &new->hash, -+ bch_move_bucket_params); -+ if (ret) { -+ kfree(new); -+ return ERR_PTR(ret); -+ } -+ -+ if (!list->first) -+ list->first = new; -+ else -+ list->last->next = new; -+ -+ list->last = new; -+ list->nr++; -+ list->sectors += b.sectors; -+ return new; -+} -+ -+static int bch2_bucket_is_movable(struct btree_trans *trans, -+ struct move_bucket *b, u64 time) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_alloc_v4 _a; -+ const struct bch_alloc_v4 *a; -+ int ret; -+ -+ if (bch2_bucket_is_open(trans->c, -+ b->k.bucket.inode, -+ b->k.bucket.offset)) -+ return 0; -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, -+ b->k.bucket, BTREE_ITER_CACHED); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ a = bch2_alloc_to_v4(k, &_a); -+ b->k.gen = a->gen; -+ b->sectors = a->dirty_sectors; -+ -+ ret = data_type_movable(a->data_type) && -+ a->fragmentation_lru && -+ a->fragmentation_lru <= time; -+ -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static void move_buckets_wait(struct btree_trans *trans, -+ struct moving_context *ctxt, -+ struct buckets_in_flight *list, -+ bool flush) -+{ -+ struct move_bucket_in_flight *i; -+ int ret; -+ -+ while ((i = list->first)) { -+ if (flush) -+ move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count)); -+ -+ if (atomic_read(&i->count)) -+ break; -+ -+ list->first = i->next; -+ if (!list->first) -+ list->last 
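move_bucket_in_flight_add() above threads each bucket onto a singly linked FIFO so that move_buckets_wait() can retire buckets strictly in the order their moves were started, while the rhashtable answers "is this bucket already in flight?" in O(1) for bucket_in_flight(). The sketch below keeps the same FIFO shape but substitutes a plain linear scan for the rhashtable; the names and the io_count field are illustrative only:

	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct in_flight {
		struct in_flight	*next;
		unsigned long		bucket;
		int			io_count; /* like move_bucket_in_flight.count */
	};

	struct flight_list { struct in_flight *first, *last; size_t nr; };

	static bool bucket_in_flight(struct flight_list *l, unsigned long bucket)
	{
		for (struct in_flight *i = l->first; i; i = i->next)
			if (i->bucket == bucket)
				return true;
		return false;
	}

	static struct in_flight *flight_add(struct flight_list *l, unsigned long bucket)
	{
		struct in_flight *new = calloc(1, sizeof(*new));

		if (!new)
			return NULL;
		new->bucket = bucket;
		if (!l->first)
			l->first = new;
		else
			l->last->next = new;
		l->last = new;
		l->nr++;
		return new;
	}

	/* Retire completed buckets from the head only, preserving start order. */
	static void flight_drain(struct flight_list *l)
	{
		struct in_flight *i;

		while ((i = l->first) && !i->io_count) {
			l->first = i->next;
			if (!l->first)
				l->last = NULL;
			l->nr--;
			free(i);
		}
	}

	int main(void)
	{
		struct flight_list l = { 0 };

		flight_add(&l, 7);
		flight_add(&l, 9);
		printf("7 in flight: %d, nr=%zu\n", bucket_in_flight(&l, 7), l.nr);
		flight_drain(&l);
		printf("after drain: nr=%zu\n", l.nr);
		return 0;
	}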
= NULL; -+ -+ list->nr--; -+ list->sectors -= i->bucket.sectors; -+ -+ ret = rhashtable_remove_fast(&list->table, &i->hash, -+ bch_move_bucket_params); -+ BUG_ON(ret); -+ kfree(i); -+ } -+ -+ bch2_trans_unlock(trans); -+} -+ -+static bool bucket_in_flight(struct buckets_in_flight *list, -+ struct move_bucket_key k) -+{ -+ return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); -+} -+ -+typedef DARRAY(struct move_bucket) move_buckets; -+ -+static int bch2_copygc_get_buckets(struct btree_trans *trans, -+ struct moving_context *ctxt, -+ struct buckets_in_flight *buckets_in_flight, -+ move_buckets *buckets) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4); -+ size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; -+ int ret; -+ -+ move_buckets_wait(trans, ctxt, buckets_in_flight, false); -+ -+ ret = bch2_btree_write_buffer_flush(trans); -+ if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", -+ __func__, bch2_err_str(ret))) -+ return ret; -+ -+ ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, -+ lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), -+ lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), -+ 0, k, ({ -+ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; -+ int ret = 0; -+ -+ saw++; -+ -+ if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p))) -+ not_movable++; -+ else if (bucket_in_flight(buckets_in_flight, b.k)) -+ in_flight++; -+ else { -+ ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; -+ if (ret >= 0) -+ sectors += b.sectors; -+ } -+ ret; -+ })); -+ -+ pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", -+ buckets_in_flight->nr, buckets_in_flight->sectors, -+ saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); -+ -+ return ret < 0 ? 
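In bch2_copygc_get_buckets() above, the for_each_btree_key2_upto() body follows a three-way convention: a negative return aborts the walk with an error, a positive return stops it early once enough buckets have been gathered, zero keeps iterating, and the caller then squashes the positive "stopped early" case back to success with ret < 0 ? ret : 0. A generic sketch of that convention, with an invented iterate() helper standing in for the btree iterator:

	#include <stdio.h>

	static int iterate(const int *keys, int nr,
			   int (*fn)(int key, void *arg), void *arg)
	{
		for (int i = 0; i < nr; i++) {
			int ret = fn(keys[i], arg);
			if (ret)
				return ret;	/* error or early stop */
		}
		return 0;
	}

	struct collect { int got, want; };

	static int collect_fn(int key, void *arg)
	{
		struct collect *c = arg;

		if (key < 0)
			return -22;		/* pretend -EINVAL: abort the walk */
		c->got++;
		return c->got >= c->want;	/* >0: have enough, stop cleanly */
	}

	int main(void)
	{
		int keys[] = { 1, 2, 3, 4, 5 };
		struct collect c = { .want = 3 };
		int ret = iterate(keys, 5, collect_fn, &c);

		/* positive means "stopped early", which the caller treats as success */
		printf("collected %d, ret %d -> %d\n", c.got, ret, ret < 0 ? ret : 0);
		return 0;
	}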
ret : 0; -+} -+ -+noinline -+static int bch2_copygc(struct btree_trans *trans, -+ struct moving_context *ctxt, -+ struct buckets_in_flight *buckets_in_flight) -+{ -+ struct bch_fs *c = trans->c; -+ struct data_update_opts data_opts = { -+ .btree_insert_flags = BCH_WATERMARK_copygc, -+ }; -+ move_buckets buckets = { 0 }; -+ struct move_bucket_in_flight *f; -+ struct move_bucket *i; -+ u64 moved = atomic64_read(&ctxt->stats->sectors_moved); -+ int ret = 0; -+ -+ ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets); -+ if (ret) -+ goto err; -+ -+ darray_for_each(buckets, i) { -+ if (unlikely(freezing(current))) -+ break; -+ -+ f = move_bucket_in_flight_add(buckets_in_flight, *i); -+ ret = PTR_ERR_OR_ZERO(f); -+ if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ -+ ret = 0; -+ continue; -+ } -+ if (ret == -ENOMEM) { /* flush IO, continue later */ -+ ret = 0; -+ break; -+ } -+ -+ ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket, -+ f->bucket.k.gen, data_opts); -+ if (ret) -+ goto err; -+ } -+err: -+ darray_exit(&buckets); -+ -+ /* no entries in LRU btree found, or got to end: */ -+ if (bch2_err_matches(ret, ENOENT)) -+ ret = 0; -+ -+ if (ret < 0 && !bch2_err_matches(ret, EROFS)) -+ bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); -+ -+ moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; -+ trace_and_count(c, copygc, c, moved, 0, 0, 0); -+ return ret; -+} -+ -+/* -+ * Copygc runs when the amount of fragmented data is above some arbitrary -+ * threshold: -+ * -+ * The threshold at the limit - when the device is full - is the amount of space -+ * we reserved in bch2_recalc_capacity; we can't have more than that amount of -+ * disk space stranded due to fragmentation and store everything we have -+ * promised to store. -+ * -+ * But we don't want to be running copygc unnecessarily when the device still -+ * has plenty of free space - rather, we want copygc to smoothly run every so -+ * often and continually reduce the amount of fragmented space as the device -+ * fills up. So, we increase the threshold by half the current free space. 
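Concretely, the heuristic described above works out to: for every read-write device, allow half of the space still available at the stripe watermark to sit fragmented, and only wake copygc once the worst device has used up that allowance. A standalone sketch with invented numbers (the real figures come from bch2_dev_usage_read() and ca->mi.bucket_size):

	#include <stdint.h>
	#include <stdio.h>

	struct dev {
		int64_t	avail_buckets;	/* buckets available at the stripe watermark */
		int64_t	bucket_size;	/* sectors per bucket */
		int64_t	fragmented;	/* sectors of movable, fragmented data */
	};

	/* Mirrors bch2_copygc_wait_amount(): half the free space is the allowance,
	 * and the least-healthy device decides how long copygc sleeps. */
	static int64_t copygc_wait_amount(const struct dev *devs, int n)
	{
		int64_t wait = INT64_MAX;

		for (int i = 0; i < n; i++) {
			int64_t allowed = (devs[i].avail_buckets * devs[i].bucket_size) >> 1;
			int64_t this_wait = allowed - devs[i].fragmented;

			if (this_wait < 0)
				this_wait = 0;
			if (this_wait < wait)
				wait = this_wait;
		}
		return wait;
	}

	int main(void)
	{
		struct dev devs[] = {
			{ .avail_buckets = 1000, .bucket_size = 1024, .fragmented = 100000 },
			{ .avail_buckets =  200, .bucket_size = 1024, .fragmented =  90000 },
		};

		/* second device: allowance 102400, so only 12400 sectors of headroom */
		printf("wait = %lld sectors\n",
		       (long long)copygc_wait_amount(devs, 2));
		return 0;
	}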
-+ */ -+unsigned long bch2_copygc_wait_amount(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned dev_idx; -+ s64 wait = S64_MAX, fragmented_allowed, fragmented; -+ unsigned i; -+ -+ for_each_rw_member(ca, c, dev_idx) { -+ struct bch_dev_usage usage = bch2_dev_usage_read(ca); -+ -+ fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * -+ ca->mi.bucket_size) >> 1); -+ fragmented = 0; -+ -+ for (i = 0; i < BCH_DATA_NR; i++) -+ if (data_type_movable(i)) -+ fragmented += usage.d[i].fragmented; -+ -+ wait = min(wait, max(0LL, fragmented_allowed - fragmented)); -+ } -+ -+ return wait; -+} -+ -+void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ prt_printf(out, "Currently waiting for: "); -+ prt_human_readable_u64(out, max(0LL, c->copygc_wait - -+ atomic64_read(&c->io_clock[WRITE].now)) << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "Currently waiting since: "); -+ prt_human_readable_u64(out, max(0LL, -+ atomic64_read(&c->io_clock[WRITE].now) - -+ c->copygc_wait_at) << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "Currently calculated wait: "); -+ prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); -+ prt_newline(out); -+} -+ -+static int bch2_copygc_thread(void *arg) -+{ -+ struct bch_fs *c = arg; -+ struct btree_trans trans; -+ struct moving_context ctxt; -+ struct bch_move_stats move_stats; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ struct buckets_in_flight move_buckets; -+ u64 last, wait; -+ int ret = 0; -+ -+ memset(&move_buckets, 0, sizeof(move_buckets)); -+ -+ ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params); -+ if (ret) { -+ bch_err(c, "error allocating copygc buckets in flight: %s", -+ bch2_err_str(ret)); -+ return ret; -+ } -+ -+ set_freezable(); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ bch2_move_stats_init(&move_stats, "copygc"); -+ bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, -+ writepoint_ptr(&c->copygc_write_point), -+ false); -+ -+ while (!ret && !kthread_should_stop()) { -+ bch2_trans_unlock(&trans); -+ cond_resched(); -+ -+ if (!c->copy_gc_enabled) { -+ move_buckets_wait(&trans, &ctxt, &move_buckets, true); -+ kthread_wait_freezable(c->copy_gc_enabled); -+ } -+ -+ if (unlikely(freezing(current))) { -+ move_buckets_wait(&trans, &ctxt, &move_buckets, true); -+ __refrigerator(false); -+ continue; -+ } -+ -+ last = atomic64_read(&clock->now); -+ wait = bch2_copygc_wait_amount(c); -+ -+ if (wait > clock->max_slop) { -+ c->copygc_wait_at = last; -+ c->copygc_wait = last + wait; -+ move_buckets_wait(&trans, &ctxt, &move_buckets, true); -+ trace_and_count(c, copygc_wait, c, wait, last + wait); -+ bch2_kthread_io_clock_wait(clock, last + wait, -+ MAX_SCHEDULE_TIMEOUT); -+ continue; -+ } -+ -+ c->copygc_wait = 0; -+ -+ c->copygc_running = true; -+ ret = bch2_copygc(&trans, &ctxt, &move_buckets); -+ c->copygc_running = false; -+ -+ wake_up(&c->copygc_running_wq); -+ } -+ -+ move_buckets_wait(&trans, &ctxt, &move_buckets, true); -+ rhashtable_destroy(&move_buckets.table); -+ bch2_trans_exit(&trans); -+ bch2_moving_ctxt_exit(&ctxt); -+ -+ return 0; -+} -+ -+void bch2_copygc_stop(struct bch_fs *c) -+{ -+ if (c->copygc_thread) { -+ kthread_stop(c->copygc_thread); -+ put_task_struct(c->copygc_thread); -+ } -+ c->copygc_thread = NULL; -+} -+ -+int bch2_copygc_start(struct bch_fs *c) -+{ -+ struct task_struct *t; -+ int ret; -+ -+ if (c->copygc_thread) -+ return 0; -+ -+ if (c->opts.nochanges) -+ return 0; -+ -+ if (bch2_fs_init_fault("copygc_start")) -+ return -ENOMEM; -+ -+ t = 
kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); -+ ret = PTR_ERR_OR_ZERO(t); -+ if (ret) { -+ bch_err(c, "error creating copygc thread: %s", bch2_err_str(ret)); -+ return ret; -+ } -+ -+ get_task_struct(t); -+ -+ c->copygc_thread = t; -+ wake_up_process(c->copygc_thread); -+ -+ return 0; -+} -+ -+void bch2_fs_copygc_init(struct bch_fs *c) -+{ -+ init_waitqueue_head(&c->copygc_running_wq); -+ c->copygc_running = false; -+} -diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h -new file mode 100644 -index 000000000..ea181fef5 ---- /dev/null -+++ b/fs/bcachefs/movinggc.h -@@ -0,0 +1,12 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVINGGC_H -+#define _BCACHEFS_MOVINGGC_H -+ -+unsigned long bch2_copygc_wait_amount(struct bch_fs *); -+void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_copygc_stop(struct bch_fs *); -+int bch2_copygc_start(struct bch_fs *); -+void bch2_fs_copygc_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_MOVINGGC_H */ -diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c -new file mode 100644 -index 000000000..396357cd8 ---- /dev/null -+++ b/fs/bcachefs/nocow_locking.c -@@ -0,0 +1,123 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "nocow_locking.h" -+#include "util.h" -+ -+#include -+ -+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) -+{ -+ u64 dev_bucket = bucket_to_u64(bucket); -+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(l->b); i++) -+ if (l->b[i] == dev_bucket && atomic_read(&l->l[i])) -+ return true; -+ return false; -+} -+ -+#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0) -+ -+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) -+{ -+ u64 dev_bucket = bucket_to_u64(bucket); -+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); -+ int lock_val = flags ? 1 : -1; -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(l->b); i++) -+ if (l->b[i] == dev_bucket) { -+ BUG_ON(sign(atomic_read(&l->l[i])) != lock_val); -+ -+ if (!atomic_sub_return(lock_val, &l->l[i])) -+ closure_wake_up(&l->wait); -+ return; -+ } -+ -+ BUG(); -+} -+ -+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, -+ u64 dev_bucket, int flags) -+{ -+ int v, lock_val = flags ? 1 : -1; -+ unsigned i; -+ -+ spin_lock(&l->lock); -+ -+ for (i = 0; i < ARRAY_SIZE(l->b); i++) -+ if (l->b[i] == dev_bucket) -+ goto got_entry; -+ -+ for (i = 0; i < ARRAY_SIZE(l->b); i++) -+ if (!atomic_read(&l->l[i])) { -+ l->b[i] = dev_bucket; -+ goto take_lock; -+ } -+fail: -+ spin_unlock(&l->lock); -+ return false; -+got_entry: -+ v = atomic_read(&l->l[i]); -+ if (lock_val > 0 ? 
v < 0 : v > 0) -+ goto fail; -+take_lock: -+ atomic_add(lock_val, &l->l[i]); -+ spin_unlock(&l->lock); -+ return true; -+} -+ -+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, -+ struct nocow_lock_bucket *l, -+ u64 dev_bucket, int flags) -+{ -+ if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) { -+ struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); -+ u64 start_time = local_clock(); -+ -+ __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); -+ bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); -+ } -+} -+ -+void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t) -+{ -+ unsigned i, nr_zero = 0; -+ struct nocow_lock_bucket *l; -+ -+ for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) { -+ unsigned v = 0; -+ -+ for (i = 0; i < ARRAY_SIZE(l->l); i++) -+ v |= atomic_read(&l->l[i]); -+ -+ if (!v) { -+ nr_zero++; -+ continue; -+ } -+ -+ if (nr_zero) -+ prt_printf(out, "(%u empty entries)\n", nr_zero); -+ nr_zero = 0; -+ -+ for (i = 0; i < ARRAY_SIZE(l->l); i++) -+ if (atomic_read(&l->l[i])) -+ prt_printf(out, "%llu: %i ", l->b[i], atomic_read(&l->l[i])); -+ prt_newline(out); -+ } -+ -+ if (nr_zero) -+ prt_printf(out, "(%u empty entries)\n", nr_zero); -+} -+ -+int bch2_fs_nocow_locking_init(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++) -+ spin_lock_init(&c->nocow_locks.l[i].lock); -+ -+ return 0; -+} -diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h -new file mode 100644 -index 000000000..ff8e4af52 ---- /dev/null -+++ b/fs/bcachefs/nocow_locking.h -@@ -0,0 +1,49 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_NOCOW_LOCKING_H -+#define _BCACHEFS_NOCOW_LOCKING_H -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "nocow_locking_types.h" -+ -+#include -+ -+static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t, -+ u64 dev_bucket) -+{ -+ unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS); -+ -+ return t->l + (h & (BUCKET_NOCOW_LOCKS - 1)); -+} -+ -+#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) -+ -+bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos); -+void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int); -+bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int); -+void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, -+ struct nocow_lock_bucket *, u64, int); -+ -+static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, -+ struct bpos bucket, int flags) -+{ -+ u64 dev_bucket = bucket_to_u64(bucket); -+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); -+ -+ __bch2_bucket_nocow_lock(t, l, dev_bucket, flags); -+} -+ -+static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, -+ struct bpos bucket, int flags) -+{ -+ u64 dev_bucket = bucket_to_u64(bucket); -+ struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); -+ -+ return __bch2_bucket_nocow_trylock(l, dev_bucket, flags); -+} -+ -+void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); -+ -+int bch2_fs_nocow_locking_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_NOCOW_LOCKING_H */ -diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h -new file mode 100644 -index 000000000..bd12bf677 ---- /dev/null -+++ b/fs/bcachefs/nocow_locking_types.h -@@ -0,0 +1,20 @@ -+/* SPDX-License-Identifier: 
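Each nocow_lock_bucket above holds four (dev_bucket, counter) slots, and the counter's sign encodes the lock mode: BUCKET_NOCOW_LOCK_UPDATE holders push it positive, plain holders push it negative, so any number of holders of the same mode stack on one slot while the opposite mode is refused. A simplified single-slot trylock with the same sign rule (no spinlock, waitlist or hashing here):

	#include <stdbool.h>
	#include <stdio.h>

	/* One slot of a nocow lock bucket: the sign of 'count' is the lock mode. */
	struct slot { long long bucket; int count; };

	/* flags != 0 plays the role of BUCKET_NOCOW_LOCK_UPDATE: lock in the
	 * positive direction; otherwise lock in the negative direction. */
	static bool slot_trylock(struct slot *s, long long bucket, int flags)
	{
		int lock_val = flags ? 1 : -1;

		if (s->count && s->bucket != bucket)
			return false;		/* slot busy with another bucket */
		if (lock_val > 0 ? s->count < 0 : s->count > 0)
			return false;		/* held in the opposite mode */

		s->bucket = bucket;
		s->count += lock_val;
		return true;
	}

	int main(void)
	{
		struct slot s = { 0 };

		printf("%d\n", slot_trylock(&s, 42, 0));	/* 1: first holder     */
		printf("%d\n", slot_trylock(&s, 42, 0));	/* 1: same mode stacks */
		printf("%d\n", slot_trylock(&s, 42, 1));	/* 0: update refused   */
		return 0;
	}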
GPL-2.0 */ -+#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H -+#define _BCACHEFS_NOCOW_LOCKING_TYPES_H -+ -+#define BUCKET_NOCOW_LOCKS_BITS 10 -+#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) -+ -+struct nocow_lock_bucket { -+ struct closure_waitlist wait; -+ spinlock_t lock; -+ u64 b[4]; -+ atomic_t l[4]; -+} __aligned(SMP_CACHE_BYTES); -+ -+struct bucket_nocow_lock_table { -+ struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS]; -+}; -+ -+#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */ -+ -diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c -new file mode 100644 -index 000000000..960bb247f ---- /dev/null -+++ b/fs/bcachefs/opts.c -@@ -0,0 +1,599 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+ -+#include "bcachefs.h" -+#include "compress.h" -+#include "disk_groups.h" -+#include "error.h" -+#include "opts.h" -+#include "super-io.h" -+#include "util.h" -+ -+#define x(t, n, ...) [n] = #t, -+ -+const char * const bch2_error_actions[] = { -+ BCH_ERROR_ACTIONS() -+ NULL -+}; -+ -+const char * const bch2_fsck_fix_opts[] = { -+ BCH_FIX_ERRORS_OPTS() -+ NULL -+}; -+ -+const char * const bch2_version_upgrade_opts[] = { -+ BCH_VERSION_UPGRADE_OPTS() -+ NULL -+}; -+ -+const char * const bch2_sb_features[] = { -+ BCH_SB_FEATURES() -+ NULL -+}; -+ -+const char * const bch2_sb_compat[] = { -+ BCH_SB_COMPAT() -+ NULL -+}; -+ -+const char * const bch2_btree_ids[] = { -+ BCH_BTREE_IDS() -+ "interior btree node", -+ NULL -+}; -+ -+const char * const bch2_csum_types[] = { -+ BCH_CSUM_TYPES() -+ NULL -+}; -+ -+const char * const bch2_csum_opts[] = { -+ BCH_CSUM_OPTS() -+ NULL -+}; -+ -+const char * const bch2_compression_types[] = { -+ BCH_COMPRESSION_TYPES() -+ NULL -+}; -+ -+const char * const bch2_compression_opts[] = { -+ BCH_COMPRESSION_OPTS() -+ NULL -+}; -+ -+const char * const bch2_str_hash_types[] = { -+ BCH_STR_HASH_TYPES() -+ NULL -+}; -+ -+const char * const bch2_str_hash_opts[] = { -+ BCH_STR_HASH_OPTS() -+ NULL -+}; -+ -+const char * const bch2_data_types[] = { -+ BCH_DATA_TYPES() -+ NULL -+}; -+ -+const char * const bch2_member_states[] = { -+ BCH_MEMBER_STATES() -+ NULL -+}; -+ -+const char * const bch2_jset_entry_types[] = { -+ BCH_JSET_ENTRY_TYPES() -+ NULL -+}; -+ -+const char * const bch2_fs_usage_types[] = { -+ BCH_FS_USAGE_TYPES() -+ NULL -+}; -+ -+#undef x -+ -+static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, -+ struct printbuf *err) -+{ -+ if (!val) { -+ *res = FSCK_FIX_yes; -+ } else { -+ int ret = match_string(bch2_fsck_fix_opts, -1, val); -+ -+ if (ret < 0 && err) -+ prt_str(err, "fix_errors: invalid selection"); -+ if (ret < 0) -+ return ret; -+ *res = ret; -+ } -+ -+ return 0; -+} -+ -+static void bch2_opt_fix_errors_to_text(struct printbuf *out, -+ struct bch_fs *c, -+ struct bch_sb *sb, -+ u64 v) -+{ -+ prt_str(out, bch2_fsck_fix_opts[v]); -+} -+ -+#define bch2_opt_fix_errors (struct bch_opt_fn) { \ -+ .parse = bch2_opt_fix_errors_parse, \ -+ .to_text = bch2_opt_fix_errors_to_text, \ -+} -+ -+const char * const bch2_d_types[BCH_DT_MAX] = { -+ [DT_UNKNOWN] = "unknown", -+ [DT_FIFO] = "fifo", -+ [DT_CHR] = "chr", -+ [DT_DIR] = "dir", -+ [DT_BLK] = "blk", -+ [DT_REG] = "reg", -+ [DT_LNK] = "lnk", -+ [DT_SOCK] = "sock", -+ [DT_WHT] = "whiteout", -+ [DT_SUBVOL] = "subvol", -+}; -+ -+u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) -+{ -+ BUG(); -+} -+ -+void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) -+{ -+ BUG(); -+} -+ -+void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) -+{ -+#define x(_name, ...) 
\ -+ if (opt_defined(src, _name)) \ -+ opt_set(*dst, _name, src._name); -+ -+ BCH_OPTS() -+#undef x -+} -+ -+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ return opt_defined(*opts, _name); -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ return opts->_name; -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ opt_set(*opts, _name, v); \ -+ break; -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+const struct bch_option bch2_opt_table[] = { -+#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 -+#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ -+ .min = _min, .max = _max -+#define OPT_STR(_choices) .type = BCH_OPT_STR, \ -+ .min = 0, .max = ARRAY_SIZE(_choices), \ -+ .choices = _choices -+#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn -+ -+#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ -+ [Opt_##_name] = { \ -+ .attr = { \ -+ .name = #_name, \ -+ .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ -+ }, \ -+ .flags = _flags, \ -+ .hint = _hint, \ -+ .help = _help, \ -+ .get_sb = _sb_opt, \ -+ .set_sb = SET_##_sb_opt, \ -+ _type \ -+ }, -+ -+ BCH_OPTS() -+#undef x -+}; -+ -+int bch2_opt_lookup(const char *name) -+{ -+ const struct bch_option *i; -+ -+ for (i = bch2_opt_table; -+ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); -+ i++) -+ if (!strcmp(name, i->attr.name)) -+ return i - bch2_opt_table; -+ -+ return -1; -+} -+ -+struct synonym { -+ const char *s1, *s2; -+}; -+ -+static const struct synonym bch_opt_synonyms[] = { -+ { "quota", "usrquota" }, -+}; -+ -+static int bch2_mount_opt_lookup(const char *name) -+{ -+ const struct synonym *i; -+ -+ for (i = bch_opt_synonyms; -+ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); -+ i++) -+ if (!strcmp(name, i->s1)) -+ name = i->s2; -+ -+ return bch2_opt_lookup(name); -+} -+ -+int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) -+{ -+ if (v < opt->min) { -+ if (err) -+ prt_printf(err, "%s: too small (min %llu)", -+ opt->attr.name, opt->min); -+ return -ERANGE; -+ } -+ -+ if (opt->max && v >= opt->max) { -+ if (err) -+ prt_printf(err, "%s: too big (max %llu)", -+ opt->attr.name, opt->max); -+ return -ERANGE; -+ } -+ -+ if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { -+ if (err) -+ prt_printf(err, "%s: not a multiple of 512", -+ opt->attr.name); -+ return -EINVAL; -+ } -+ -+ if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { -+ if (err) -+ prt_printf(err, "%s: must be a power of two", -+ opt->attr.name); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+int bch2_opt_parse(struct bch_fs *c, -+ const struct bch_option *opt, -+ const char *val, u64 *res, -+ struct printbuf *err) -+{ -+ ssize_t ret; -+ -+ switch (opt->type) { -+ case BCH_OPT_BOOL: -+ if (val) { -+ ret = kstrtou64(val, 10, res); -+ } else { -+ ret = 0; -+ *res = 1; -+ } -+ -+ if (ret < 0 || (*res != 0 && *res != 1)) { -+ if (err) -+ prt_printf(err, "%s: must be bool", opt->attr.name); -+ return ret; -+ } -+ break; -+ case BCH_OPT_UINT: -+ if (!val) { -+ prt_printf(err, "%s: required value", -+ opt->attr.name); -+ return -EINVAL; -+ } -+ -+ ret = opt->flags & OPT_HUMAN_READABLE -+ 
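bch2_opts_apply(), the bch2_opt_defined_by_id()/get/set helpers and bch2_opt_table[] above are all stamped out of the single BCH_OPTS() x-macro, so an option added in one place picks up its enum value, struct field, default and table entry together. A toy version of the same pattern; OPTS(), opt_id and the three options are invented for the demo:

	#include <stdio.h>

	/* x(name, type, default) -- one line per option, expanded three ways below. */
	#define OPTS()					\
		x(block_size,	unsigned,	4096)	\
		x(verbose,	unsigned,	0)	\
		x(replicas,	unsigned,	1)

	enum opt_id {
	#define x(_name, _type, _default)	Opt_##_name,
		OPTS()
	#undef x
		opts_nr
	};

	struct opts {
	#define x(_name, _type, _default)	_type _name;
		OPTS()
	#undef x
	};

	static const struct opts opts_default = {
	#define x(_name, _type, _default)	._name = _default,
		OPTS()
	#undef x
	};

	static const char * const opt_names[] = {
	#define x(_name, _type, _default)	#_name,
		OPTS()
	#undef x
	};

	int main(void)
	{
		for (int i = 0; i < opts_nr; i++)
			printf("option %d: %s\n", i, opt_names[i]);
		printf("default block_size = %u\n", opts_default.block_size);
		return 0;
	}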
? bch2_strtou64_h(val, res) -+ : kstrtou64(val, 10, res); -+ if (ret < 0) { -+ if (err) -+ prt_printf(err, "%s: must be a number", -+ opt->attr.name); -+ return ret; -+ } -+ break; -+ case BCH_OPT_STR: -+ if (!val) { -+ prt_printf(err, "%s: required value", -+ opt->attr.name); -+ return -EINVAL; -+ } -+ -+ ret = match_string(opt->choices, -1, val); -+ if (ret < 0) { -+ if (err) -+ prt_printf(err, "%s: invalid selection", -+ opt->attr.name); -+ return ret; -+ } -+ -+ *res = ret; -+ break; -+ case BCH_OPT_FN: -+ ret = opt->fn.parse(c, val, res, err); -+ if (ret < 0) { -+ if (err) -+ prt_printf(err, "%s: parse error", -+ opt->attr.name); -+ return ret; -+ } -+ } -+ -+ return bch2_opt_validate(opt, *res, err); -+} -+ -+void bch2_opt_to_text(struct printbuf *out, -+ struct bch_fs *c, struct bch_sb *sb, -+ const struct bch_option *opt, u64 v, -+ unsigned flags) -+{ -+ if (flags & OPT_SHOW_MOUNT_STYLE) { -+ if (opt->type == BCH_OPT_BOOL) { -+ prt_printf(out, "%s%s", -+ v ? "" : "no", -+ opt->attr.name); -+ return; -+ } -+ -+ prt_printf(out, "%s=", opt->attr.name); -+ } -+ -+ switch (opt->type) { -+ case BCH_OPT_BOOL: -+ case BCH_OPT_UINT: -+ if (opt->flags & OPT_HUMAN_READABLE) -+ prt_human_readable_u64(out, v); -+ else -+ prt_printf(out, "%lli", v); -+ break; -+ case BCH_OPT_STR: -+ if (flags & OPT_SHOW_FULL_LIST) -+ prt_string_option(out, opt->choices, v); -+ else -+ prt_str(out, opt->choices[v]); -+ break; -+ case BCH_OPT_FN: -+ opt->fn.to_text(out, c, sb, v); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) -+{ -+ int ret = 0; -+ -+ switch (id) { -+ case Opt_compression: -+ case Opt_background_compression: -+ ret = bch2_check_set_has_compressed_data(c, v); -+ break; -+ case Opt_erasure_code: -+ if (v) -+ bch2_check_set_feature(c, BCH_FEATURE_ec); -+ break; -+ } -+ -+ return ret; -+} -+ -+int bch2_opts_check_may_set(struct bch_fs *c) -+{ -+ unsigned i; -+ int ret; -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ ret = bch2_opt_check_may_set(c, i, -+ bch2_opt_get_by_id(&c->opts, i)); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, -+ char *options) -+{ -+ char *copied_opts, *copied_opts_start; -+ char *opt, *name, *val; -+ int ret, id; -+ struct printbuf err = PRINTBUF; -+ u64 v; -+ -+ if (!options) -+ return 0; -+ -+ /* -+ * sys_fsconfig() is now occasionally providing us with option lists -+ * starting with a comma - weird. 
-+ */ -+ if (*options == ',') -+ options++; -+ -+ copied_opts = kstrdup(options, GFP_KERNEL); -+ if (!copied_opts) -+ return -1; -+ copied_opts_start = copied_opts; -+ -+ while ((opt = strsep(&copied_opts, ",")) != NULL) { -+ name = strsep(&opt, "="); -+ val = opt; -+ -+ id = bch2_mount_opt_lookup(name); -+ -+ /* Check for the form "noopt", negation of a boolean opt: */ -+ if (id < 0 && -+ !val && -+ !strncmp("no", name, 2)) { -+ id = bch2_mount_opt_lookup(name + 2); -+ val = "0"; -+ } -+ -+ if (id < 0) -+ goto bad_opt; -+ -+ if (!(bch2_opt_table[id].flags & OPT_MOUNT)) -+ goto bad_opt; -+ -+ if (id == Opt_acl && -+ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) -+ goto bad_opt; -+ -+ if ((id == Opt_usrquota || -+ id == Opt_grpquota) && -+ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) -+ goto bad_opt; -+ -+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); -+ if (ret < 0) -+ goto bad_val; -+ -+ bch2_opt_set_by_id(opts, id, v); -+ } -+ -+ ret = 0; -+ goto out; -+ -+bad_opt: -+ pr_err("Bad mount option %s", name); -+ ret = -1; -+ goto out; -+bad_val: -+ pr_err("Invalid mount option %s", err.buf); -+ ret = -1; -+ goto out; -+out: -+ kfree(copied_opts_start); -+ printbuf_exit(&err); -+ return ret; -+} -+ -+u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) -+{ -+ const struct bch_option *opt = bch2_opt_table + id; -+ u64 v; -+ -+ v = opt->get_sb(sb); -+ -+ if (opt->flags & OPT_SB_FIELD_ILOG2) -+ v = 1ULL << v; -+ -+ if (opt->flags & OPT_SB_FIELD_SECTORS) -+ v <<= 9; -+ -+ return v; -+} -+ -+/* -+ * Initial options from superblock - here we don't want any options undefined, -+ * any options the superblock doesn't specify are set to 0: -+ */ -+int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) -+{ -+ unsigned id; -+ -+ for (id = 0; id < bch2_opts_nr; id++) { -+ const struct bch_option *opt = bch2_opt_table + id; -+ -+ if (opt->get_sb == BCH2_NO_SB_OPT) -+ continue; -+ -+ bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); -+ } -+ -+ return 0; -+} -+ -+void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) -+{ -+ if (opt->set_sb == SET_BCH2_NO_SB_OPT) -+ return; -+ -+ if (opt->flags & OPT_SB_FIELD_SECTORS) -+ v >>= 9; -+ -+ if (opt->flags & OPT_SB_FIELD_ILOG2) -+ v = ilog2(v); -+ -+ opt->set_sb(sb, v); -+} -+ -+void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) -+{ -+ if (opt->set_sb == SET_BCH2_NO_SB_OPT) -+ return; -+ -+ mutex_lock(&c->sb_lock); -+ __bch2_opt_set_sb(c->disk_sb.sb, opt, v); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+} -+ -+/* io opts: */ -+ -+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) -+{ -+ return (struct bch_io_opts) { -+#define x(_name, _bits) ._name = src._name, -+ BCH_INODE_OPTS() -+#undef x -+ }; -+} -+ -+bool bch2_opt_is_inode_opt(enum bch_opt_id id) -+{ -+ static const enum bch_opt_id inode_opt_list[] = { -+#define x(_name, _bits) Opt_##_name, -+ BCH_INODE_OPTS() -+#undef x -+ }; -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) -+ if (inode_opt_list[i] == id) -+ return true; -+ -+ return false; -+} -diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h -new file mode 100644 -index 000000000..8a9db110d ---- /dev/null -+++ b/fs/bcachefs/opts.h -@@ -0,0 +1,563 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_OPTS_H -+#define _BCACHEFS_OPTS_H -+ -+#include -+#include -+#include -+#include -+#include "bcachefs_format.h" -+ -+struct bch_fs; -+ -+extern const char * const bch2_error_actions[]; -+extern const char * const bch2_fsck_fix_opts[]; 
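bch2_parse_mount_opts() above tokenises the option string with strsep() on commas, splits each token on '=', and treats a bare "nofoo" as foo=0 - but only after lookup of the full name fails, so options that genuinely start with "no" (norecovery, nochanges, ...) are not mangled. A userspace sketch of just the tokenising step, with option lookup and validation left out:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Print the (name, value) pairs that would be handed to the option table. */
	static void parse_opts(const char *options)
	{
		char *copy = strdup(options), *opts = copy, *opt;

		while ((opt = strsep(&opts, ",")) != NULL) {
			char *name = strsep(&opt, "=");
			char *val = opt;	/* NULL if no '=' was present */

			/* "nofoo" with no value negates the boolean "foo".  The real
			 * code only does this after lookup of the full name fails. */
			if (!val && !strncmp(name, "no", 2)) {
				name += 2;
				val = "0";
			}

			printf("name=%-20s val=%s\n", name,
			       val ? val : "(none: implies 1)");
		}
		free(copy);
	}

	int main(void)
	{
		parse_opts("degraded,noacl,metadata_replicas=2");
		return 0;
	}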
-+extern const char * const bch2_version_upgrade_opts[]; -+extern const char * const bch2_sb_features[]; -+extern const char * const bch2_sb_compat[]; -+extern const char * const bch2_btree_ids[]; -+extern const char * const bch2_csum_types[]; -+extern const char * const bch2_csum_opts[]; -+extern const char * const bch2_compression_types[]; -+extern const char * const bch2_compression_opts[]; -+extern const char * const bch2_str_hash_types[]; -+extern const char * const bch2_str_hash_opts[]; -+extern const char * const bch2_data_types[]; -+extern const char * const bch2_member_states[]; -+extern const char * const bch2_jset_entry_types[]; -+extern const char * const bch2_fs_usage_types[]; -+extern const char * const bch2_d_types[]; -+ -+static inline const char *bch2_d_type_str(unsigned d_type) -+{ -+ return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; -+} -+ -+/* -+ * Mount options; we also store defaults in the superblock. -+ * -+ * Also exposed via sysfs: if an option is writeable, and it's also stored in -+ * the superblock, changing it via sysfs (currently? might change this) also -+ * updates the superblock. -+ * -+ * We store options as signed integers, where -1 means undefined. This means we -+ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only -+ * apply the options from that struct that are defined. -+ */ -+ -+/* dummy option, for options that aren't stored in the superblock */ -+u64 BCH2_NO_SB_OPT(const struct bch_sb *); -+void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); -+ -+/* When can be set: */ -+enum opt_flags { -+ OPT_FS = (1 << 0), /* Filesystem option */ -+ OPT_DEVICE = (1 << 1), /* Device option */ -+ OPT_INODE = (1 << 2), /* Inode option */ -+ OPT_FORMAT = (1 << 3), /* May be specified at format time */ -+ OPT_MOUNT = (1 << 4), /* May be specified at mount time */ -+ OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ -+ OPT_HUMAN_READABLE = (1 << 6), -+ OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ -+ OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ -+ OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ -+}; -+ -+enum opt_type { -+ BCH_OPT_BOOL, -+ BCH_OPT_UINT, -+ BCH_OPT_STR, -+ BCH_OPT_FN, -+}; -+ -+struct bch_opt_fn { -+ int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); -+ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); -+}; -+ -+/** -+ * x(name, shortopt, type, in mem type, mode, sb_opt) -+ * -+ * @name - name of mount option, sysfs attribute, and struct bch_opts -+ * member -+ * -+ * @mode - when opt may be set -+ * -+ * @sb_option - name of corresponding superblock option -+ * -+ * @type - one of OPT_BOOL, OPT_UINT, OPT_STR -+ */ -+ -+/* -+ * XXX: add fields for -+ * - default value -+ * - helptext -+ */ -+ -+#ifdef __KERNEL__ -+#define RATELIMIT_ERRORS_DEFAULT true -+#else -+#define RATELIMIT_ERRORS_DEFAULT false -+#endif -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+#define BCACHEFS_VERBOSE_DEFAULT true -+#else -+#define BCACHEFS_VERBOSE_DEFAULT false -+#endif -+ -+#define BCH_FIX_ERRORS_OPTS() \ -+ x(exit, 0) \ -+ x(yes, 1) \ -+ x(no, 2) \ -+ x(ask, 3) -+ -+enum fsck_err_opts { -+#define x(t, n) FSCK_FIX_##t, -+ BCH_FIX_ERRORS_OPTS() -+#undef x -+}; -+ -+#define BCH_OPTS() \ -+ x(block_size, u16, \ -+ OPT_FS|OPT_FORMAT| \ -+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ -+ OPT_UINT(512, 1U << 16), \ -+ BCH_SB_BLOCK_SIZE, 8, \ -+ "size", NULL) \ -+ x(btree_node_size, u32, \ -+ 
OPT_FS|OPT_FORMAT| \ -+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ -+ OPT_UINT(512, 1U << 20), \ -+ BCH_SB_BTREE_NODE_SIZE, 512, \ -+ "size", "Btree node size, default 256k") \ -+ x(errors, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_error_actions), \ -+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ -+ NULL, "Action to take on filesystem error") \ -+ x(metadata_replicas, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_META_REPLICAS_WANT, 1, \ -+ "#", "Number of metadata replicas") \ -+ x(data_replicas, u8, \ -+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_DATA_REPLICAS_WANT, 1, \ -+ "#", "Number of data replicas") \ -+ x(metadata_replicas_required, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_META_REPLICAS_REQ, 1, \ -+ "#", NULL) \ -+ x(data_replicas_required, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_DATA_REPLICAS_REQ, 1, \ -+ "#", NULL) \ -+ x(encoded_extent_max, u32, \ -+ OPT_FS|OPT_FORMAT| \ -+ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ -+ OPT_UINT(4096, 2U << 20), \ -+ BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ -+ "size", "Maximum size of checksummed/compressed extents")\ -+ x(metadata_checksum, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_csum_opts), \ -+ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ -+ NULL, NULL) \ -+ x(data_checksum, u8, \ -+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_csum_opts), \ -+ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ -+ NULL, NULL) \ -+ x(compression, u8, \ -+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_FN(bch2_opt_compression), \ -+ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ -+ NULL, NULL) \ -+ x(background_compression, u8, \ -+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_FN(bch2_opt_compression), \ -+ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ -+ NULL, NULL) \ -+ x(str_hash, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_str_hash_opts), \ -+ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ -+ NULL, "Hash function for directory entries and xattrs")\ -+ x(metadata_target, u16, \ -+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_METADATA_TARGET, 0, \ -+ "(target)", "Device or label for metadata writes") \ -+ x(foreground_target, u16, \ -+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_FOREGROUND_TARGET, 0, \ -+ "(target)", "Device or label for foreground writes") \ -+ x(background_target, u16, \ -+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_BACKGROUND_TARGET, 0, \ -+ "(target)", "Device or label to move data to in the background")\ -+ x(promote_target, u16, \ -+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_PROMOTE_TARGET, 0, \ -+ "(target)", "Device or label to promote data to on read") \ -+ x(erasure_code, u16, \ -+ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_ERASURE_CODE, false, \ -+ NULL, "Enable erasure coding (DO NOT USE YET)") \ -+ x(inodes_32bit, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_INODE_32BIT, true, \ -+ NULL, "Constrain inode numbers to 32 bits") \ -+ x(shard_inode_numbers, u8, \ -+ 
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_SHARD_INUMS, true, \ -+ NULL, "Shard new inode numbers by CPU id") \ -+ x(inodes_use_key_cache, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_INODES_USE_KEY_CACHE, true, \ -+ NULL, "Use the btree key cache for the inodes btree") \ -+ x(btree_node_mem_ptr_optimization, u8, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, true, \ -+ NULL, "Stash pointer to in memory btree node in btree ptr")\ -+ x(btree_write_buffer_size, u32, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_UINT(16, (1U << 20) - 1), \ -+ BCH2_NO_SB_OPT, 1U << 13, \ -+ NULL, "Number of btree write buffer entries") \ -+ x(gc_reserve_percent, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(5, 21), \ -+ BCH_SB_GC_RESERVE, 8, \ -+ "%", "Percentage of disk space to reserve for copygc")\ -+ x(gc_reserve_bytes, u64, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ -+ OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ -+ OPT_UINT(0, U64_MAX), \ -+ BCH_SB_GC_RESERVE_BYTES, 0, \ -+ "%", "Amount of disk space to reserve for copygc\n" \ -+ "Takes precedence over gc_reserve_percent if set")\ -+ x(root_reserve_percent, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(0, 100), \ -+ BCH_SB_ROOT_RESERVE, 0, \ -+ "%", "Percentage of disk space to reserve for superuser")\ -+ x(wide_macs, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_128_BIT_MACS, false, \ -+ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ -+ x(inline_data, u8, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, true, \ -+ NULL, "Enable inline data extents") \ -+ x(acl, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_POSIX_ACL, true, \ -+ NULL, "Enable POSIX acls") \ -+ x(usrquota, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_USRQUOTA, false, \ -+ NULL, "Enable user quotas") \ -+ x(grpquota, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_GRPQUOTA, false, \ -+ NULL, "Enable group quotas") \ -+ x(prjquota, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_PRJQUOTA, false, \ -+ NULL, "Enable project quotas") \ -+ x(degraded, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Allow mounting in degraded mode") \ -+ x(very_degraded, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Allow mounting in when data will be missing") \ -+ x(discard, u8, \ -+ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, true, \ -+ NULL, "Enable discard/TRIM support") \ -+ x(verbose, u8, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \ -+ NULL, "Extra debugging information during mount/recovery")\ -+ x(journal_flush_delay, u32, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(1, U32_MAX), \ -+ BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ -+ NULL, "Delay in milliseconds before automatic journal commits")\ -+ x(journal_flush_disabled, u8, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ -+ NULL, "Disable journal flush on sync/fsync\n" \ -+ "If enabled, writes can be lost, but only since the\n"\ -+ "last journal write (default 1 second)") \ -+ x(journal_reclaim_delay, u32, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(0, U32_MAX), \ -+ BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ -+ NULL, "Delay in milliseconds before automatic journal reclaim")\ -+ x(move_bytes_in_flight, u32, \ -+ 
OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(1024, U32_MAX), \ -+ BCH2_NO_SB_OPT, 1U << 20, \ -+ NULL, "Maximum Amount of IO to keep in flight by the move path")\ -+ x(move_ios_in_flight, u32, \ -+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(1, 1024), \ -+ BCH2_NO_SB_OPT, 32, \ -+ NULL, "Maximum number of IOs to keep in flight by the move path")\ -+ x(fsck, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Run fsck on mount") \ -+ x(fix_errors, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_FN(bch2_opt_fix_errors), \ -+ BCH2_NO_SB_OPT, FSCK_FIX_exit, \ -+ NULL, "Fix errors during fsck without asking") \ -+ x(ratelimit_errors, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ -+ NULL, "Ratelimit error messages during fsck") \ -+ x(nochanges, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Super read only mode - no writes at all will be issued,\n"\ -+ "even if we have to replay the journal") \ -+ x(norecovery, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Don't replay the journal") \ -+ x(keep_journal, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Don't free journal entries/keys after startup")\ -+ x(read_entire_journal, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Read all journal entries, not just dirty ones")\ -+ x(read_journal_only, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Only read the journal, skip the rest of recovery")\ -+ x(journal_transaction_names, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ -+ NULL, "Log transaction function names in journal") \ -+ x(noexcl, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Don't open device in exclusive mode") \ -+ x(direct_io, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, true, \ -+ NULL, "Use O_DIRECT (userspace only)") \ -+ x(sb, u64, \ -+ OPT_MOUNT, \ -+ OPT_UINT(0, S64_MAX), \ -+ BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ -+ "offset", "Sector offset of superblock") \ -+ x(read_only, u8, \ -+ OPT_FS, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, NULL) \ -+ x(nostart, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Don\'t start filesystem, only open devices") \ -+ x(reconstruct_alloc, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Reconstruct alloc btree") \ -+ x(version_upgrade, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_STR(bch2_version_upgrade_opts), \ -+ BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ -+ NULL, "Set superblock to latest version,\n" \ -+ "allowing any new features to be used") \ -+ x(buckets_nouse, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Allocate the buckets_nouse bitmap") \ -+ x(project, u8, \ -+ OPT_INODE, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, NULL) \ -+ x(nocow, u8, \ -+ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_BOOL(), \ -+ BCH_SB_NOCOW, false, \ -+ NULL, "Nocow mode: Writes will be done in place when possible.\n"\ -+ "Snapshots and reflink will still caused writes to be COW\n"\ -+ "Implicitly disables data checksumming, compression and encryption")\ -+ x(nocow_enabled, u8, \ -+ OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, true, \ -+ NULL, "Enable nocow mode: enables runtime locking in\n"\ -+ "data move path needed if nocow will ever be 
in use\n")\ -+ x(no_data_io, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ -+ NULL, "Skip submit_bio() for data reads and writes, " \ -+ "for performance testing purposes") \ -+ x(fs_size, u64, \ -+ OPT_DEVICE, \ -+ OPT_UINT(0, S64_MAX), \ -+ BCH2_NO_SB_OPT, 0, \ -+ "size", "Size of filesystem on device") \ -+ x(bucket, u32, \ -+ OPT_DEVICE, \ -+ OPT_UINT(0, S64_MAX), \ -+ BCH2_NO_SB_OPT, 0, \ -+ "size", "Size of filesystem on device") \ -+ x(durability, u8, \ -+ OPT_DEVICE, \ -+ OPT_UINT(0, BCH_REPLICAS_MAX), \ -+ BCH2_NO_SB_OPT, 1, \ -+ "n", "Data written to this device will be considered\n"\ -+ "to have already been replicated n times") -+ -+struct bch_opts { -+#define x(_name, _bits, ...) unsigned _name##_defined:1; -+ BCH_OPTS() -+#undef x -+ -+#define x(_name, _bits, ...) _bits _name; -+ BCH_OPTS() -+#undef x -+}; -+ -+static const struct bch_opts bch2_opts_default = { -+#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ -+ ._name##_defined = true, \ -+ ._name = _default, \ -+ -+ BCH_OPTS() -+#undef x -+}; -+ -+#define opt_defined(_opts, _name) ((_opts)._name##_defined) -+ -+#define opt_get(_opts, _name) \ -+ (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) -+ -+#define opt_set(_opts, _name, _v) \ -+do { \ -+ (_opts)._name##_defined = true; \ -+ (_opts)._name = _v; \ -+} while (0) -+ -+static inline struct bch_opts bch2_opts_empty(void) -+{ -+ return (struct bch_opts) { 0 }; -+} -+ -+void bch2_opts_apply(struct bch_opts *, struct bch_opts); -+ -+enum bch_opt_id { -+#define x(_name, ...) Opt_##_name, -+ BCH_OPTS() -+#undef x -+ bch2_opts_nr -+}; -+ -+struct bch_fs; -+struct printbuf; -+ -+struct bch_option { -+ struct attribute attr; -+ u64 (*get_sb)(const struct bch_sb *); -+ void (*set_sb)(struct bch_sb *, u64); -+ enum opt_type type; -+ enum opt_flags flags; -+ u64 min, max; -+ -+ const char * const *choices; -+ -+ struct bch_opt_fn fn; -+ -+ const char *hint; -+ const char *help; -+ -+}; -+ -+extern const struct bch_option bch2_opt_table[]; -+ -+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); -+u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); -+void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); -+ -+u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); -+int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); -+void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); -+void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); -+ -+int bch2_opt_lookup(const char *); -+int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); -+int bch2_opt_parse(struct bch_fs *, const struct bch_option *, -+ const char *, u64 *, struct printbuf *); -+ -+#define OPT_SHOW_FULL_LIST (1 << 0) -+#define OPT_SHOW_MOUNT_STYLE (1 << 1) -+ -+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, -+ const struct bch_option *, u64, unsigned); -+ -+int bch2_opt_check_may_set(struct bch_fs *, int, u64); -+int bch2_opts_check_may_set(struct bch_fs *); -+int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); -+ -+/* inode opts: */ -+ -+struct bch_io_opts { -+#define x(_name, _bits) u##_bits _name; -+ BCH_INODE_OPTS() -+#undef x -+}; -+ -+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -+bool bch2_opt_is_inode_opt(enum bch_opt_id); -+ -+#endif /* _BCACHEFS_OPTS_H */ -diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c -new file mode 100644 -index 000000000..c41daa180 ---- /dev/null -+++ 
b/fs/bcachefs/printbuf.c -@@ -0,0 +1,415 @@ -+// SPDX-License-Identifier: LGPL-2.1+ -+/* Copyright (C) 2022 Kent Overstreet */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include "printbuf.h" -+ -+static inline unsigned printbuf_linelen(struct printbuf *buf) -+{ -+ return buf->pos - buf->last_newline; -+} -+ -+int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) -+{ -+ unsigned new_size; -+ char *buf; -+ -+ if (!out->heap_allocated) -+ return 0; -+ -+ /* Reserved space for terminating nul: */ -+ extra += 1; -+ -+ if (out->pos + extra < out->size) -+ return 0; -+ -+ new_size = roundup_pow_of_two(out->size + extra); -+ -+ /* -+ * Note: output buffer must be freeable with kfree(), it's not required -+ * that the user use printbuf_exit(). -+ */ -+ buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); -+ -+ if (!buf) { -+ out->allocation_failure = true; -+ return -ENOMEM; -+ } -+ -+ out->buf = buf; -+ out->size = new_size; -+ return 0; -+} -+ -+void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) -+{ -+ int len; -+ -+ do { -+ va_list args2; -+ -+ va_copy(args2, args); -+ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); -+ } while (len + 1 >= printbuf_remaining(out) && -+ !bch2_printbuf_make_room(out, len + 1)); -+ -+ len = min_t(size_t, len, -+ printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); -+ out->pos += len; -+} -+ -+void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) -+{ -+ va_list args; -+ int len; -+ -+ do { -+ va_start(args, fmt); -+ len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); -+ va_end(args); -+ } while (len + 1 >= printbuf_remaining(out) && -+ !bch2_printbuf_make_room(out, len + 1)); -+ -+ len = min_t(size_t, len, -+ printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); -+ out->pos += len; -+} -+ -+/** -+ * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null -+ * terminated -+ */ -+const char *bch2_printbuf_str(const struct printbuf *buf) -+{ -+ /* -+ * If we've written to a printbuf then it's guaranteed to be a null -+ * terminated string - but if we haven't, then we might not have -+ * allocated a buffer at all: -+ */ -+ return buf->pos -+ ? buf->buf -+ : ""; -+} -+ -+/** -+ * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it -+ * against accidental use. -+ */ -+void bch2_printbuf_exit(struct printbuf *buf) -+{ -+ if (buf->heap_allocated) { -+ kfree(buf->buf); -+ buf->buf = ERR_PTR(-EINTR); /* poison value */ -+ } -+} -+ -+void bch2_printbuf_tabstops_reset(struct printbuf *buf) -+{ -+ buf->nr_tabstops = 0; -+} -+ -+void bch2_printbuf_tabstop_pop(struct printbuf *buf) -+{ -+ if (buf->nr_tabstops) -+ --buf->nr_tabstops; -+} -+ -+/* -+ * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop -+ * -+ * @buf: printbuf to control -+ * @spaces: number of spaces from previous tabpstop -+ * -+ * In the future this function may allocate memory if setting more than -+ * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start -+ * of line. -+ */ -+int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) -+{ -+ unsigned prev_tabstop = buf->nr_tabstops -+ ? 
buf->_tabstops[buf->nr_tabstops - 1] -+ : 0; -+ -+ if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops))) -+ return -EINVAL; -+ -+ buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces; -+ buf->has_indent_or_tabstops = true; -+ return 0; -+} -+ -+/** -+ * printbuf_indent_add - add to the current indent level -+ * -+ * @buf: printbuf to control -+ * @spaces: number of spaces to add to the current indent level -+ * -+ * Subsequent lines, and the current line if the output position is at the start -+ * of the current line, will be indented by @spaces more spaces. -+ */ -+void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) -+{ -+ if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) -+ spaces = 0; -+ -+ buf->indent += spaces; -+ prt_chars(buf, ' ', spaces); -+ -+ buf->has_indent_or_tabstops = true; -+} -+ -+/** -+ * printbuf_indent_sub - subtract from the current indent level -+ * -+ * @buf: printbuf to control -+ * @spaces: number of spaces to subtract from the current indent level -+ * -+ * Subsequent lines, and the current line if the output position is at the start -+ * of the current line, will be indented by @spaces less spaces. -+ */ -+void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) -+{ -+ if (WARN_ON_ONCE(spaces > buf->indent)) -+ spaces = buf->indent; -+ -+ if (buf->last_newline + buf->indent == buf->pos) { -+ buf->pos -= spaces; -+ printbuf_nul_terminate(buf); -+ } -+ buf->indent -= spaces; -+ -+ if (!buf->indent && !buf->nr_tabstops) -+ buf->has_indent_or_tabstops = false; -+} -+ -+void bch2_prt_newline(struct printbuf *buf) -+{ -+ unsigned i; -+ -+ bch2_printbuf_make_room(buf, 1 + buf->indent); -+ -+ __prt_char(buf, '\n'); -+ -+ buf->last_newline = buf->pos; -+ -+ for (i = 0; i < buf->indent; i++) -+ __prt_char(buf, ' '); -+ -+ printbuf_nul_terminate(buf); -+ -+ buf->last_field = buf->pos; -+ buf->cur_tabstop = 0; -+} -+ -+/* -+ * Returns spaces from start of line, if set, or 0 if unset: -+ */ -+static inline unsigned cur_tabstop(struct printbuf *buf) -+{ -+ return buf->cur_tabstop < buf->nr_tabstops -+ ? buf->_tabstops[buf->cur_tabstop] -+ : 0; -+} -+ -+static void __prt_tab(struct printbuf *out) -+{ -+ int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); -+ -+ prt_chars(out, ' ', spaces); -+ -+ out->last_field = out->pos; -+ out->cur_tabstop++; -+} -+ -+/** -+ * prt_tab - Advance printbuf to the next tabstop -+ * -+ * @buf: printbuf to control -+ * -+ * Advance output to the next tabstop by printing spaces. 
-+ */ -+void bch2_prt_tab(struct printbuf *out) -+{ -+ if (WARN_ON(!cur_tabstop(out))) -+ return; -+ -+ __prt_tab(out); -+} -+ -+static void __prt_tab_rjust(struct printbuf *buf) -+{ -+ unsigned move = buf->pos - buf->last_field; -+ int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); -+ -+ if (pad > 0) { -+ bch2_printbuf_make_room(buf, pad); -+ -+ if (buf->last_field + pad < buf->size) -+ memmove(buf->buf + buf->last_field + pad, -+ buf->buf + buf->last_field, -+ min(move, buf->size - 1 - buf->last_field - pad)); -+ -+ if (buf->last_field < buf->size) -+ memset(buf->buf + buf->last_field, ' ', -+ min((unsigned) pad, buf->size - buf->last_field)); -+ -+ buf->pos += pad; -+ printbuf_nul_terminate(buf); -+ } -+ -+ buf->last_field = buf->pos; -+ buf->cur_tabstop++; -+} -+ -+/** -+ * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying -+ * previous output -+ * -+ * @buf: printbuf to control -+ * -+ * Advance output to the next tabstop by inserting spaces immediately after the -+ * previous tabstop, right justifying previously outputted text. -+ */ -+void bch2_prt_tab_rjust(struct printbuf *buf) -+{ -+ if (WARN_ON(!cur_tabstop(buf))) -+ return; -+ -+ __prt_tab_rjust(buf); -+} -+ -+/** -+ * prt_bytes_indented - Print an array of chars, handling embedded control characters -+ * -+ * @out: printbuf to output to -+ * @str: string to print -+ * @count: number of bytes to print -+ * -+ * The following contol characters are handled as so: -+ * \n: prt_newline newline that obeys current indent level -+ * \t: prt_tab advance to next tabstop -+ * \r: prt_tab_rjust advance to next tabstop, with right justification -+ */ -+void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) -+{ -+ const char *unprinted_start = str; -+ const char *end = str + count; -+ -+ if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) { -+ prt_bytes(out, str, count); -+ return; -+ } -+ -+ while (str != end) { -+ switch (*str) { -+ case '\n': -+ prt_bytes(out, unprinted_start, str - unprinted_start); -+ unprinted_start = str + 1; -+ bch2_prt_newline(out); -+ break; -+ case '\t': -+ if (likely(cur_tabstop(out))) { -+ prt_bytes(out, unprinted_start, str - unprinted_start); -+ unprinted_start = str + 1; -+ __prt_tab(out); -+ } -+ break; -+ case '\r': -+ if (likely(cur_tabstop(out))) { -+ prt_bytes(out, unprinted_start, str - unprinted_start); -+ unprinted_start = str + 1; -+ __prt_tab_rjust(out); -+ } -+ break; -+ } -+ -+ str++; -+ } -+ -+ prt_bytes(out, unprinted_start, str - unprinted_start); -+} -+ -+/** -+ * prt_human_readable_u64 - Print out a u64 in human readable units -+ * -+ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units -+ */ -+void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v) -+{ -+ bch2_printbuf_make_room(buf, 10); -+ buf->pos += string_get_size(v, 1, !buf->si_units, -+ buf->buf + buf->pos, -+ printbuf_remaining_size(buf)); -+} -+ -+/** -+ * prt_human_readable_s64 - Print out a s64 in human readable units -+ * -+ * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units -+ */ -+void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v) -+{ -+ if (v < 0) -+ prt_char(buf, '-'); -+ bch2_prt_human_readable_u64(buf, abs(v)); -+} -+ -+/** -+ * prt_units_u64 - Print out a u64 according to printbuf unit options -+ * -+ * Units are either raw (default), or human reabable units (controlled via -+ * @buf->human_readable_units) -+ */ -+void bch2_prt_units_u64(struct printbuf *out, u64 v) -+{ -+ if 
(out->human_readable_units) -+ bch2_prt_human_readable_u64(out, v); -+ else -+ bch2_prt_printf(out, "%llu", v); -+} -+ -+/** -+ * prt_units_s64 - Print out a s64 according to printbuf unit options -+ * -+ * Units are either raw (default), or human reabable units (controlled via -+ * @buf->human_readable_units) -+ */ -+void bch2_prt_units_s64(struct printbuf *out, s64 v) -+{ -+ if (v < 0) -+ prt_char(out, '-'); -+ bch2_prt_units_u64(out, abs(v)); -+} -+ -+void bch2_prt_string_option(struct printbuf *out, -+ const char * const list[], -+ size_t selected) -+{ -+ size_t i; -+ -+ for (i = 0; list[i]; i++) -+ bch2_prt_printf(out, i == selected ? "[%s] " : "%s ", list[i]); -+} -+ -+void bch2_prt_bitflags(struct printbuf *out, -+ const char * const list[], u64 flags) -+{ -+ unsigned bit, nr = 0; -+ bool first = true; -+ -+ while (list[nr]) -+ nr++; -+ -+ while (flags && (bit = __ffs(flags)) < nr) { -+ if (!first) -+ bch2_prt_printf(out, ","); -+ first = false; -+ bch2_prt_printf(out, "%s", list[bit]); -+ flags ^= 1 << bit; -+ } -+} -diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h -new file mode 100644 -index 000000000..2191423d9 ---- /dev/null -+++ b/fs/bcachefs/printbuf.h -@@ -0,0 +1,284 @@ -+/* SPDX-License-Identifier: LGPL-2.1+ */ -+/* Copyright (C) 2022 Kent Overstreet */ -+ -+#ifndef _BCACHEFS_PRINTBUF_H -+#define _BCACHEFS_PRINTBUF_H -+ -+/* -+ * Printbufs: Simple strings for printing to, with optional heap allocation -+ * -+ * This code has provisions for use in userspace, to aid in making other code -+ * portable between kernelspace and userspace. -+ * -+ * Basic example: -+ * struct printbuf buf = PRINTBUF; -+ * -+ * prt_printf(&buf, "foo="); -+ * foo_to_text(&buf, foo); -+ * printk("%s", buf.buf); -+ * printbuf_exit(&buf); -+ * -+ * Or -+ * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size) -+ * -+ * We can now write pretty printers instead of writing code that dumps -+ * everything to the kernel log buffer, and then those pretty-printers can be -+ * used by other code that outputs to kernel log, sysfs, debugfs, etc. -+ * -+ * Memory allocation: Outputing to a printbuf may allocate memory. This -+ * allocation is done with GFP_KERNEL, by default: use the newer -+ * memalloc_*_(save|restore) functions as needed. -+ * -+ * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations -+ * will be done with GFP_NOWAIT if printbuf->atomic is nonzero. -+ * -+ * It's allowed to grab the output buffer and free it later with kfree() instead -+ * of using printbuf_exit(), if the user just needs a heap allocated string at -+ * the end. -+ * -+ * Memory allocation failures: We don't return errors directly, because on -+ * memory allocation failure we usually don't want to bail out and unwind - we -+ * want to print what we've got, on a best-effort basis. But code that does want -+ * to return -ENOMEM may check printbuf.allocation_failure. -+ * -+ * Indenting, tabstops: -+ * -+ * To aid is writing multi-line pretty printers spread across multiple -+ * functions, printbufs track the current indent level. -+ * -+ * printbuf_indent_push() and printbuf_indent_pop() increase and decrease the current indent -+ * level, respectively. -+ * -+ * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from -+ * start of line. Once set, prt_tab() will output spaces up to the next tabstop. 
-+ * prt_tab_rjust() will also advance the current line of text up to the next -+ * tabstop, but it does so by shifting text since the previous tabstop up to the -+ * next tabstop - right justifying it. -+ * -+ * Make sure you use prt_newline() instead of \n in the format string for indent -+ * level and tabstops to work corretly. -+ * -+ * Output units: printbuf->units exists to tell pretty-printers how to output -+ * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as -+ * human readable bytes. prt_units() obeys it. -+ */ -+ -+#include -+#include -+ -+enum printbuf_si { -+ PRINTBUF_UNITS_2, /* use binary powers of 2^10 */ -+ PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ -+}; -+ -+#define PRINTBUF_INLINE_TABSTOPS 6 -+ -+struct printbuf { -+ char *buf; -+ unsigned size; -+ unsigned pos; -+ unsigned last_newline; -+ unsigned last_field; -+ unsigned indent; -+ /* -+ * If nonzero, allocations will be done with GFP_ATOMIC: -+ */ -+ u8 atomic; -+ bool allocation_failure:1; -+ bool heap_allocated:1; -+ enum printbuf_si si_units:1; -+ bool human_readable_units:1; -+ bool has_indent_or_tabstops:1; -+ bool suppress_indent_tabstop_handling:1; -+ u8 nr_tabstops; -+ -+ /* -+ * Do not modify directly: use printbuf_tabstop_add(), -+ * printbuf_tabstop_get() -+ */ -+ u8 cur_tabstop; -+ u8 _tabstops[PRINTBUF_INLINE_TABSTOPS]; -+}; -+ -+int bch2_printbuf_make_room(struct printbuf *, unsigned); -+__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...); -+__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list); -+const char *bch2_printbuf_str(const struct printbuf *); -+void bch2_printbuf_exit(struct printbuf *); -+ -+void bch2_printbuf_tabstops_reset(struct printbuf *); -+void bch2_printbuf_tabstop_pop(struct printbuf *); -+int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); -+ -+void bch2_printbuf_indent_add(struct printbuf *, unsigned); -+void bch2_printbuf_indent_sub(struct printbuf *, unsigned); -+ -+void bch2_prt_newline(struct printbuf *); -+void bch2_prt_tab(struct printbuf *); -+void bch2_prt_tab_rjust(struct printbuf *); -+ -+void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned); -+void bch2_prt_human_readable_u64(struct printbuf *, u64); -+void bch2_prt_human_readable_s64(struct printbuf *, s64); -+void bch2_prt_units_u64(struct printbuf *, u64); -+void bch2_prt_units_s64(struct printbuf *, s64); -+void bch2_prt_string_option(struct printbuf *, const char * const[], size_t); -+void bch2_prt_bitflags(struct printbuf *, const char * const[], u64); -+ -+/* Initializer for a heap allocated printbuf: */ -+#define PRINTBUF ((struct printbuf) { .heap_allocated = true }) -+ -+/* Initializer a printbuf that points to an external buffer: */ -+#define PRINTBUF_EXTERN(_buf, _size) \ -+((struct printbuf) { \ -+ .buf = _buf, \ -+ .size = _size, \ -+}) -+ -+/* -+ * Returns size remaining of output buffer: -+ */ -+static inline unsigned printbuf_remaining_size(struct printbuf *out) -+{ -+ return out->pos < out->size ? out->size - out->pos : 0; -+} -+ -+/* -+ * Returns number of characters we can print to the output buffer - i.e. -+ * excluding the terminating nul: -+ */ -+static inline unsigned printbuf_remaining(struct printbuf *out) -+{ -+ return out->pos < out->size ? out->size - out->pos - 1 : 0; -+} -+ -+static inline unsigned printbuf_written(struct printbuf *out) -+{ -+ return out->size ? 
min(out->pos, out->size - 1) : 0; -+} -+ -+/* -+ * Returns true if output was truncated: -+ */ -+static inline bool printbuf_overflowed(struct printbuf *out) -+{ -+ return out->pos >= out->size; -+} -+ -+static inline void printbuf_nul_terminate(struct printbuf *out) -+{ -+ bch2_printbuf_make_room(out, 1); -+ -+ if (out->pos < out->size) -+ out->buf[out->pos] = 0; -+ else if (out->size) -+ out->buf[out->size - 1] = 0; -+} -+ -+/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ -+static inline void __prt_char_reserved(struct printbuf *out, char c) -+{ -+ if (printbuf_remaining(out)) -+ out->buf[out->pos] = c; -+ out->pos++; -+} -+ -+/* Doesn't nul terminate: */ -+static inline void __prt_char(struct printbuf *out, char c) -+{ -+ bch2_printbuf_make_room(out, 1); -+ __prt_char_reserved(out, c); -+} -+ -+static inline void prt_char(struct printbuf *out, char c) -+{ -+ __prt_char(out, c); -+ printbuf_nul_terminate(out); -+} -+ -+static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) -+{ -+ unsigned i, can_print = min(n, printbuf_remaining(out)); -+ -+ for (i = 0; i < can_print; i++) -+ out->buf[out->pos++] = c; -+ out->pos += n - can_print; -+} -+ -+static inline void prt_chars(struct printbuf *out, char c, unsigned n) -+{ -+ bch2_printbuf_make_room(out, n); -+ __prt_chars_reserved(out, c, n); -+ printbuf_nul_terminate(out); -+} -+ -+static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) -+{ -+ unsigned i, can_print; -+ -+ bch2_printbuf_make_room(out, n); -+ -+ can_print = min(n, printbuf_remaining(out)); -+ -+ for (i = 0; i < can_print; i++) -+ out->buf[out->pos++] = ((char *) b)[i]; -+ out->pos += n - can_print; -+ -+ printbuf_nul_terminate(out); -+} -+ -+static inline void prt_str(struct printbuf *out, const char *str) -+{ -+ prt_bytes(out, str, strlen(str)); -+} -+ -+static inline void prt_str_indented(struct printbuf *out, const char *str) -+{ -+ bch2_prt_bytes_indented(out, str, strlen(str)); -+} -+ -+static inline void prt_hex_byte(struct printbuf *out, u8 byte) -+{ -+ bch2_printbuf_make_room(out, 2); -+ __prt_char_reserved(out, hex_asc_hi(byte)); -+ __prt_char_reserved(out, hex_asc_lo(byte)); -+ printbuf_nul_terminate(out); -+} -+ -+static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) -+{ -+ bch2_printbuf_make_room(out, 2); -+ __prt_char_reserved(out, hex_asc_upper_hi(byte)); -+ __prt_char_reserved(out, hex_asc_upper_lo(byte)); -+ printbuf_nul_terminate(out); -+} -+ -+/** -+ * printbuf_reset - re-use a printbuf without freeing and re-initializing it: -+ */ -+static inline void printbuf_reset(struct printbuf *buf) -+{ -+ buf->pos = 0; -+ buf->allocation_failure = 0; -+ buf->indent = 0; -+ buf->nr_tabstops = 0; -+ buf->cur_tabstop = 0; -+} -+ -+/** -+ * printbuf_atomic_inc - mark as entering an atomic section -+ */ -+static inline void printbuf_atomic_inc(struct printbuf *buf) -+{ -+ buf->atomic++; -+} -+ -+/** -+ * printbuf_atomic_inc - mark as leaving an atomic section -+ */ -+static inline void printbuf_atomic_dec(struct printbuf *buf) -+{ -+ buf->atomic--; -+} -+ -+#endif /* _BCACHEFS_PRINTBUF_H */ -diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c -new file mode 100644 -index 000000000..ca99772ae ---- /dev/null -+++ b/fs/bcachefs/quota.c -@@ -0,0 +1,981 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "errcode.h" -+#include "error.h" -+#include "inode.h" -+#include "quota.h" -+#include "snapshot.h" -+#include "super-io.h" -+ -+static const 
char * const bch2_quota_types[] = { -+ "user", -+ "group", -+ "project", -+}; -+ -+static const char * const bch2_quota_counters[] = { -+ "space", -+ "inodes", -+}; -+ -+static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_quota *q = field_to_type(f, quota); -+ -+ if (vstruct_bytes(&q->field) < sizeof(*q)) { -+ prt_printf(err, "wrong size (got %zu should be %zu)", -+ vstruct_bytes(&q->field), sizeof(*q)); -+ return -BCH_ERR_invalid_sb_quota; -+ } -+ -+ return 0; -+} -+ -+static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_quota *q = field_to_type(f, quota); -+ unsigned qtyp, counter; -+ -+ for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { -+ prt_printf(out, "%s: flags %llx", -+ bch2_quota_types[qtyp], -+ le64_to_cpu(q->q[qtyp].flags)); -+ -+ for (counter = 0; counter < Q_COUNTERS; counter++) -+ prt_printf(out, " %s timelimit %u warnlimit %u", -+ bch2_quota_counters[counter], -+ le32_to_cpu(q->q[qtyp].c[counter].timelimit), -+ le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); -+ -+ prt_newline(out); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_quota = { -+ .validate = bch2_sb_quota_validate, -+ .to_text = bch2_sb_quota_to_text, -+}; -+ -+int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ if (k.k->p.inode >= QTYP_NR) { -+ prt_printf(err, "invalid quota type (%llu >= %u)", -+ k.k->p.inode, QTYP_NR); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); -+ unsigned i; -+ -+ for (i = 0; i < Q_COUNTERS; i++) -+ prt_printf(out, "%s hardlimit %llu softlimit %llu", -+ bch2_quota_counters[i], -+ le64_to_cpu(dq.v->c[i].hardlimit), -+ le64_to_cpu(dq.v->c[i].softlimit)); -+} -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+#include -+#include -+#include -+ -+static void qc_info_to_text(struct printbuf *out, struct qc_info *i) -+{ -+ printbuf_tabstops_reset(out); -+ printbuf_tabstop_push(out, 20); -+ -+ prt_str(out, "i_fieldmask"); -+ prt_tab(out); -+ prt_printf(out, "%x", i->i_fieldmask); -+ prt_newline(out); -+ -+ prt_str(out, "i_flags"); -+ prt_tab(out); -+ prt_printf(out, "%u", i->i_flags); -+ prt_newline(out); -+ -+ prt_str(out, "i_spc_timelimit"); -+ prt_tab(out); -+ prt_printf(out, "%u", i->i_spc_timelimit); -+ prt_newline(out); -+ -+ prt_str(out, "i_ino_timelimit"); -+ prt_tab(out); -+ prt_printf(out, "%u", i->i_ino_timelimit); -+ prt_newline(out); -+ -+ prt_str(out, "i_rt_spc_timelimit"); -+ prt_tab(out); -+ prt_printf(out, "%u", i->i_rt_spc_timelimit); -+ prt_newline(out); -+ -+ prt_str(out, "i_spc_warnlimit"); -+ prt_tab(out); -+ prt_printf(out, "%u", i->i_spc_warnlimit); -+ prt_newline(out); -+ -+ prt_str(out, "i_ino_warnlimit"); -+ prt_tab(out); -+ prt_printf(out, "%u", i->i_ino_warnlimit); -+ prt_newline(out); -+ -+ prt_str(out, "i_rt_spc_warnlimit"); -+ prt_tab(out); -+ prt_printf(out, "%u", i->i_rt_spc_warnlimit); -+ prt_newline(out); -+} -+ -+static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) -+{ -+ printbuf_tabstops_reset(out); -+ printbuf_tabstop_push(out, 20); -+ -+ prt_str(out, "d_fieldmask"); -+ prt_tab(out); -+ prt_printf(out, "%x", q->d_fieldmask); -+ prt_newline(out); -+ -+ prt_str(out, "d_spc_hardlimit"); -+ prt_tab(out); -+ prt_printf(out, "%llu", q->d_spc_hardlimit); -+ 
prt_newline(out); -+ -+ prt_str(out, "d_spc_softlimit"); -+ prt_tab(out); -+ prt_printf(out, "%llu", q->d_spc_softlimit); -+ prt_newline(out); -+ -+ prt_str(out, "d_ino_hardlimit"); -+ prt_tab(out); -+ prt_printf(out, "%llu", q->d_ino_hardlimit); -+ prt_newline(out); -+ -+ prt_str(out, "d_ino_softlimit"); -+ prt_tab(out); -+ prt_printf(out, "%llu", q->d_ino_softlimit); -+ prt_newline(out); -+ -+ prt_str(out, "d_space"); -+ prt_tab(out); -+ prt_printf(out, "%llu", q->d_space); -+ prt_newline(out); -+ -+ prt_str(out, "d_ino_count"); -+ prt_tab(out); -+ prt_printf(out, "%llu", q->d_ino_count); -+ prt_newline(out); -+ -+ prt_str(out, "d_ino_timer"); -+ prt_tab(out); -+ prt_printf(out, "%llu", q->d_ino_timer); -+ prt_newline(out); -+ -+ prt_str(out, "d_spc_timer"); -+ prt_tab(out); -+ prt_printf(out, "%llu", q->d_spc_timer); -+ prt_newline(out); -+ -+ prt_str(out, "d_ino_warns"); -+ prt_tab(out); -+ prt_printf(out, "%i", q->d_ino_warns); -+ prt_newline(out); -+ -+ prt_str(out, "d_spc_warns"); -+ prt_tab(out); -+ prt_printf(out, "%i", q->d_spc_warns); -+ prt_newline(out); -+} -+ -+static inline unsigned __next_qtype(unsigned i, unsigned qtypes) -+{ -+ qtypes >>= i; -+ return qtypes ? i + __ffs(qtypes) : QTYP_NR; -+} -+ -+#define for_each_set_qtype(_c, _i, _q, _qtypes) \ -+ for (_i = 0; \ -+ (_i = __next_qtype(_i, _qtypes), \ -+ _q = &(_c)->quotas[_i], \ -+ _i < QTYP_NR); \ -+ _i++) -+ -+static bool ignore_hardlimit(struct bch_memquota_type *q) -+{ -+ if (capable(CAP_SYS_RESOURCE)) -+ return true; -+#if 0 -+ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; -+ -+ return capable(CAP_SYS_RESOURCE) && -+ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || -+ !(info->dqi_flags & DQF_ROOT_SQUASH)); -+#endif -+ return false; -+} -+ -+enum quota_msg { -+ SOFTWARN, /* Softlimit reached */ -+ SOFTLONGWARN, /* Grace time expired */ -+ HARDWARN, /* Hardlimit reached */ -+ -+ HARDBELOW, /* Usage got below inode hardlimit */ -+ SOFTBELOW, /* Usage got below inode softlimit */ -+}; -+ -+static int quota_nl[][Q_COUNTERS] = { -+ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, -+ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, -+ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, -+ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, -+ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, -+ -+ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, -+ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, -+ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, -+ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, -+ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, -+}; -+ -+struct quota_msgs { -+ u8 nr; -+ struct { -+ u8 qtype; -+ u8 msg; -+ } m[QTYP_NR * Q_COUNTERS]; -+}; -+ -+static void prepare_msg(unsigned qtype, -+ enum quota_counters counter, -+ struct quota_msgs *msgs, -+ enum quota_msg msg_type) -+{ -+ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); -+ -+ msgs->m[msgs->nr].qtype = qtype; -+ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; -+ msgs->nr++; -+} -+ -+static void prepare_warning(struct memquota_counter *qc, -+ unsigned qtype, -+ enum quota_counters counter, -+ struct quota_msgs *msgs, -+ enum quota_msg msg_type) -+{ -+ if (qc->warning_issued & (1 << msg_type)) -+ return; -+ -+ prepare_msg(qtype, counter, msgs, msg_type); -+} -+ -+static void flush_warnings(struct bch_qid qid, -+ struct super_block *sb, -+ struct quota_msgs *msgs) -+{ -+ unsigned i; -+ -+ for (i = 0; i < msgs->nr; i++) -+ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), -+ sb->s_dev, msgs->m[i].msg); -+} -+ -+static int bch2_quota_check_limit(struct bch_fs *c, -+ unsigned 
qtype, -+ struct bch_memquota *mq, -+ struct quota_msgs *msgs, -+ enum quota_counters counter, -+ s64 v, -+ enum quota_acct_mode mode) -+{ -+ struct bch_memquota_type *q = &c->quotas[qtype]; -+ struct memquota_counter *qc = &mq->c[counter]; -+ u64 n = qc->v + v; -+ -+ BUG_ON((s64) n < 0); -+ -+ if (mode == KEY_TYPE_QUOTA_NOCHECK) -+ return 0; -+ -+ if (v <= 0) { -+ if (n < qc->hardlimit && -+ (qc->warning_issued & (1 << HARDWARN))) { -+ qc->warning_issued &= ~(1 << HARDWARN); -+ prepare_msg(qtype, counter, msgs, HARDBELOW); -+ } -+ -+ if (n < qc->softlimit && -+ (qc->warning_issued & (1 << SOFTWARN))) { -+ qc->warning_issued &= ~(1 << SOFTWARN); -+ prepare_msg(qtype, counter, msgs, SOFTBELOW); -+ } -+ -+ qc->warning_issued = 0; -+ return 0; -+ } -+ -+ if (qc->hardlimit && -+ qc->hardlimit < n && -+ !ignore_hardlimit(q)) { -+ prepare_warning(qc, qtype, counter, msgs, HARDWARN); -+ return -EDQUOT; -+ } -+ -+ if (qc->softlimit && -+ qc->softlimit < n) { -+ if (qc->timer == 0) { -+ qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit; -+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); -+ } else if (ktime_get_real_seconds() >= qc->timer && -+ !ignore_hardlimit(q)) { -+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); -+ return -EDQUOT; -+ } -+ } -+ -+ return 0; -+} -+ -+int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, -+ enum quota_counters counter, s64 v, -+ enum quota_acct_mode mode) -+{ -+ unsigned qtypes = enabled_qtypes(c); -+ struct bch_memquota_type *q; -+ struct bch_memquota *mq[QTYP_NR]; -+ struct quota_msgs msgs; -+ unsigned i; -+ int ret = 0; -+ -+ memset(&msgs, 0, sizeof(msgs)); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL); -+ if (!mq[i]) -+ return -ENOMEM; -+ } -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_lock_nested(&q->lock, i); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); -+ if (ret) -+ goto err; -+ } -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mq[i]->c[counter].v += v; -+err: -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_unlock(&q->lock); -+ -+ flush_warnings(qid, c->vfs_sb, &msgs); -+ -+ return ret; -+} -+ -+static void __bch2_quota_transfer(struct bch_memquota *src_q, -+ struct bch_memquota *dst_q, -+ enum quota_counters counter, s64 v) -+{ -+ BUG_ON(v > src_q->c[counter].v); -+ BUG_ON(v + dst_q->c[counter].v < v); -+ -+ src_q->c[counter].v -= v; -+ dst_q->c[counter].v += v; -+} -+ -+int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, -+ struct bch_qid dst, -+ struct bch_qid src, u64 space, -+ enum quota_acct_mode mode) -+{ -+ struct bch_memquota_type *q; -+ struct bch_memquota *src_q[3], *dst_q[3]; -+ struct quota_msgs msgs; -+ unsigned i; -+ int ret = 0; -+ -+ qtypes &= enabled_qtypes(c); -+ -+ memset(&msgs, 0, sizeof(msgs)); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL); -+ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL); -+ if (!src_q[i] || !dst_q[i]) -+ return -ENOMEM; -+ } -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_lock_nested(&q->lock, i); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, -+ dst_q[i]->c[Q_SPC].v + space, -+ mode); -+ if (ret) -+ goto err; -+ -+ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, -+ dst_q[i]->c[Q_INO].v + 1, -+ mode); -+ if (ret) -+ goto err; -+ } -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ 
__bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); -+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); -+ } -+ -+err: -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_unlock(&q->lock); -+ -+ flush_warnings(dst, c->vfs_sb, &msgs); -+ -+ return ret; -+} -+ -+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, -+ struct qc_dqblk *qdq) -+{ -+ struct bkey_s_c_quota dq; -+ struct bch_memquota_type *q; -+ struct bch_memquota *mq; -+ unsigned i; -+ -+ BUG_ON(k.k->p.inode >= QTYP_NR); -+ -+ if (!((1U << k.k->p.inode) & enabled_qtypes(c))) -+ return 0; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_quota: -+ dq = bkey_s_c_to_quota(k); -+ q = &c->quotas[k.k->p.inode]; -+ -+ mutex_lock(&q->lock); -+ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); -+ if (!mq) { -+ mutex_unlock(&q->lock); -+ return -ENOMEM; -+ } -+ -+ for (i = 0; i < Q_COUNTERS; i++) { -+ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); -+ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); -+ } -+ -+ if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) -+ mq->c[Q_SPC].timer = qdq->d_spc_timer; -+ if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) -+ mq->c[Q_SPC].warns = qdq->d_spc_warns; -+ if (qdq && qdq->d_fieldmask & QC_INO_TIMER) -+ mq->c[Q_INO].timer = qdq->d_ino_timer; -+ if (qdq && qdq->d_fieldmask & QC_INO_WARNS) -+ mq->c[Q_INO].warns = qdq->d_ino_warns; -+ -+ mutex_unlock(&q->lock); -+ } -+ -+ return 0; -+} -+ -+void bch2_fs_quota_exit(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) -+ genradix_free(&c->quotas[i].table); -+} -+ -+void bch2_fs_quota_init(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) -+ mutex_init(&c->quotas[i].lock); -+} -+ -+static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) -+{ -+ struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb); -+ -+ if (sb_quota) -+ return sb_quota; -+ -+ sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64)); -+ if (sb_quota) { -+ unsigned qtype, qc; -+ -+ for (qtype = 0; qtype < QTYP_NR; qtype++) -+ for (qc = 0; qc < Q_COUNTERS; qc++) -+ sb_quota->q[qtype].c[qc].timelimit = -+ cpu_to_le32(7 * 24 * 60 * 60); -+ } -+ -+ return sb_quota; -+} -+ -+static void bch2_sb_quota_read(struct bch_fs *c) -+{ -+ struct bch_sb_field_quota *sb_quota; -+ unsigned i, j; -+ -+ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); -+ if (!sb_quota) -+ return; -+ -+ for (i = 0; i < QTYP_NR; i++) { -+ struct bch_memquota_type *q = &c->quotas[i]; -+ -+ for (j = 0; j < Q_COUNTERS; j++) { -+ q->limits[j].timelimit = -+ le32_to_cpu(sb_quota->q[i].c[j].timelimit); -+ q->limits[j].warnlimit = -+ le32_to_cpu(sb_quota->q[i].c[j].warnlimit); -+ } -+ } -+} -+ -+static int bch2_fs_quota_read_inode(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_inode_unpacked u; -+ struct bch_snapshot_tree s_t; -+ int ret; -+ -+ ret = bch2_snapshot_tree_lookup(trans, -+ bch2_snapshot_tree(c, k.k->p.snapshot), &s_t); -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, -+ "%s: snapshot tree %u not found", __func__, -+ snapshot_t(c, k.k->p.snapshot)->tree); -+ if (ret) -+ return ret; -+ -+ if (!s_t.master_subvol) -+ goto advance; -+ -+ ret = bch2_inode_find_by_inum_trans(trans, -+ (subvol_inum) { -+ le32_to_cpu(s_t.master_subvol), -+ k.k->p.offset, -+ }, &u); -+ /* -+ * Inode might be deleted in this snapshot - the easiest way to handle -+ * that is to just skip it here: -+ */ -+ if 
(bch2_err_matches(ret, ENOENT)) -+ goto advance; -+ -+ if (ret) -+ return ret; -+ -+ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, -+ KEY_TYPE_QUOTA_NOCHECK); -+ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, -+ KEY_TYPE_QUOTA_NOCHECK); -+advance: -+ bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); -+ return 0; -+} -+ -+int bch2_fs_quota_read(struct bch_fs *c) -+{ -+ struct bch_sb_field_quota *sb_quota; -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); -+ if (!sb_quota) { -+ mutex_unlock(&c->sb_lock); -+ return -BCH_ERR_ENOSPC_sb_quota; -+ } -+ -+ bch2_sb_quota_read(c); -+ mutex_unlock(&c->sb_lock); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, -+ POS_MIN, BTREE_ITER_PREFETCH, k, -+ __bch2_quota_set(c, k, NULL)) ?: -+ for_each_btree_key2(&trans, iter, BTREE_ID_inodes, -+ POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, -+ bch2_fs_quota_read_inode(&trans, &iter, k)); -+ -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* Enable/disable/delete quotas for an entire filesystem: */ -+ -+static int bch2_quota_enable(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_sb_field_quota *sb_quota; -+ int ret = 0; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ /* Accounting must be enabled at mount time: */ -+ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) -+ return -EINVAL; -+ -+ /* Can't enable enforcement without accounting: */ -+ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) -+ return -EINVAL; -+ -+ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) -+ return -EINVAL; -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) -+ return -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); -+ if (!sb_quota) { -+ ret = -BCH_ERR_ENOSPC_sb_quota; -+ goto unlock; -+ } -+ -+ if (uflags & FS_QUOTA_UDQ_ENFD) -+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); -+ -+ if (uflags & FS_QUOTA_GDQ_ENFD) -+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD) -+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); -+ -+ bch2_write_super(c); -+unlock: -+ mutex_unlock(&c->sb_lock); -+ -+ return bch2_err_class(ret); -+} -+ -+static int bch2_quota_disable(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ mutex_lock(&c->sb_lock); -+ if (uflags & FS_QUOTA_UDQ_ENFD) -+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); -+ -+ if (uflags & FS_QUOTA_GDQ_ENFD) -+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD) -+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+static int bch2_quota_remove(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ int ret; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ if (uflags & FS_USER_QUOTA) { -+ if (c->opts.usrquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, -+ POS(QTYP_USR, 0), -+ POS(QTYP_USR, U64_MAX), -+ 0, NULL); -+ if (ret) -+ return ret; -+ } -+ -+ if (uflags & FS_GROUP_QUOTA) { -+ if (c->opts.grpquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, -+ POS(QTYP_GRP, 0), -+ POS(QTYP_GRP, U64_MAX), -+ 0, 
NULL); -+ if (ret) -+ return ret; -+ } -+ -+ if (uflags & FS_PROJ_QUOTA) { -+ if (c->opts.prjquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_quotas, -+ POS(QTYP_PRJ, 0), -+ POS(QTYP_PRJ, U64_MAX), -+ 0, NULL); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Return quota status information, such as enforcements, quota file inode -+ * numbers etc. -+ */ -+static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ unsigned qtypes = enabled_qtypes(c); -+ unsigned i; -+ -+ memset(state, 0, sizeof(*state)); -+ -+ for (i = 0; i < QTYP_NR; i++) { -+ state->s_state[i].flags |= QCI_SYSFILE; -+ -+ if (!(qtypes & (1 << i))) -+ continue; -+ -+ state->s_state[i].flags |= QCI_ACCT_ENABLED; -+ -+ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; -+ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; -+ -+ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; -+ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Adjust quota timers & warnings -+ */ -+static int bch2_quota_set_info(struct super_block *sb, int type, -+ struct qc_info *info) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_sb_field_quota *sb_quota; -+ struct bch_memquota_type *q; -+ int ret = 0; -+ -+ if (0) { -+ struct printbuf buf = PRINTBUF; -+ -+ qc_info_to_text(&buf, info); -+ pr_info("setting:\n%s", buf.buf); -+ printbuf_exit(&buf); -+ } -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ if (type >= QTYP_NR) -+ return -EINVAL; -+ -+ if (!((1 << type) & enabled_qtypes(c))) -+ return -ESRCH; -+ -+ if (info->i_fieldmask & -+ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) -+ return -EINVAL; -+ -+ q = &c->quotas[type]; -+ -+ mutex_lock(&c->sb_lock); -+ sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); -+ if (!sb_quota) { -+ ret = -BCH_ERR_ENOSPC_sb_quota; -+ goto unlock; -+ } -+ -+ if (info->i_fieldmask & QC_SPC_TIMER) -+ sb_quota->q[type].c[Q_SPC].timelimit = -+ cpu_to_le32(info->i_spc_timelimit); -+ -+ if (info->i_fieldmask & QC_SPC_WARNS) -+ sb_quota->q[type].c[Q_SPC].warnlimit = -+ cpu_to_le32(info->i_spc_warnlimit); -+ -+ if (info->i_fieldmask & QC_INO_TIMER) -+ sb_quota->q[type].c[Q_INO].timelimit = -+ cpu_to_le32(info->i_ino_timelimit); -+ -+ if (info->i_fieldmask & QC_INO_WARNS) -+ sb_quota->q[type].c[Q_INO].warnlimit = -+ cpu_to_le32(info->i_ino_warnlimit); -+ -+ bch2_sb_quota_read(c); -+ -+ bch2_write_super(c); -+unlock: -+ mutex_unlock(&c->sb_lock); -+ -+ return bch2_err_class(ret); -+} -+ -+/* Get/set individual quotas: */ -+ -+static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) -+{ -+ dst->d_space = src->c[Q_SPC].v << 9; -+ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; -+ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; -+ dst->d_spc_timer = src->c[Q_SPC].timer; -+ dst->d_spc_warns = src->c[Q_SPC].warns; -+ -+ dst->d_ino_count = src->c[Q_INO].v; -+ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; -+ dst->d_ino_softlimit = src->c[Q_INO].softlimit; -+ dst->d_ino_timer = src->c[Q_INO].timer; -+ dst->d_ino_warns = src->c[Q_INO].warns; -+} -+ -+static int bch2_get_quota(struct super_block *sb, struct kqid kqid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_memquota_type *q = &c->quotas[kqid.type]; -+ qid_t qid = from_kqid(&init_user_ns, kqid); -+ struct bch_memquota *mq; -+ -+ memset(qdq, 0, sizeof(*qdq)); -+ -+ 
mutex_lock(&q->lock); -+ mq = genradix_ptr(&q->table, qid); -+ if (mq) -+ __bch2_quota_get(qdq, mq); -+ mutex_unlock(&q->lock); -+ -+ return 0; -+} -+ -+static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_memquota_type *q = &c->quotas[kqid->type]; -+ qid_t qid = from_kqid(&init_user_ns, *kqid); -+ struct genradix_iter iter; -+ struct bch_memquota *mq; -+ int ret = 0; -+ -+ mutex_lock(&q->lock); -+ -+ genradix_for_each_from(&q->table, iter, mq, qid) -+ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { -+ __bch2_quota_get(qdq, mq); -+ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); -+ goto found; -+ } -+ -+ ret = -ENOENT; -+found: -+ mutex_unlock(&q->lock); -+ return bch2_err_class(ret); -+} -+ -+static int bch2_set_quota_trans(struct btree_trans *trans, -+ struct bkey_i_quota *new_quota, -+ struct qc_dqblk *qdq) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ ret = bkey_err(k); -+ if (unlikely(ret)) -+ return ret; -+ -+ if (k.k->type == KEY_TYPE_quota) -+ new_quota->v = *bkey_s_c_to_quota(k).v; -+ -+ if (qdq->d_fieldmask & QC_SPC_SOFT) -+ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); -+ if (qdq->d_fieldmask & QC_SPC_HARD) -+ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); -+ -+ if (qdq->d_fieldmask & QC_INO_SOFT) -+ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); -+ if (qdq->d_fieldmask & QC_INO_HARD) -+ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); -+ -+ ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int bch2_set_quota(struct super_block *sb, struct kqid qid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bkey_i_quota new_quota; -+ int ret; -+ -+ if (0) { -+ struct printbuf buf = PRINTBUF; -+ -+ qc_dqblk_to_text(&buf, qdq); -+ pr_info("setting:\n%s", buf.buf); -+ printbuf_exit(&buf); -+ } -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ bkey_quota_init(&new_quota.k_i); -+ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); -+ -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: -+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); -+ -+ return bch2_err_class(ret); -+} -+ -+const struct quotactl_ops bch2_quotactl_operations = { -+ .quota_enable = bch2_quota_enable, -+ .quota_disable = bch2_quota_disable, -+ .rm_xquota = bch2_quota_remove, -+ -+ .get_state = bch2_quota_get_state, -+ .set_info = bch2_quota_set_info, -+ -+ .get_dqblk = bch2_get_quota, -+ .get_nextdqblk = bch2_get_next_quota, -+ .set_dqblk = bch2_set_quota, -+}; -+ -+#endif /* CONFIG_BCACHEFS_QUOTA */ -diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h -new file mode 100644 -index 000000000..2f463874a ---- /dev/null -+++ b/fs/bcachefs/quota.h -@@ -0,0 +1,74 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_QUOTA_H -+#define _BCACHEFS_QUOTA_H -+ -+#include "inode.h" -+#include "quota_types.h" -+ -+enum bkey_invalid_flags; -+extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -+ -+int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ 
-+#define bch2_bkey_ops_quota ((struct bkey_ops) { \ -+ .key_invalid = bch2_quota_invalid, \ -+ .val_to_text = bch2_quota_to_text, \ -+ .min_val_size = 32, \ -+}) -+ -+static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) -+{ -+ return (struct bch_qid) { -+ .q[QTYP_USR] = u->bi_uid, -+ .q[QTYP_GRP] = u->bi_gid, -+ .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, -+ }; -+} -+ -+static inline unsigned enabled_qtypes(struct bch_fs *c) -+{ -+ return ((c->opts.usrquota << QTYP_USR)| -+ (c->opts.grpquota << QTYP_GRP)| -+ (c->opts.prjquota << QTYP_PRJ)); -+} -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, -+ s64, enum quota_acct_mode); -+ -+int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, -+ struct bch_qid, u64, enum quota_acct_mode); -+ -+void bch2_fs_quota_exit(struct bch_fs *); -+void bch2_fs_quota_init(struct bch_fs *); -+int bch2_fs_quota_read(struct bch_fs *); -+ -+extern const struct quotactl_ops bch2_quotactl_operations; -+ -+#else -+ -+static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, -+ enum quota_counters counter, s64 v, -+ enum quota_acct_mode mode) -+{ -+ return 0; -+} -+ -+static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, -+ struct bch_qid dst, -+ struct bch_qid src, u64 space, -+ enum quota_acct_mode mode) -+{ -+ return 0; -+} -+ -+static inline void bch2_fs_quota_exit(struct bch_fs *c) {} -+static inline void bch2_fs_quota_init(struct bch_fs *c) {} -+static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } -+ -+#endif -+ -+#endif /* _BCACHEFS_QUOTA_H */ -diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h -new file mode 100644 -index 000000000..6a136083d ---- /dev/null -+++ b/fs/bcachefs/quota_types.h -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_QUOTA_TYPES_H -+#define _BCACHEFS_QUOTA_TYPES_H -+ -+#include -+ -+struct bch_qid { -+ u32 q[QTYP_NR]; -+}; -+ -+enum quota_acct_mode { -+ KEY_TYPE_QUOTA_PREALLOC, -+ KEY_TYPE_QUOTA_WARN, -+ KEY_TYPE_QUOTA_NOCHECK, -+}; -+ -+struct memquota_counter { -+ u64 v; -+ u64 hardlimit; -+ u64 softlimit; -+ s64 timer; -+ int warns; -+ int warning_issued; -+}; -+ -+struct bch_memquota { -+ struct memquota_counter c[Q_COUNTERS]; -+}; -+ -+typedef GENRADIX(struct bch_memquota) bch_memquota_table; -+ -+struct quota_limit { -+ u32 timelimit; -+ u32 warnlimit; -+}; -+ -+struct bch_memquota_type { -+ struct quota_limit limits[Q_COUNTERS]; -+ bch_memquota_table table; -+ struct mutex lock; -+}; -+ -+#endif /* _BCACHEFS_QUOTA_TYPES_H */ -diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c -new file mode 100644 -index 000000000..15ce3ecba ---- /dev/null -+++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,368 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "btree_iter.h" -+#include "buckets.h" -+#include "clock.h" -+#include "compress.h" -+#include "disk_groups.h" -+#include "errcode.h" -+#include "extents.h" -+#include "io.h" -+#include "move.h" -+#include "rebalance.h" -+#include "super-io.h" -+#include "trace.h" -+ -+#include -+#include -+#include -+ -+/* -+ * Check if an extent should be moved: -+ * returns -1 if it should not be moved, or -+ * device of pointer that should be moved, if known, or INT_MAX if unknown -+ */ -+static bool rebalance_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_update_opts *data_opts) -+{ -+ struct 
bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ unsigned i; -+ -+ data_opts->rewrite_ptrs = 0; -+ data_opts->target = io_opts->background_target; -+ data_opts->extra_replicas = 0; -+ data_opts->btree_insert_flags = 0; -+ -+ if (io_opts->background_compression && -+ !bch2_bkey_is_incompressible(k)) { -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ i = 0; -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if (!p.ptr.cached && -+ p.crc.compression_type != -+ bch2_compression_opt_to_type(io_opts->background_compression)) -+ data_opts->rewrite_ptrs |= 1U << i; -+ i++; -+ } -+ } -+ -+ if (io_opts->background_target) { -+ const struct bch_extent_ptr *ptr; -+ -+ i = 0; -+ bkey_for_each_ptr(ptrs, ptr) { -+ if (!ptr->cached && -+ !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && -+ bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target)) -+ data_opts->rewrite_ptrs |= 1U << i; -+ i++; -+ } -+ } -+ -+ return data_opts->rewrite_ptrs != 0; -+} -+ -+void bch2_rebalance_add_key(struct bch_fs *c, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts) -+{ -+ struct data_update_opts update_opts = { 0 }; -+ struct bkey_ptrs_c ptrs; -+ const struct bch_extent_ptr *ptr; -+ unsigned i; -+ -+ if (!rebalance_pred(c, NULL, k, io_opts, &update_opts)) -+ return; -+ -+ i = 0; -+ ptrs = bch2_bkey_ptrs_c(k); -+ bkey_for_each_ptr(ptrs, ptr) { -+ if ((1U << i) && update_opts.rewrite_ptrs) -+ if (atomic64_add_return(k.k->size, -+ &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) == -+ k.k->size) -+ rebalance_wakeup(c); -+ i++; -+ } -+} -+ -+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) -+{ -+ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == -+ sectors) -+ rebalance_wakeup(c); -+} -+ -+struct rebalance_work { -+ int dev_most_full_idx; -+ unsigned dev_most_full_percent; -+ u64 dev_most_full_work; -+ u64 dev_most_full_capacity; -+ u64 total_work; -+}; -+ -+static void rebalance_work_accumulate(struct rebalance_work *w, -+ u64 dev_work, u64 unknown_dev, u64 capacity, int idx) -+{ -+ unsigned percent_full; -+ u64 work = dev_work + unknown_dev; -+ -+ /* avoid divide by 0 */ -+ if (!capacity) -+ return; -+ -+ if (work < dev_work || work < unknown_dev) -+ work = U64_MAX; -+ work = min(work, capacity); -+ -+ percent_full = div64_u64(work * 100, capacity); -+ -+ if (percent_full >= w->dev_most_full_percent) { -+ w->dev_most_full_idx = idx; -+ w->dev_most_full_percent = percent_full; -+ w->dev_most_full_work = work; -+ w->dev_most_full_capacity = capacity; -+ } -+ -+ if (w->total_work + dev_work >= w->total_work && -+ w->total_work + dev_work >= dev_work) -+ w->total_work += dev_work; -+} -+ -+static struct rebalance_work rebalance_work(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct rebalance_work ret = { .dev_most_full_idx = -1 }; -+ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ rebalance_work_accumulate(&ret, -+ atomic64_read(&ca->rebalance_work), -+ unknown_dev, -+ bucket_to_sector(ca, ca->mi.nbuckets - -+ ca->mi.first_bucket), -+ i); -+ -+ rebalance_work_accumulate(&ret, -+ unknown_dev, 0, c->capacity, -1); -+ -+ return ret; -+} -+ -+static void rebalance_work_reset(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ atomic64_set(&ca->rebalance_work, 0); -+ -+ atomic64_set(&c->rebalance.work_unknown_dev, 0); -+} -+ -+static unsigned long curr_cputime(void) -+{ -+ u64 utime, stime; -+ -+ 
task_cputime_adjusted(current, &utime, &stime); -+ return nsecs_to_jiffies(utime + stime); -+} -+ -+static int bch2_rebalance_thread(void *arg) -+{ -+ struct bch_fs *c = arg; -+ struct bch_fs_rebalance *r = &c->rebalance; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ struct rebalance_work w, p; -+ struct bch_move_stats move_stats; -+ unsigned long start, prev_start; -+ unsigned long prev_run_time, prev_run_cputime; -+ unsigned long cputime, prev_cputime; -+ u64 io_start; -+ long throttle; -+ -+ set_freezable(); -+ -+ io_start = atomic64_read(&clock->now); -+ p = rebalance_work(c); -+ prev_start = jiffies; -+ prev_cputime = curr_cputime(); -+ -+ bch2_move_stats_init(&move_stats, "rebalance"); -+ while (!kthread_wait_freezable(r->enabled)) { -+ cond_resched(); -+ -+ start = jiffies; -+ cputime = curr_cputime(); -+ -+ prev_run_time = start - prev_start; -+ prev_run_cputime = cputime - prev_cputime; -+ -+ w = rebalance_work(c); -+ BUG_ON(!w.dev_most_full_capacity); -+ -+ if (!w.total_work) { -+ r->state = REBALANCE_WAITING; -+ kthread_wait_freezable(rebalance_work(c).total_work); -+ continue; -+ } -+ -+ /* -+ * If there isn't much work to do, throttle cpu usage: -+ */ -+ throttle = prev_run_cputime * 100 / -+ max(1U, w.dev_most_full_percent) - -+ prev_run_time; -+ -+ if (w.dev_most_full_percent < 20 && throttle > 0) { -+ r->throttled_until_iotime = io_start + -+ div_u64(w.dev_most_full_capacity * -+ (20 - w.dev_most_full_percent), -+ 50); -+ -+ if (atomic64_read(&clock->now) + clock->max_slop < -+ r->throttled_until_iotime) { -+ r->throttled_until_cputime = start + throttle; -+ r->state = REBALANCE_THROTTLED; -+ -+ bch2_kthread_io_clock_wait(clock, -+ r->throttled_until_iotime, -+ throttle); -+ continue; -+ } -+ } -+ -+ /* minimum 1 mb/sec: */ -+ r->pd.rate.rate = -+ max_t(u64, 1 << 11, -+ r->pd.rate.rate * -+ max(p.dev_most_full_percent, 1U) / -+ max(w.dev_most_full_percent, 1U)); -+ -+ io_start = atomic64_read(&clock->now); -+ p = w; -+ prev_start = start; -+ prev_cputime = cputime; -+ -+ r->state = REBALANCE_RUNNING; -+ memset(&move_stats, 0, sizeof(move_stats)); -+ rebalance_work_reset(c); -+ -+ bch2_move_data(c, -+ 0, POS_MIN, -+ BTREE_ID_NR, POS_MAX, -+ /* ratelimiting disabled for now */ -+ NULL, /* &r->pd.rate, */ -+ &move_stats, -+ writepoint_ptr(&c->rebalance_write_point), -+ true, -+ rebalance_pred, NULL); -+ } -+ -+ return 0; -+} -+ -+void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bch_fs_rebalance *r = &c->rebalance; -+ struct rebalance_work w = rebalance_work(c); -+ -+ if (!out->nr_tabstops) -+ printbuf_tabstop_push(out, 20); -+ -+ prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx); -+ prt_tab(out); -+ -+ prt_human_readable_u64(out, w.dev_most_full_work << 9); -+ prt_printf(out, "/"); -+ prt_human_readable_u64(out, w.dev_most_full_capacity << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "total work:"); -+ prt_tab(out); -+ -+ prt_human_readable_u64(out, w.total_work << 9); -+ prt_printf(out, "/"); -+ prt_human_readable_u64(out, c->capacity << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "rate:"); -+ prt_tab(out); -+ prt_printf(out, "%u", r->pd.rate.rate); -+ prt_newline(out); -+ -+ switch (r->state) { -+ case REBALANCE_WAITING: -+ prt_printf(out, "waiting"); -+ break; -+ case REBALANCE_THROTTLED: -+ prt_printf(out, "throttled for %lu sec or ", -+ (r->throttled_until_cputime - jiffies) / HZ); -+ prt_human_readable_u64(out, -+ (r->throttled_until_iotime - -+ atomic64_read(&c->io_clock[WRITE].now)) << 9); -+ prt_printf(out, " 
io"); -+ break; -+ case REBALANCE_RUNNING: -+ prt_printf(out, "running"); -+ break; -+ } -+ prt_newline(out); -+} -+ -+void bch2_rebalance_stop(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ c->rebalance.pd.rate.rate = UINT_MAX; -+ bch2_ratelimit_reset(&c->rebalance.pd.rate); -+ -+ p = rcu_dereference_protected(c->rebalance.thread, 1); -+ c->rebalance.thread = NULL; -+ -+ if (p) { -+ /* for sychronizing with rebalance_wakeup() */ -+ synchronize_rcu(); -+ -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+int bch2_rebalance_start(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ int ret; -+ -+ if (c->rebalance.thread) -+ return 0; -+ -+ if (c->opts.nochanges) -+ return 0; -+ -+ p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); -+ ret = PTR_ERR_OR_ZERO(p); -+ if (ret) { -+ bch_err(c, "error creating rebalance thread: %s", bch2_err_str(ret)); -+ return ret; -+ } -+ -+ get_task_struct(p); -+ rcu_assign_pointer(c->rebalance.thread, p); -+ wake_up_process(p); -+ return 0; -+} -+ -+void bch2_fs_rebalance_init(struct bch_fs *c) -+{ -+ bch2_pd_controller_init(&c->rebalance.pd); -+ -+ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); -+} -diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h -new file mode 100644 -index 000000000..7ade0bb81 ---- /dev/null -+++ b/fs/bcachefs/rebalance.h -@@ -0,0 +1,28 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REBALANCE_H -+#define _BCACHEFS_REBALANCE_H -+ -+#include "rebalance_types.h" -+ -+static inline void rebalance_wakeup(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ rcu_read_lock(); -+ p = rcu_dereference(c->rebalance.thread); -+ if (p) -+ wake_up_process(p); -+ rcu_read_unlock(); -+} -+ -+void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, -+ struct bch_io_opts *); -+void bch2_rebalance_add_work(struct bch_fs *, u64); -+ -+void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_rebalance_stop(struct bch_fs *); -+int bch2_rebalance_start(struct bch_fs *); -+void bch2_fs_rebalance_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_REBALANCE_H */ -diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h -new file mode 100644 -index 000000000..7462a92e9 ---- /dev/null -+++ b/fs/bcachefs/rebalance_types.h -@@ -0,0 +1,26 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REBALANCE_TYPES_H -+#define _BCACHEFS_REBALANCE_TYPES_H -+ -+#include "move_types.h" -+ -+enum rebalance_state { -+ REBALANCE_WAITING, -+ REBALANCE_THROTTLED, -+ REBALANCE_RUNNING, -+}; -+ -+struct bch_fs_rebalance { -+ struct task_struct __rcu *thread; -+ struct bch_pd_controller pd; -+ -+ atomic64_t work_unknown_dev; -+ -+ enum rebalance_state state; -+ u64 throttled_until_iotime; -+ unsigned long throttled_until_cputime; -+ -+ unsigned enabled:1; -+}; -+ -+#endif /* _BCACHEFS_REBALANCE_TYPES_H */ -diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c -new file mode 100644 -index 000000000..30efb3c90 ---- /dev/null -+++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1057 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "backpointers.h" -+#include "bkey_buf.h" -+#include "alloc_background.h" -+#include "btree_gc.h" -+#include "btree_journal_iter.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "buckets.h" -+#include "dirent.h" -+#include "ec.h" -+#include "errcode.h" -+#include "error.h" -+#include "fs-common.h" -+#include "fsck.h" -+#include "journal_io.h" -+#include 
"journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "lru.h" -+#include "move.h" -+#include "quota.h" -+#include "recovery.h" -+#include "replicas.h" -+#include "sb-clean.h" -+#include "snapshot.h" -+#include "subvolume.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+#define QSTR(n) { { { .len = strlen(n) } }, .name = n } -+ -+/* for -o reconstruct_alloc: */ -+static void drop_alloc_keys(struct journal_keys *keys) -+{ -+ size_t src, dst; -+ -+ for (src = 0, dst = 0; src < keys->nr; src++) -+ if (keys->d[src].btree_id != BTREE_ID_alloc) -+ keys->d[dst++] = keys->d[src]; -+ -+ keys->nr = dst; -+} -+ -+/* -+ * Btree node pointers have a field to stack a pointer to the in memory btree -+ * node; we need to zero out this field when reading in btree nodes, or when -+ * reading in keys from the journal: -+ */ -+static void zero_out_btree_mem_ptr(struct journal_keys *keys) -+{ -+ struct journal_key *i; -+ -+ for (i = keys->d; i < keys->d + keys->nr; i++) -+ if (i->k->k.type == KEY_TYPE_btree_ptr_v2) -+ bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; -+} -+ -+/* journal replay: */ -+ -+static void replay_now_at(struct journal *j, u64 seq) -+{ -+ BUG_ON(seq < j->replay_journal_seq); -+ -+ seq = min(seq, j->replay_journal_seq_end); -+ -+ while (j->replay_journal_seq < seq) -+ bch2_journal_pin_put(j, j->replay_journal_seq++); -+} -+ -+static int bch2_journal_replay_key(struct btree_trans *trans, -+ struct journal_key *k) -+{ -+ struct btree_iter iter; -+ unsigned iter_flags = -+ BTREE_ITER_INTENT| -+ BTREE_ITER_NOT_EXTENTS; -+ unsigned update_flags = BTREE_TRIGGER_NORUN; -+ int ret; -+ -+ /* -+ * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to -+ * keep the key cache coherent with the underlying btree. Nothing -+ * besides the allocator is doing updates yet so we don't need key cache -+ * coherency for non-alloc btrees, and key cache fills for snapshots -+ * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until -+ * the snapshots recovery pass runs. 
-+ */ -+ if (!k->level && k->btree_id == BTREE_ID_alloc) -+ iter_flags |= BTREE_ITER_CACHED; -+ else -+ update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM; -+ -+ bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, -+ BTREE_MAX_DEPTH, k->level, -+ iter_flags); -+ ret = bch2_btree_iter_traverse(&iter); -+ if (ret) -+ goto out; -+ -+ /* Must be checked with btree locked: */ -+ if (k->overwritten) -+ goto out; -+ -+ ret = bch2_trans_update(trans, &iter, k->k, update_flags); -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int journal_sort_seq_cmp(const void *_l, const void *_r) -+{ -+ const struct journal_key *l = *((const struct journal_key **)_l); -+ const struct journal_key *r = *((const struct journal_key **)_r); -+ -+ return cmp_int(l->journal_seq, r->journal_seq); -+} -+ -+static int bch2_journal_replay(struct bch_fs *c) -+{ -+ struct journal_keys *keys = &c->journal_keys; -+ struct journal_key **keys_sorted, *k; -+ struct journal *j = &c->journal; -+ u64 start_seq = c->journal_replay_seq_start; -+ u64 end_seq = c->journal_replay_seq_start; -+ size_t i; -+ int ret; -+ -+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); -+ keys->gap = keys->nr; -+ -+ keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL); -+ if (!keys_sorted) -+ return -BCH_ERR_ENOMEM_journal_replay; -+ -+ for (i = 0; i < keys->nr; i++) -+ keys_sorted[i] = &keys->d[i]; -+ -+ sort(keys_sorted, keys->nr, -+ sizeof(keys_sorted[0]), -+ journal_sort_seq_cmp, NULL); -+ -+ if (keys->nr) { -+ ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)", -+ keys->nr, start_seq, end_seq); -+ if (ret) -+ goto err; -+ } -+ -+ for (i = 0; i < keys->nr; i++) { -+ k = keys_sorted[i]; -+ -+ cond_resched(); -+ -+ replay_now_at(j, k->journal_seq); -+ -+ ret = bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_NOFAIL| -+ (!k->allocated -+ ? 
BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim -+ : 0), -+ bch2_journal_replay_key(&trans, k)); -+ if (ret) { -+ bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", -+ bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret)); -+ goto err; -+ } -+ } -+ -+ replay_now_at(j, j->replay_journal_seq_end); -+ j->replay_journal_seq = 0; -+ -+ bch2_journal_set_replay_done(j); -+ bch2_journal_flush_all_pins(j); -+ ret = bch2_journal_error(j); -+ -+ if (keys->nr && !ret) -+ bch2_journal_log_msg(c, "journal replay finished"); -+err: -+ kvfree(keys_sorted); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* journal replay early: */ -+ -+static int journal_replay_entry_early(struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ int ret = 0; -+ -+ switch (entry->type) { -+ case BCH_JSET_ENTRY_btree_root: { -+ struct btree_root *r; -+ -+ while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { -+ ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); -+ if (ret) -+ return ret; -+ } -+ -+ r = bch2_btree_id_root(c, entry->btree_id); -+ -+ if (entry->u64s) { -+ r->level = entry->level; -+ bkey_copy(&r->key, &entry->start[0]); -+ r->error = 0; -+ } else { -+ r->error = -EIO; -+ } -+ r->alive = true; -+ break; -+ } -+ case BCH_JSET_ENTRY_usage: { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ switch (entry->btree_id) { -+ case BCH_FS_USAGE_reserved: -+ if (entry->level < BCH_REPLICAS_MAX) -+ c->usage_base->persistent_reserved[entry->level] = -+ le64_to_cpu(u->v); -+ break; -+ case BCH_FS_USAGE_inodes: -+ c->usage_base->nr_inodes = le64_to_cpu(u->v); -+ break; -+ case BCH_FS_USAGE_key_version: -+ atomic64_set(&c->key_version, -+ le64_to_cpu(u->v)); -+ break; -+ } -+ -+ break; -+ } -+ case BCH_JSET_ENTRY_data_usage: { -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ -+ ret = bch2_replicas_set_usage(c, &u->r, -+ le64_to_cpu(u->v)); -+ break; -+ } -+ case BCH_JSET_ENTRY_dev_usage: { -+ struct jset_entry_dev_usage *u = -+ container_of(entry, struct jset_entry_dev_usage, entry); -+ struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); -+ unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); -+ -+ ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); -+ -+ for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { -+ ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); -+ ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); -+ ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); -+ } -+ -+ break; -+ } -+ case BCH_JSET_ENTRY_blacklist: { -+ struct jset_entry_blacklist *bl_entry = -+ container_of(entry, struct jset_entry_blacklist, entry); -+ -+ ret = bch2_journal_seq_blacklist_add(c, -+ le64_to_cpu(bl_entry->seq), -+ le64_to_cpu(bl_entry->seq) + 1); -+ break; -+ } -+ case BCH_JSET_ENTRY_blacklist_v2: { -+ struct jset_entry_blacklist_v2 *bl_entry = -+ container_of(entry, struct jset_entry_blacklist_v2, entry); -+ -+ ret = bch2_journal_seq_blacklist_add(c, -+ le64_to_cpu(bl_entry->start), -+ le64_to_cpu(bl_entry->end) + 1); -+ break; -+ } -+ case BCH_JSET_ENTRY_clock: { -+ struct jset_entry_clock *clock = -+ container_of(entry, struct jset_entry_clock, entry); -+ -+ atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); -+ } -+ } -+ -+ return ret; -+} -+ -+static int journal_replay_early(struct bch_fs *c, -+ struct bch_sb_field_clean *clean) -+{ -+ struct jset_entry *entry; -+ int ret; 
-+ -+ if (clean) { -+ for (entry = clean->start; -+ entry != vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) { -+ ret = journal_replay_entry_early(c, entry); -+ if (ret) -+ return ret; -+ } -+ } else { -+ struct genradix_iter iter; -+ struct journal_replay *i, **_i; -+ -+ genradix_for_each(&c->journal_entries, iter, _i) { -+ i = *_i; -+ -+ if (!i || i->ignore) -+ continue; -+ -+ vstruct_for_each(&i->j, entry) { -+ ret = journal_replay_entry_early(c, entry); -+ if (ret) -+ return ret; -+ } -+ } -+ } -+ -+ bch2_fs_usage_initialize(c); -+ -+ return 0; -+} -+ -+/* sb clean section: */ -+ -+static bool btree_id_is_alloc(enum btree_id id) -+{ -+ switch (id) { -+ case BTREE_ID_alloc: -+ case BTREE_ID_backpointers: -+ case BTREE_ID_need_discard: -+ case BTREE_ID_freespace: -+ case BTREE_ID_bucket_gens: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static int read_btree_roots(struct bch_fs *c) -+{ -+ unsigned i; -+ int ret = 0; -+ -+ for (i = 0; i < btree_id_nr_alive(c); i++) { -+ struct btree_root *r = bch2_btree_id_root(c, i); -+ -+ if (!r->alive) -+ continue; -+ -+ if (btree_id_is_alloc(i) && -+ c->opts.reconstruct_alloc) { -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); -+ continue; -+ } -+ -+ if (r->error) { -+ __fsck_err(c, btree_id_is_alloc(i) -+ ? FSCK_CAN_IGNORE : 0, -+ "invalid btree root %s", -+ bch2_btree_ids[i]); -+ if (i == BTREE_ID_alloc) -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); -+ } -+ -+ ret = bch2_btree_root_read(c, i, &r->key, r->level); -+ if (ret) { -+ __fsck_err(c, -+ btree_id_is_alloc(i) -+ ? FSCK_CAN_IGNORE : 0, -+ "error reading btree root %s", -+ bch2_btree_ids[i]); -+ if (btree_id_is_alloc(i)) -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); -+ } -+ } -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct btree_root *r = bch2_btree_id_root(c, i); -+ -+ if (!r->b) { -+ r->alive = false; -+ r->level = 0; -+ bch2_btree_root_alloc(c, i); -+ } -+ } -+fsck_err: -+ return ret; -+} -+ -+static int bch2_initialize_subvolumes(struct bch_fs *c) -+{ -+ struct bkey_i_snapshot_tree root_tree; -+ struct bkey_i_snapshot root_snapshot; -+ struct bkey_i_subvolume root_volume; -+ int ret; -+ -+ bkey_snapshot_tree_init(&root_tree.k_i); -+ root_tree.k.p.offset = 1; -+ root_tree.v.master_subvol = cpu_to_le32(1); -+ root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); -+ -+ bkey_snapshot_init(&root_snapshot.k_i); -+ root_snapshot.k.p.offset = U32_MAX; -+ root_snapshot.v.flags = 0; -+ root_snapshot.v.parent = 0; -+ root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); -+ root_snapshot.v.tree = cpu_to_le32(1); -+ SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); -+ -+ bkey_subvolume_init(&root_volume.k_i); -+ root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; -+ root_volume.v.flags = 0; -+ root_volume.v.snapshot = cpu_to_le32(U32_MAX); -+ root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); -+ -+ ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, -+ &root_tree.k_i, -+ NULL, NULL, 0) ?: -+ bch2_btree_insert(c, BTREE_ID_snapshots, -+ &root_snapshot.k_i, -+ NULL, NULL, 0) ?: -+ bch2_btree_insert(c, BTREE_ID_subvolumes, -+ &root_volume.k_i, -+ NULL, NULL, 0); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_inode_unpacked inode; -+ int ret; -+ -+ k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, -+ SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ if 
(!bkey_is_inode(k.k)) { -+ bch_err(trans->c, "root inode not found"); -+ ret = -BCH_ERR_ENOENT_inode; -+ goto err; -+ } -+ -+ ret = bch2_inode_unpack(k, &inode); -+ BUG_ON(ret); -+ -+ inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; -+ -+ ret = bch2_inode_write(trans, &iter, &inode); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/* set bi_subvol on root inode */ -+noinline_for_stack -+static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) -+{ -+ int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, -+ __bch2_fs_upgrade_for_subvolumes(&trans)); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+const char * const bch2_recovery_passes[] = { -+#define x(_fn, _when) #_fn, -+ BCH_RECOVERY_PASSES() -+#undef x -+ NULL -+}; -+ -+static int bch2_check_allocations(struct bch_fs *c) -+{ -+ return bch2_gc(c, true, c->opts.norecovery); -+} -+ -+static int bch2_set_may_go_rw(struct bch_fs *c) -+{ -+ set_bit(BCH_FS_MAY_GO_RW, &c->flags); -+ return 0; -+} -+ -+struct recovery_pass_fn { -+ int (*fn)(struct bch_fs *); -+ unsigned when; -+}; -+ -+static struct recovery_pass_fn recovery_pass_fns[] = { -+#define x(_fn, _when) { .fn = bch2_##_fn, .when = _when }, -+ BCH_RECOVERY_PASSES() -+#undef x -+}; -+ -+static void check_version_upgrade(struct bch_fs *c) -+{ -+ unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version); -+ unsigned latest_version = bcachefs_metadata_version_current; -+ unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; -+ unsigned new_version = 0; -+ u64 recovery_passes; -+ -+ if (old_version < bcachefs_metadata_required_upgrade_below) { -+ if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || -+ latest_compatible < bcachefs_metadata_required_upgrade_below) -+ new_version = latest_version; -+ else -+ new_version = latest_compatible; -+ } else { -+ switch (c->opts.version_upgrade) { -+ case BCH_VERSION_UPGRADE_compatible: -+ new_version = latest_compatible; -+ break; -+ case BCH_VERSION_UPGRADE_incompatible: -+ new_version = latest_version; -+ break; -+ case BCH_VERSION_UPGRADE_none: -+ new_version = old_version; -+ break; -+ } -+ } -+ -+ if (new_version > old_version) { -+ struct printbuf buf = PRINTBUF; -+ -+ if (old_version < bcachefs_metadata_required_upgrade_below) -+ prt_str(&buf, "Version upgrade required:\n"); -+ -+ if (old_version != c->sb.version) { -+ prt_str(&buf, "Version upgrade from "); -+ bch2_version_to_text(&buf, c->sb.version_upgrade_complete); -+ prt_str(&buf, " to "); -+ bch2_version_to_text(&buf, c->sb.version); -+ prt_str(&buf, " incomplete\n"); -+ } -+ -+ prt_printf(&buf, "Doing %s version upgrade from ", -+ BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) -+ ? 
"incompatible" : "compatible"); -+ bch2_version_to_text(&buf, old_version); -+ prt_str(&buf, " to "); -+ bch2_version_to_text(&buf, new_version); -+ prt_newline(&buf); -+ -+ recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); -+ if (recovery_passes) { -+ if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK) -+ prt_str(&buf, "fsck required"); -+ else { -+ prt_str(&buf, "running recovery passses: "); -+ prt_bitflags(&buf, bch2_recovery_passes, recovery_passes); -+ } -+ -+ c->recovery_passes_explicit |= recovery_passes; -+ c->opts.fix_errors = FSCK_FIX_yes; -+ } -+ -+ bch_info(c, "%s", buf.buf); -+ -+ mutex_lock(&c->sb_lock); -+ bch2_sb_upgrade(c, new_version); -+ mutex_unlock(&c->sb_lock); -+ -+ printbuf_exit(&buf); -+ } -+} -+ -+u64 bch2_fsck_recovery_passes(void) -+{ -+ u64 ret = 0; -+ -+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) -+ if (recovery_pass_fns[i].when & PASS_FSCK) -+ ret |= BIT_ULL(i); -+ return ret; -+} -+ -+static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -+{ -+ struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass; -+ -+ if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) -+ return false; -+ if (c->recovery_passes_explicit & BIT_ULL(pass)) -+ return true; -+ if ((p->when & PASS_FSCK) && c->opts.fsck) -+ return true; -+ if ((p->when & PASS_UNCLEAN) && !c->sb.clean) -+ return true; -+ if (p->when & PASS_ALWAYS) -+ return true; -+ return false; -+} -+ -+static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) -+{ -+ int ret; -+ -+ c->curr_recovery_pass = pass; -+ -+ if (should_run_recovery_pass(c, pass)) { -+ struct recovery_pass_fn *p = recovery_pass_fns + pass; -+ -+ if (!(p->when & PASS_SILENT)) -+ printk(KERN_INFO bch2_log_msg(c, "%s..."), -+ bch2_recovery_passes[pass]); -+ ret = p->fn(c); -+ if (ret) -+ return ret; -+ if (!(p->when & PASS_SILENT)) -+ printk(KERN_CONT " done\n"); -+ -+ c->recovery_passes_complete |= BIT_ULL(pass); -+ } -+ -+ return 0; -+} -+ -+static int bch2_run_recovery_passes(struct bch_fs *c) -+{ -+ int ret = 0; -+ -+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) { -+ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); -+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) -+ continue; -+ if (ret) -+ break; -+ c->curr_recovery_pass++; -+ } -+ -+ return ret; -+} -+ -+int bch2_fs_recovery(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *clean = NULL; -+ struct jset *last_journal_entry = NULL; -+ u64 last_seq, blacklist_seq, journal_seq; -+ bool write_sb = false; -+ int ret = 0; -+ -+ if (c->sb.clean) { -+ clean = bch2_read_superblock_clean(c); -+ ret = PTR_ERR_OR_ZERO(clean); -+ if (ret) -+ goto err; -+ -+ bch_info(c, "recovering from clean shutdown, journal seq %llu", -+ le64_to_cpu(clean->journal_seq)); -+ } else { -+ bch_info(c, "recovering from unclean shutdown"); -+ } -+ -+ if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { -+ bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ if (!c->sb.clean && -+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { -+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) -+ check_version_upgrade(c); -+ -+ if (c->opts.fsck && c->opts.norecovery) { -+ bch_err(c, "cannot select 
both norecovery and fsck"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ ret = bch2_blacklist_table_initialize(c); -+ if (ret) { -+ bch_err(c, "error initializing blacklist table"); -+ goto err; -+ } -+ -+ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { -+ struct genradix_iter iter; -+ struct journal_replay **i; -+ -+ bch_verbose(c, "starting journal read"); -+ ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); -+ if (ret) -+ goto err; -+ -+ /* -+ * note: cmd_list_journal needs the blacklist table fully up to date so -+ * it can asterisk ignored journal entries: -+ */ -+ if (c->opts.read_journal_only) -+ goto out; -+ -+ genradix_for_each_reverse(&c->journal_entries, iter, i) -+ if (*i && !(*i)->ignore) { -+ last_journal_entry = &(*i)->j; -+ break; -+ } -+ -+ if (mustfix_fsck_err_on(c->sb.clean && -+ last_journal_entry && -+ !journal_entry_empty(last_journal_entry), c, -+ "filesystem marked clean but journal not empty")) { -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->sb.clean = false; -+ } -+ -+ if (!last_journal_entry) { -+ fsck_err_on(!c->sb.clean, c, "no journal entries found"); -+ if (clean) -+ goto use_clean; -+ -+ genradix_for_each_reverse(&c->journal_entries, iter, i) -+ if (*i) { -+ last_journal_entry = &(*i)->j; -+ (*i)->ignore = false; -+ break; -+ } -+ } -+ -+ ret = bch2_journal_keys_sort(c); -+ if (ret) -+ goto err; -+ -+ if (c->sb.clean && last_journal_entry) { -+ ret = bch2_verify_superblock_clean(c, &clean, -+ last_journal_entry); -+ if (ret) -+ goto err; -+ } -+ } else { -+use_clean: -+ if (!clean) { -+ bch_err(c, "no superblock clean section found"); -+ ret = -BCH_ERR_fsck_repair_impossible; -+ goto err; -+ -+ } -+ blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; -+ } -+ -+ c->journal_replay_seq_start = last_seq; -+ c->journal_replay_seq_end = blacklist_seq - 1; -+ -+ if (c->opts.reconstruct_alloc) { -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); -+ drop_alloc_keys(&c->journal_keys); -+ } -+ -+ zero_out_btree_mem_ptr(&c->journal_keys); -+ -+ ret = journal_replay_early(c, clean); -+ if (ret) -+ goto err; -+ -+ /* -+ * After an unclean shutdown, skip then next few journal sequence -+ * numbers as they may have been referenced by btree writes that -+ * happened before their corresponding journal writes - those btree -+ * writes need to be ignored, by skipping and blacklisting the next few -+ * journal sequence numbers: -+ */ -+ if (!c->sb.clean) -+ journal_seq += 8; -+ -+ if (blacklist_seq != journal_seq) { -+ ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", -+ blacklist_seq, journal_seq) ?: -+ bch2_journal_seq_blacklist_add(c, -+ blacklist_seq, journal_seq); -+ if (ret) { -+ bch_err(c, "error creating new journal seq blacklist entry"); -+ goto err; -+ } -+ } -+ -+ ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu", -+ journal_seq, last_seq, blacklist_seq - 1) ?: -+ bch2_fs_journal_start(&c->journal, journal_seq); -+ if (ret) -+ goto err; -+ -+ if (c->opts.reconstruct_alloc) -+ bch2_journal_log_msg(c, "dropping alloc info"); -+ -+ /* -+ * Skip past versions that might have possibly been used (as nonces), -+ * but hadn't had their pointers written: -+ */ -+ if (c->sb.encryption_type && !c->sb.clean) -+ atomic64_add(1 << 16, &c->key_version); -+ -+ ret = read_btree_roots(c); -+ if (ret) -+ goto err; -+ -+ if (c->opts.fsck && -+ (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || -+ BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb))) -+ 
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology); -+ -+ ret = bch2_run_recovery_passes(c); -+ if (ret) -+ goto err; -+ -+ /* If we fixed errors, verify that fs is actually clean now: */ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && -+ test_bit(BCH_FS_ERRORS_FIXED, &c->flags) && -+ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) && -+ !test_bit(BCH_FS_ERROR, &c->flags)) { -+ bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); -+ clear_bit(BCH_FS_ERRORS_FIXED, &c->flags); -+ -+ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; -+ -+ ret = bch2_run_recovery_passes(c); -+ if (ret) -+ goto err; -+ -+ if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) || -+ test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { -+ bch_err(c, "Second fsck run was not clean"); -+ set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); -+ } -+ -+ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); -+ } -+ -+ if (enabled_qtypes(c)) { -+ bch_verbose(c, "reading quotas"); -+ ret = bch2_fs_quota_read(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "quotas done"); -+ } -+ -+ mutex_lock(&c->sb_lock); -+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != c->sb.version) { -+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, c->sb.version); -+ write_sb = true; -+ } -+ -+ if (!test_bit(BCH_FS_ERROR, &c->flags)) { -+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); -+ write_sb = true; -+ } -+ -+ if (c->opts.fsck && -+ !test_bit(BCH_FS_ERROR, &c->flags) && -+ !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) { -+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); -+ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0); -+ write_sb = true; -+ } -+ -+ if (write_sb) -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || -+ c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { -+ struct bch_move_stats stats; -+ -+ bch2_move_stats_init(&stats, "recovery"); -+ -+ bch_info(c, "scanning for old btree nodes"); -+ ret = bch2_fs_read_write(c) ?: -+ bch2_scan_old_btree_nodes(c, &stats); -+ if (ret) -+ goto err; -+ bch_info(c, "scanning for old btree nodes done"); -+ } -+ -+ if (c->journal_seq_blacklist_table && -+ c->journal_seq_blacklist_table->nr > 128) -+ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); -+ -+ ret = 0; -+out: -+ set_bit(BCH_FS_FSCK_DONE, &c->flags); -+ bch2_flush_fsck_errs(c); -+ -+ if (!c->opts.keep_journal && -+ test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) { -+ bch2_journal_keys_free(&c->journal_keys); -+ bch2_journal_entries_free(c); -+ } -+ kfree(clean); -+ -+ if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) { -+ bch2_fs_read_write_early(c); -+ bch2_delete_dead_snapshots_async(c); -+ } -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+err: -+fsck_err: -+ bch2_fs_emergency_read_only(c); -+ goto out; -+} -+ -+int bch2_fs_initialize(struct bch_fs *c) -+{ -+ struct bch_inode_unpacked root_inode, lostfound_inode; -+ struct bkey_inode_buf packed_inode; -+ struct qstr lostfound = QSTR("lost+found"); -+ struct bch_dev *ca; -+ unsigned i; -+ int ret; -+ -+ bch_notice(c, "initializing new filesystem"); -+ -+ mutex_lock(&c->sb_lock); -+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); -+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); -+ -+ bch2_sb_maybe_downgrade(c); -+ -+ if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { -+ bch2_sb_upgrade(c, 
bcachefs_metadata_version_current); -+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+ -+ c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns); -+ set_bit(BCH_FS_MAY_GO_RW, &c->flags); -+ set_bit(BCH_FS_FSCK_DONE, &c->flags); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ bch2_btree_root_alloc(c, i); -+ -+ for_each_online_member(ca, c, i) -+ bch2_dev_usage_init(ca); -+ -+ for_each_online_member(ca, c, i) { -+ ret = bch2_dev_journal_alloc(ca); -+ if (ret) { -+ percpu_ref_put(&ca->io_ref); -+ goto err; -+ } -+ } -+ -+ /* -+ * journal_res_get() will crash if called before this has -+ * set up the journal.pin FIFO and journal.cur pointer: -+ */ -+ bch2_fs_journal_start(&c->journal, 1); -+ bch2_journal_set_replay_done(&c->journal); -+ -+ ret = bch2_fs_read_write_early(c); -+ if (ret) -+ goto err; -+ -+ /* -+ * Write out the superblock and journal buckets, now that we can do -+ * btree updates -+ */ -+ bch_verbose(c, "marking superblocks"); -+ for_each_member_device(ca, c, i) { -+ ret = bch2_trans_mark_dev_sb(c, ca); -+ if (ret) { -+ percpu_ref_put(&ca->ref); -+ goto err; -+ } -+ -+ ca->new_fs_bucket_idx = 0; -+ } -+ -+ ret = bch2_fs_freespace_init(c); -+ if (ret) -+ goto err; -+ -+ ret = bch2_initialize_subvolumes(c); -+ if (ret) -+ goto err; -+ -+ bch_verbose(c, "reading snapshots table"); -+ ret = bch2_snapshots_read(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "reading snapshots done"); -+ -+ bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); -+ root_inode.bi_inum = BCACHEFS_ROOT_INO; -+ root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; -+ bch2_inode_pack(&packed_inode, &root_inode); -+ packed_inode.inode.k.p.snapshot = U32_MAX; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_inodes, -+ &packed_inode.inode.k_i, -+ NULL, NULL, 0); -+ if (ret) { -+ bch_err_msg(c, ret, "creating root directory"); -+ goto err; -+ } -+ -+ bch2_inode_init_early(c, &lostfound_inode); -+ -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ bch2_create_trans(&trans, -+ BCACHEFS_ROOT_SUBVOL_INUM, -+ &root_inode, &lostfound_inode, -+ &lostfound, -+ 0, 0, S_IFDIR|0700, 0, -+ NULL, NULL, (subvol_inum) { 0 }, 0)); -+ if (ret) { -+ bch_err_msg(c, ret, "creating lost+found"); -+ goto err; -+ } -+ -+ if (enabled_qtypes(c)) { -+ ret = bch2_fs_quota_read(c); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_journal_flush(&c->journal); -+ if (ret) { -+ bch_err_msg(c, ret, "writing first journal entry"); -+ goto err; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+err: -+ bch_err_fn(ca, ret); -+ return ret; -+} -diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h -new file mode 100644 -index 000000000..852d30567 ---- /dev/null -+++ b/fs/bcachefs/recovery.h -@@ -0,0 +1,33 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_RECOVERY_H -+#define _BCACHEFS_RECOVERY_H -+ -+extern const char * const bch2_recovery_passes[]; -+ -+/* -+ * For when we need to rewind recovery passes and run a pass we skipped: -+ */ -+static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c, -+ enum bch_recovery_pass pass) -+{ -+ bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", -+ bch2_recovery_passes[pass], pass, -+ bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); -+ -+ c->recovery_passes_explicit |= BIT_ULL(pass); -+ -+ if 
(c->curr_recovery_pass >= pass) { -+ c->curr_recovery_pass = pass; -+ c->recovery_passes_complete &= (1ULL << pass) >> 1; -+ return -BCH_ERR_restart_recovery; -+ } else { -+ return 0; -+ } -+} -+ -+u64 bch2_fsck_recovery_passes(void); -+ -+int bch2_fs_recovery(struct bch_fs *); -+int bch2_fs_initialize(struct bch_fs *); -+ -+#endif /* _BCACHEFS_RECOVERY_H */ -diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h -new file mode 100644 -index 000000000..abf1f834e ---- /dev/null -+++ b/fs/bcachefs/recovery_types.h -@@ -0,0 +1,48 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_RECOVERY_TYPES_H -+#define _BCACHEFS_RECOVERY_TYPES_H -+ -+#define PASS_SILENT BIT(0) -+#define PASS_FSCK BIT(1) -+#define PASS_UNCLEAN BIT(2) -+#define PASS_ALWAYS BIT(3) -+ -+#define BCH_RECOVERY_PASSES() \ -+ x(alloc_read, PASS_ALWAYS) \ -+ x(stripes_read, PASS_ALWAYS) \ -+ x(initialize_subvolumes, 0) \ -+ x(snapshots_read, PASS_ALWAYS) \ -+ x(check_topology, 0) \ -+ x(check_allocations, PASS_FSCK) \ -+ x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ -+ x(journal_replay, PASS_ALWAYS) \ -+ x(check_alloc_info, PASS_FSCK) \ -+ x(check_lrus, PASS_FSCK) \ -+ x(check_btree_backpointers, PASS_FSCK) \ -+ x(check_backpointers_to_extents,PASS_FSCK) \ -+ x(check_extents_to_backpointers,PASS_FSCK) \ -+ x(check_alloc_to_lru_refs, PASS_FSCK) \ -+ x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ -+ x(bucket_gens_init, 0) \ -+ x(check_snapshot_trees, PASS_FSCK) \ -+ x(check_snapshots, PASS_FSCK) \ -+ x(check_subvols, PASS_FSCK) \ -+ x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN) \ -+ x(fs_upgrade_for_subvolumes, 0) \ -+ x(check_inodes, PASS_FSCK) \ -+ x(check_extents, PASS_FSCK) \ -+ x(check_dirents, PASS_FSCK) \ -+ x(check_xattrs, PASS_FSCK) \ -+ x(check_root, PASS_FSCK) \ -+ x(check_directory_structure, PASS_FSCK) \ -+ x(check_nlinks, PASS_FSCK) \ -+ x(delete_dead_inodes, PASS_FSCK|PASS_UNCLEAN) \ -+ x(fix_reflink_p, 0) \ -+ -+enum bch_recovery_pass { -+#define x(n, when) BCH_RECOVERY_PASS_##n, -+ BCH_RECOVERY_PASSES() -+#undef x -+}; -+ -+#endif /* _BCACHEFS_RECOVERY_TYPES_H */ -diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c -new file mode 100644 -index 000000000..39f711d50 ---- /dev/null -+++ b/fs/bcachefs/reflink.c -@@ -0,0 +1,399 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bkey_buf.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "extents.h" -+#include "inode.h" -+#include "io.h" -+#include "reflink.h" -+#include "subvolume.h" -+ -+#include -+ -+static inline unsigned bkey_type_to_indirect(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_extent: -+ return KEY_TYPE_reflink_v; -+ case KEY_TYPE_inline_data: -+ return KEY_TYPE_indirect_inline_data; -+ default: -+ return 0; -+ } -+} -+ -+/* reflink pointers */ -+ -+int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ -+ if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && -+ le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { -+ prt_printf(err, "idx < front_pad (%llu < %u)", -+ le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ -+ prt_printf(out, "idx %llu front_pad %u back_pad %u", -+ le64_to_cpu(p.v->idx), -+ 
le32_to_cpu(p.v->front_pad), -+ le32_to_cpu(p.v->back_pad)); -+} -+ -+bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); -+ struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); -+ -+ /* -+ * Disabled for now, the triggers code needs to be reworked for merging -+ * of reflink pointers to work: -+ */ -+ return false; -+ -+ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) -+ return false; -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ return true; -+} -+ -+/* indirect extents */ -+ -+int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ return bch2_bkey_ptrs_invalid(c, k, flags, err); -+} -+ -+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); -+ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); -+ -+ return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); -+} -+ -+int bch2_trans_mark_reflink_v(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_i *new, -+ unsigned flags) -+{ -+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { -+ struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); -+ -+ if (!r->v.refcount) { -+ r->k.type = KEY_TYPE_deleted; -+ r->k.size = 0; -+ set_bkey_val_u64s(&r->k, 0); -+ return 0; -+ } -+ } -+ -+ return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); -+} -+ -+/* indirect inline data */ -+ -+int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ return 0; -+} -+ -+void bch2_indirect_inline_data_to_text(struct printbuf *out, -+ struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); -+ unsigned datalen = bkey_inline_data_bytes(k.k); -+ -+ prt_printf(out, "refcount %llu datalen %u: %*phN", -+ le64_to_cpu(d.v->refcount), datalen, -+ min(datalen, 32U), d.v->data); -+} -+ -+int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_i *new, -+ unsigned flags) -+{ -+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { -+ struct bkey_i_indirect_inline_data *r = -+ bkey_i_to_indirect_inline_data(new); -+ -+ if (!r->v.refcount) { -+ r->k.type = KEY_TYPE_deleted; -+ r->k.size = 0; -+ set_bkey_val_u64s(&r->k, 0); -+ } -+ } -+ -+ return 0; -+} -+ -+static int bch2_make_extent_indirect(struct btree_trans *trans, -+ struct btree_iter *extent_iter, -+ struct bkey_i *orig) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter reflink_iter = { NULL }; -+ struct bkey_s_c k; -+ struct bkey_i *r_v; -+ struct bkey_i_reflink_p *r_p; -+ __le64 *refcount; -+ int ret; -+ -+ if (orig->k.type == KEY_TYPE_inline_data) -+ bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); -+ -+ bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, -+ BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek_prev(&reflink_iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + 
bkey_bytes(&orig->k)); -+ ret = PTR_ERR_OR_ZERO(r_v); -+ if (ret) -+ goto err; -+ -+ bkey_init(&r_v->k); -+ r_v->k.type = bkey_type_to_indirect(&orig->k); -+ r_v->k.p = reflink_iter.pos; -+ bch2_key_resize(&r_v->k, orig->k.size); -+ r_v->k.version = orig->k.version; -+ -+ set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); -+ -+ refcount = bkey_refcount(r_v); -+ *refcount = 0; -+ memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); -+ -+ ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); -+ if (ret) -+ goto err; -+ -+ /* -+ * orig is in a bkey_buf which statically allocates 5 64s for the val, -+ * so we know it will be big enough: -+ */ -+ orig->k.type = KEY_TYPE_reflink_p; -+ r_p = bkey_i_to_reflink_p(orig); -+ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); -+ -+ /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */ -+#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) -+ __underlying_memset(&r_p->v, 0, sizeof(r_p->v)); -+#else -+ memset(&r_p->v, 0, sizeof(r_p->v)); -+#endif -+ -+ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); -+ -+ ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+err: -+ bch2_trans_iter_exit(trans, &reflink_iter); -+ -+ return ret; -+} -+ -+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) -+{ -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { -+ if (bkey_extent_is_unwritten(k)) -+ continue; -+ -+ if (bkey_extent_is_data(k.k)) -+ return k; -+ } -+ -+ if (bkey_ge(iter->pos, end)) -+ bch2_btree_iter_set_pos(iter, end); -+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null; -+} -+ -+s64 bch2_remap_range(struct bch_fs *c, -+ subvol_inum dst_inum, u64 dst_offset, -+ subvol_inum src_inum, u64 src_offset, -+ u64 remap_sectors, -+ u64 new_i_size, s64 *i_sectors_delta) -+{ -+ struct btree_trans trans; -+ struct btree_iter dst_iter, src_iter; -+ struct bkey_s_c src_k; -+ struct bkey_buf new_dst, new_src; -+ struct bpos dst_start = POS(dst_inum.inum, dst_offset); -+ struct bpos src_start = POS(src_inum.inum, src_offset); -+ struct bpos dst_end = dst_start, src_end = src_start; -+ struct bpos src_want; -+ u64 dst_done; -+ u32 dst_snapshot, src_snapshot; -+ int ret = 0, ret2 = 0; -+ -+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) -+ return -BCH_ERR_erofs_no_writes; -+ -+ bch2_check_set_feature(c, BCH_FEATURE_reflink); -+ -+ dst_end.offset += remap_sectors; -+ src_end.offset += remap_sectors; -+ -+ bch2_bkey_buf_init(&new_dst); -+ bch2_bkey_buf_init(&new_src); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); -+ -+ bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start, -+ BTREE_ITER_INTENT); -+ bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, -+ BTREE_ITER_INTENT); -+ -+ while ((ret == 0 || -+ bch2_err_matches(ret, BCH_ERR_transaction_restart)) && -+ bkey_lt(dst_iter.pos, dst_end)) { -+ struct disk_reservation disk_res = { 0 }; -+ -+ bch2_trans_begin(&trans); -+ -+ if (fatal_signal_pending(current)) { -+ ret = -EINTR; -+ break; -+ } -+ -+ ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, -+ &src_snapshot); -+ if (ret) -+ continue; -+ -+ bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); -+ -+ ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, -+ &dst_snapshot); -+ if (ret) -+ continue; -+ -+ bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); -+ -+ dst_done = dst_iter.pos.offset - dst_start.offset; -+ 
src_want = POS(src_start.inode, src_start.offset + dst_done); -+ bch2_btree_iter_set_pos(&src_iter, src_want); -+ -+ src_k = get_next_src(&src_iter, src_end); -+ ret = bkey_err(src_k); -+ if (ret) -+ continue; -+ -+ if (bkey_lt(src_want, src_iter.pos)) { -+ ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, -+ min(dst_end.offset, -+ dst_iter.pos.offset + -+ src_iter.pos.offset - src_want.offset), -+ i_sectors_delta); -+ continue; -+ } -+ -+ if (src_k.k->type != KEY_TYPE_reflink_p) { -+ bch2_btree_iter_set_pos_to_extent_start(&src_iter); -+ -+ bch2_bkey_buf_reassemble(&new_src, c, src_k); -+ src_k = bkey_i_to_s_c(new_src.k); -+ -+ ret = bch2_make_extent_indirect(&trans, &src_iter, -+ new_src.k); -+ if (ret) -+ continue; -+ -+ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); -+ } -+ -+ if (src_k.k->type == KEY_TYPE_reflink_p) { -+ struct bkey_s_c_reflink_p src_p = -+ bkey_s_c_to_reflink_p(src_k); -+ struct bkey_i_reflink_p *dst_p = -+ bkey_reflink_p_init(new_dst.k); -+ -+ u64 offset = le64_to_cpu(src_p.v->idx) + -+ (src_want.offset - -+ bkey_start_offset(src_k.k)); -+ -+ dst_p->v.idx = cpu_to_le64(offset); -+ } else { -+ BUG(); -+ } -+ -+ new_dst.k->k.p = dst_iter.pos; -+ bch2_key_resize(&new_dst.k->k, -+ min(src_k.k->p.offset - src_want.offset, -+ dst_end.offset - dst_iter.pos.offset)); -+ -+ ret = bch2_extent_update(&trans, dst_inum, &dst_iter, -+ new_dst.k, &disk_res, -+ new_i_size, i_sectors_delta, -+ true); -+ bch2_disk_reservation_put(c, &disk_res); -+ } -+ bch2_trans_iter_exit(&trans, &dst_iter); -+ bch2_trans_iter_exit(&trans, &src_iter); -+ -+ BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); -+ BUG_ON(bkey_gt(dst_iter.pos, dst_end)); -+ -+ dst_done = dst_iter.pos.offset - dst_start.offset; -+ new_i_size = min(dst_iter.pos.offset << 9, new_i_size); -+ -+ do { -+ struct bch_inode_unpacked inode_u; -+ struct btree_iter inode_iter = { NULL }; -+ -+ bch2_trans_begin(&trans); -+ -+ ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, -+ dst_inum, BTREE_ITER_INTENT); -+ -+ if (!ret2 && -+ inode_u.bi_size < new_i_size) { -+ inode_u.bi_size = new_i_size; -+ ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ } -+ -+ bch2_trans_iter_exit(&trans, &inode_iter); -+ } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); -+ -+ bch2_trans_exit(&trans); -+ bch2_bkey_buf_exit(&new_src, c); -+ bch2_bkey_buf_exit(&new_dst, c); -+ -+ bch2_write_ref_put(c, BCH_WRITE_REF_reflink); -+ -+ return dst_done ?: ret ?: ret2; -+} -diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h -new file mode 100644 -index 000000000..fe52538ef ---- /dev/null -+++ b/fs/bcachefs/reflink.h -@@ -0,0 +1,81 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REFLINK_H -+#define _BCACHEFS_REFLINK_H -+ -+enum bkey_invalid_flags; -+ -+int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); -+ -+#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ -+ .key_invalid = bch2_reflink_p_invalid, \ -+ .val_to_text = bch2_reflink_p_to_text, \ -+ .key_merge = bch2_reflink_p_merge, \ -+ .trans_trigger = bch2_trans_mark_reflink_p, \ -+ .atomic_trigger = bch2_mark_reflink_p, \ -+ .min_val_size = 16, \ -+}) -+ -+int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void 
bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_i *, unsigned); -+ -+#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ -+ .key_invalid = bch2_reflink_v_invalid, \ -+ .val_to_text = bch2_reflink_v_to_text, \ -+ .swab = bch2_ptr_swab, \ -+ .trans_trigger = bch2_trans_mark_reflink_v, \ -+ .atomic_trigger = bch2_mark_extent, \ -+ .min_val_size = 8, \ -+}) -+ -+int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_indirect_inline_data_to_text(struct printbuf *, -+ struct bch_fs *, struct bkey_s_c); -+int bch2_trans_mark_indirect_inline_data(struct btree_trans *, -+ enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_i *, -+ unsigned); -+ -+#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ -+ .key_invalid = bch2_indirect_inline_data_invalid, \ -+ .val_to_text = bch2_indirect_inline_data_to_text, \ -+ .trans_trigger = bch2_trans_mark_indirect_inline_data, \ -+ .min_val_size = 8, \ -+}) -+ -+static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_reflink_v: -+ return &bkey_s_c_to_reflink_v(k).v->refcount; -+ case KEY_TYPE_indirect_inline_data: -+ return &bkey_s_c_to_indirect_inline_data(k).v->refcount; -+ default: -+ return NULL; -+ } -+} -+ -+static inline __le64 *bkey_refcount(struct bkey_i *k) -+{ -+ switch (k->k.type) { -+ case KEY_TYPE_reflink_v: -+ return &bkey_i_to_reflink_v(k)->v.refcount; -+ case KEY_TYPE_indirect_inline_data: -+ return &bkey_i_to_indirect_inline_data(k)->v.refcount; -+ default: -+ return NULL; -+ } -+} -+ -+s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, -+ subvol_inum, u64, u64, u64, s64 *); -+ -+#endif /* _BCACHEFS_REFLINK_H */ -diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c -new file mode 100644 -index 000000000..5b591c59b ---- /dev/null -+++ b/fs/bcachefs/replicas.c -@@ -0,0 +1,1059 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "buckets.h" -+#include "journal.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, -+ struct bch_replicas_cpu *); -+ -+/* Replicas tracking - in memory: */ -+ -+static void verify_replicas_entry(struct bch_replicas_entry *e) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ unsigned i; -+ -+ BUG_ON(e->data_type >= BCH_DATA_NR); -+ BUG_ON(!e->nr_devs); -+ BUG_ON(e->nr_required > 1 && -+ e->nr_required >= e->nr_devs); -+ -+ for (i = 0; i + 1 < e->nr_devs; i++) -+ BUG_ON(e->devs[i] >= e->devs[i + 1]); -+#endif -+} -+ -+void bch2_replicas_entry_sort(struct bch_replicas_entry *e) -+{ -+ bubble_sort(e->devs, e->nr_devs, u8_cmp); -+} -+ -+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) -+{ -+ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); -+} -+ -+static void bch2_replicas_entry_v0_to_text(struct printbuf *out, -+ struct bch_replicas_entry_v0 *e) -+{ -+ unsigned i; -+ -+ if (e->data_type < BCH_DATA_NR) -+ prt_printf(out, "%s", bch2_data_types[e->data_type]); -+ else -+ prt_printf(out, "(invalid data type %u)", e->data_type); -+ -+ prt_printf(out, ": %u [", e->nr_devs); -+ for (i = 0; i < e->nr_devs; i++) -+ prt_printf(out, i ? 
" %u" : "%u", e->devs[i]); -+ prt_printf(out, "]"); -+} -+ -+void bch2_replicas_entry_to_text(struct printbuf *out, -+ struct bch_replicas_entry *e) -+{ -+ unsigned i; -+ -+ if (e->data_type < BCH_DATA_NR) -+ prt_printf(out, "%s", bch2_data_types[e->data_type]); -+ else -+ prt_printf(out, "(invalid data type %u)", e->data_type); -+ -+ prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); -+ for (i = 0; i < e->nr_devs; i++) -+ prt_printf(out, i ? " %u" : "%u", e->devs[i]); -+ prt_printf(out, "]"); -+} -+ -+void bch2_cpu_replicas_to_text(struct printbuf *out, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_replicas_entry *e; -+ bool first = true; -+ -+ for_each_cpu_replicas_entry(r, e) { -+ if (!first) -+ prt_printf(out, " "); -+ first = false; -+ -+ bch2_replicas_entry_to_text(out, e); -+ } -+} -+ -+static void extent_to_replicas(struct bkey_s_c k, -+ struct bch_replicas_entry *r) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ r->nr_required = 1; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if (p.ptr.cached) -+ continue; -+ -+ if (!p.has_ec) -+ r->devs[r->nr_devs++] = p.ptr.dev; -+ else -+ r->nr_required = 0; -+ } -+} -+ -+static void stripe_to_replicas(struct bkey_s_c k, -+ struct bch_replicas_entry *r) -+{ -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ const struct bch_extent_ptr *ptr; -+ -+ r->nr_required = s.v->nr_blocks - s.v->nr_redundant; -+ -+ for (ptr = s.v->ptrs; -+ ptr < s.v->ptrs + s.v->nr_blocks; -+ ptr++) -+ r->devs[r->nr_devs++] = ptr->dev; -+} -+ -+void bch2_bkey_to_replicas(struct bch_replicas_entry *e, -+ struct bkey_s_c k) -+{ -+ e->nr_devs = 0; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ e->data_type = BCH_DATA_btree; -+ extent_to_replicas(k, e); -+ break; -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ e->data_type = BCH_DATA_user; -+ extent_to_replicas(k, e); -+ break; -+ case KEY_TYPE_stripe: -+ e->data_type = BCH_DATA_parity; -+ stripe_to_replicas(k, e); -+ break; -+ } -+ -+ bch2_replicas_entry_sort(e); -+} -+ -+void bch2_devlist_to_replicas(struct bch_replicas_entry *e, -+ enum bch_data_type data_type, -+ struct bch_devs_list devs) -+{ -+ unsigned i; -+ -+ BUG_ON(!data_type || -+ data_type == BCH_DATA_sb || -+ data_type >= BCH_DATA_NR); -+ -+ e->data_type = data_type; -+ e->nr_devs = 0; -+ e->nr_required = 1; -+ -+ for (i = 0; i < devs.nr; i++) -+ e->devs[e->nr_devs++] = devs.devs[i]; -+ -+ bch2_replicas_entry_sort(e); -+} -+ -+static struct bch_replicas_cpu -+cpu_replicas_add_entry(struct bch_replicas_cpu *old, -+ struct bch_replicas_entry *new_entry) -+{ -+ unsigned i; -+ struct bch_replicas_cpu new = { -+ .nr = old->nr + 1, -+ .entry_size = max_t(unsigned, old->entry_size, -+ replicas_entry_bytes(new_entry)), -+ }; -+ -+ BUG_ON(!new_entry->data_type); -+ verify_replicas_entry(new_entry); -+ -+ new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); -+ if (!new.entries) -+ return new; -+ -+ for (i = 0; i < old->nr; i++) -+ memcpy(cpu_replicas_entry(&new, i), -+ cpu_replicas_entry(old, i), -+ old->entry_size); -+ -+ memcpy(cpu_replicas_entry(&new, old->nr), -+ new_entry, -+ replicas_entry_bytes(new_entry)); -+ -+ bch2_cpu_replicas_sort(&new); -+ return new; -+} -+ -+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, -+ struct bch_replicas_entry *search) -+{ -+ int idx, entry_size = replicas_entry_bytes(search); -+ -+ if (unlikely(entry_size > r->entry_size)) -+ return -1; -+ -+ 
verify_replicas_entry(search); -+ -+#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) -+ idx = eytzinger0_find(r->entries, r->nr, r->entry_size, -+ entry_cmp, search); -+#undef entry_cmp -+ -+ return idx < r->nr ? idx : -1; -+} -+ -+int bch2_replicas_entry_idx(struct bch_fs *c, -+ struct bch_replicas_entry *search) -+{ -+ bch2_replicas_entry_sort(search); -+ -+ return __replicas_entry_idx(&c->replicas, search); -+} -+ -+static bool __replicas_has_entry(struct bch_replicas_cpu *r, -+ struct bch_replicas_entry *search) -+{ -+ return __replicas_entry_idx(r, search) >= 0; -+} -+ -+bool bch2_replicas_marked(struct bch_fs *c, -+ struct bch_replicas_entry *search) -+{ -+ bool marked; -+ -+ if (!search->nr_devs) -+ return true; -+ -+ verify_replicas_entry(search); -+ -+ percpu_down_read(&c->mark_lock); -+ marked = __replicas_has_entry(&c->replicas, search) && -+ (likely((!c->replicas_gc.entries)) || -+ __replicas_has_entry(&c->replicas_gc, search)); -+ percpu_up_read(&c->mark_lock); -+ -+ return marked; -+} -+ -+static void __replicas_table_update(struct bch_fs_usage *dst, -+ struct bch_replicas_cpu *dst_r, -+ struct bch_fs_usage *src, -+ struct bch_replicas_cpu *src_r) -+{ -+ int src_idx, dst_idx; -+ -+ *dst = *src; -+ -+ for (src_idx = 0; src_idx < src_r->nr; src_idx++) { -+ if (!src->replicas[src_idx]) -+ continue; -+ -+ dst_idx = __replicas_entry_idx(dst_r, -+ cpu_replicas_entry(src_r, src_idx)); -+ BUG_ON(dst_idx < 0); -+ -+ dst->replicas[dst_idx] = src->replicas[src_idx]; -+ } -+} -+ -+static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, -+ struct bch_replicas_cpu *dst_r, -+ struct bch_fs_usage __percpu *src_p, -+ struct bch_replicas_cpu *src_r) -+{ -+ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; -+ struct bch_fs_usage *dst, *src = (void *) -+ bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr); -+ -+ preempt_disable(); -+ dst = this_cpu_ptr(dst_p); -+ preempt_enable(); -+ -+ __replicas_table_update(dst, dst_r, src, src_r); -+} -+ -+/* -+ * Resize filesystem accounting: -+ */ -+static int replicas_table_update(struct bch_fs *c, -+ struct bch_replicas_cpu *new_r) -+{ -+ struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; -+ struct bch_fs_usage_online *new_scratch = NULL; -+ struct bch_fs_usage __percpu *new_gc = NULL; -+ struct bch_fs_usage *new_base = NULL; -+ unsigned i, bytes = sizeof(struct bch_fs_usage) + -+ sizeof(u64) * new_r->nr; -+ unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + -+ sizeof(u64) * new_r->nr; -+ int ret = 0; -+ -+ memset(new_usage, 0, sizeof(new_usage)); -+ -+ for (i = 0; i < ARRAY_SIZE(new_usage); i++) -+ if (!(new_usage[i] = __alloc_percpu_gfp(bytes, -+ sizeof(u64), GFP_KERNEL))) -+ goto err; -+ -+ if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || -+ !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || -+ (c->usage_gc && -+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) -+ goto err; -+ -+ for (i = 0; i < ARRAY_SIZE(new_usage); i++) -+ if (c->usage[i]) -+ __replicas_table_update_pcpu(new_usage[i], new_r, -+ c->usage[i], &c->replicas); -+ if (c->usage_base) -+ __replicas_table_update(new_base, new_r, -+ c->usage_base, &c->replicas); -+ if (c->usage_gc) -+ __replicas_table_update_pcpu(new_gc, new_r, -+ c->usage_gc, &c->replicas); -+ -+ for (i = 0; i < ARRAY_SIZE(new_usage); i++) -+ swap(c->usage[i], new_usage[i]); -+ swap(c->usage_base, new_base); -+ swap(c->usage_scratch, new_scratch); -+ swap(c->usage_gc, new_gc); -+ swap(c->replicas, *new_r); -+out: -+ 
free_percpu(new_gc); -+ kfree(new_scratch); -+ for (i = 0; i < ARRAY_SIZE(new_usage); i++) -+ free_percpu(new_usage[i]); -+ kfree(new_base); -+ return ret; -+err: -+ bch_err(c, "error updating replicas table: memory allocation failure"); -+ ret = -BCH_ERR_ENOMEM_replicas_table; -+ goto out; -+} -+ -+static unsigned reserve_journal_replicas(struct bch_fs *c, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_replicas_entry *e; -+ unsigned journal_res_u64s = 0; -+ -+ /* nr_inodes: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); -+ -+ /* key_version: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); -+ -+ /* persistent_reserved: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * -+ BCH_REPLICAS_MAX; -+ -+ for_each_cpu_replicas_entry(r, e) -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + -+ e->nr_devs, sizeof(u64)); -+ return journal_res_u64s; -+} -+ -+noinline -+static int bch2_mark_replicas_slowpath(struct bch_fs *c, -+ struct bch_replicas_entry *new_entry) -+{ -+ struct bch_replicas_cpu new_r, new_gc; -+ int ret = 0; -+ -+ verify_replicas_entry(new_entry); -+ -+ memset(&new_r, 0, sizeof(new_r)); -+ memset(&new_gc, 0, sizeof(new_gc)); -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (c->replicas_gc.entries && -+ !__replicas_has_entry(&c->replicas_gc, new_entry)) { -+ new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); -+ if (!new_gc.entries) { -+ ret = -BCH_ERR_ENOMEM_cpu_replicas; -+ goto err; -+ } -+ } -+ -+ if (!__replicas_has_entry(&c->replicas, new_entry)) { -+ new_r = cpu_replicas_add_entry(&c->replicas, new_entry); -+ if (!new_r.entries) { -+ ret = -BCH_ERR_ENOMEM_cpu_replicas; -+ goto err; -+ } -+ -+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); -+ if (ret) -+ goto err; -+ -+ bch2_journal_entry_res_resize(&c->journal, -+ &c->replicas_journal_res, -+ reserve_journal_replicas(c, &new_r)); -+ } -+ -+ if (!new_r.entries && -+ !new_gc.entries) -+ goto out; -+ -+ /* allocations done, now commit: */ -+ -+ if (new_r.entries) -+ bch2_write_super(c); -+ -+ /* don't update in memory replicas until changes are persistent */ -+ percpu_down_write(&c->mark_lock); -+ if (new_r.entries) -+ ret = replicas_table_update(c, &new_r); -+ if (new_gc.entries) -+ swap(new_gc, c->replicas_gc); -+ percpu_up_write(&c->mark_lock); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ kfree(new_r.entries); -+ kfree(new_gc.entries); -+ -+ return ret; -+err: -+ bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret)); -+ goto out; -+} -+ -+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) -+{ -+ return likely(bch2_replicas_marked(c, r)) -+ ? 
0 : bch2_mark_replicas_slowpath(c, r); -+} -+ -+/* replicas delta list: */ -+ -+int bch2_replicas_delta_list_mark(struct bch_fs *c, -+ struct replicas_delta_list *r) -+{ -+ struct replicas_delta *d = r->d; -+ struct replicas_delta *top = (void *) r->d + r->used; -+ int ret = 0; -+ -+ for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) -+ ret = bch2_mark_replicas(c, &d->r); -+ return ret; -+} -+ -+/* -+ * Old replicas_gc mechanism: only used for journal replicas entries now, should -+ * die at some point: -+ */ -+ -+int bch2_replicas_gc_end(struct bch_fs *c, int ret) -+{ -+ lockdep_assert_held(&c->replicas_gc_lock); -+ -+ if (ret) -+ goto err; -+ -+ mutex_lock(&c->sb_lock); -+ percpu_down_write(&c->mark_lock); -+ -+ ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); -+ if (ret) -+ goto err; -+ -+ ret = replicas_table_update(c, &c->replicas_gc); -+err: -+ kfree(c->replicas_gc.entries); -+ c->replicas_gc.entries = NULL; -+ -+ percpu_up_write(&c->mark_lock); -+ -+ if (!ret) -+ bch2_write_super(c); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) -+{ -+ struct bch_replicas_entry *e; -+ unsigned i = 0; -+ -+ lockdep_assert_held(&c->replicas_gc_lock); -+ -+ mutex_lock(&c->sb_lock); -+ BUG_ON(c->replicas_gc.entries); -+ -+ c->replicas_gc.nr = 0; -+ c->replicas_gc.entry_size = 0; -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ if (!((1 << e->data_type) & typemask)) { -+ c->replicas_gc.nr++; -+ c->replicas_gc.entry_size = -+ max_t(unsigned, c->replicas_gc.entry_size, -+ replicas_entry_bytes(e)); -+ } -+ -+ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, -+ c->replicas_gc.entry_size, -+ GFP_KERNEL); -+ if (!c->replicas_gc.entries) { -+ mutex_unlock(&c->sb_lock); -+ bch_err(c, "error allocating c->replicas_gc"); -+ return -BCH_ERR_ENOMEM_replicas_gc; -+ } -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ if (!((1 << e->data_type) & typemask)) -+ memcpy(cpu_replicas_entry(&c->replicas_gc, i++), -+ e, c->replicas_gc.entry_size); -+ -+ bch2_cpu_replicas_sort(&c->replicas_gc); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+/* -+ * New much simpler mechanism for clearing out unneeded replicas entries - drop -+ * replicas entries that have 0 sectors used. -+ * -+ * However, we don't track sector counts for journal usage, so this doesn't drop -+ * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism -+ * is retained for that. 
-+ */ -+int bch2_replicas_gc2(struct bch_fs *c) -+{ -+ struct bch_replicas_cpu new = { 0 }; -+ unsigned i, nr; -+ int ret = 0; -+ -+ bch2_journal_meta(&c->journal); -+retry: -+ nr = READ_ONCE(c->replicas.nr); -+ new.entry_size = READ_ONCE(c->replicas.entry_size); -+ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); -+ if (!new.entries) { -+ bch_err(c, "error allocating c->replicas_gc"); -+ return -BCH_ERR_ENOMEM_replicas_gc; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ percpu_down_write(&c->mark_lock); -+ -+ if (nr != c->replicas.nr || -+ new.entry_size != c->replicas.entry_size) { -+ percpu_up_write(&c->mark_lock); -+ mutex_unlock(&c->sb_lock); -+ kfree(new.entries); -+ goto retry; -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ if (e->data_type == BCH_DATA_journal || -+ c->usage_base->replicas[i] || -+ percpu_u64_get(&c->usage[0]->replicas[i]) || -+ percpu_u64_get(&c->usage[1]->replicas[i]) || -+ percpu_u64_get(&c->usage[2]->replicas[i]) || -+ percpu_u64_get(&c->usage[3]->replicas[i])) -+ memcpy(cpu_replicas_entry(&new, new.nr++), -+ e, new.entry_size); -+ } -+ -+ bch2_cpu_replicas_sort(&new); -+ -+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new); -+ if (ret) -+ goto err; -+ -+ ret = replicas_table_update(c, &new); -+err: -+ kfree(new.entries); -+ -+ percpu_up_write(&c->mark_lock); -+ -+ if (!ret) -+ bch2_write_super(c); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_replicas_set_usage(struct bch_fs *c, -+ struct bch_replicas_entry *r, -+ u64 sectors) -+{ -+ int ret, idx = bch2_replicas_entry_idx(c, r); -+ -+ if (idx < 0) { -+ struct bch_replicas_cpu n; -+ -+ n = cpu_replicas_add_entry(&c->replicas, r); -+ if (!n.entries) -+ return -BCH_ERR_ENOMEM_cpu_replicas; -+ -+ ret = replicas_table_update(c, &n); -+ if (ret) -+ return ret; -+ -+ kfree(n.entries); -+ -+ idx = bch2_replicas_entry_idx(c, r); -+ BUG_ON(ret < 0); -+ } -+ -+ c->usage_base->replicas[idx] = sectors; -+ -+ return 0; -+} -+ -+/* Replicas tracking - superblock: */ -+ -+static int -+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, -+ struct bch_replicas_cpu *cpu_r) -+{ -+ struct bch_replicas_entry *e, *dst; -+ unsigned nr = 0, entry_size = 0, idx = 0; -+ -+ for_each_replicas_entry(sb_r, e) { -+ entry_size = max_t(unsigned, entry_size, -+ replicas_entry_bytes(e)); -+ nr++; -+ } -+ -+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); -+ if (!cpu_r->entries) -+ return -BCH_ERR_ENOMEM_cpu_replicas; -+ -+ cpu_r->nr = nr; -+ cpu_r->entry_size = entry_size; -+ -+ for_each_replicas_entry(sb_r, e) { -+ dst = cpu_replicas_entry(cpu_r, idx++); -+ memcpy(dst, e, replicas_entry_bytes(e)); -+ bch2_replicas_entry_sort(dst); -+ } -+ -+ return 0; -+} -+ -+static int -+__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, -+ struct bch_replicas_cpu *cpu_r) -+{ -+ struct bch_replicas_entry_v0 *e; -+ unsigned nr = 0, entry_size = 0, idx = 0; -+ -+ for_each_replicas_entry(sb_r, e) { -+ entry_size = max_t(unsigned, entry_size, -+ replicas_entry_bytes(e)); -+ nr++; -+ } -+ -+ entry_size += sizeof(struct bch_replicas_entry) - -+ sizeof(struct bch_replicas_entry_v0); -+ -+ cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); -+ if (!cpu_r->entries) -+ return -BCH_ERR_ENOMEM_cpu_replicas; -+ -+ cpu_r->nr = nr; -+ cpu_r->entry_size = entry_size; -+ -+ for_each_replicas_entry(sb_r, e) { -+ struct bch_replicas_entry *dst = -+ cpu_replicas_entry(cpu_r, idx++); -+ -+ dst->data_type = e->data_type; -+ 
dst->nr_devs = e->nr_devs; -+ dst->nr_required = 1; -+ memcpy(dst->devs, e->devs, e->nr_devs); -+ bch2_replicas_entry_sort(dst); -+ } -+ -+ return 0; -+} -+ -+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) -+{ -+ struct bch_sb_field_replicas *sb_v1; -+ struct bch_sb_field_replicas_v0 *sb_v0; -+ struct bch_replicas_cpu new_r = { 0, 0, NULL }; -+ int ret = 0; -+ -+ if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) -+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); -+ else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) -+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); -+ if (ret) -+ return ret; -+ -+ bch2_cpu_replicas_sort(&new_r); -+ -+ percpu_down_write(&c->mark_lock); -+ -+ ret = replicas_table_update(c, &new_r); -+ percpu_up_write(&c->mark_lock); -+ -+ kfree(new_r.entries); -+ -+ return 0; -+} -+ -+static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_sb_field_replicas_v0 *sb_r; -+ struct bch_replicas_entry_v0 *dst; -+ struct bch_replicas_entry *src; -+ size_t bytes; -+ -+ bytes = sizeof(struct bch_sb_field_replicas); -+ -+ for_each_cpu_replicas_entry(r, src) -+ bytes += replicas_entry_bytes(src) - 1; -+ -+ sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, -+ DIV_ROUND_UP(bytes, sizeof(u64))); -+ if (!sb_r) -+ return -BCH_ERR_ENOSPC_sb_replicas; -+ -+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); -+ sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); -+ -+ memset(&sb_r->entries, 0, -+ vstruct_end(&sb_r->field) - -+ (void *) &sb_r->entries); -+ -+ dst = sb_r->entries; -+ for_each_cpu_replicas_entry(r, src) { -+ dst->data_type = src->data_type; -+ dst->nr_devs = src->nr_devs; -+ memcpy(dst->devs, src->devs, src->nr_devs); -+ -+ dst = replicas_entry_next(dst); -+ -+ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); -+ } -+ -+ return 0; -+} -+ -+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_sb_field_replicas *sb_r; -+ struct bch_replicas_entry *dst, *src; -+ bool need_v1 = false; -+ size_t bytes; -+ -+ bytes = sizeof(struct bch_sb_field_replicas); -+ -+ for_each_cpu_replicas_entry(r, src) { -+ bytes += replicas_entry_bytes(src); -+ if (src->nr_required != 1) -+ need_v1 = true; -+ } -+ -+ if (!need_v1) -+ return bch2_cpu_replicas_to_sb_replicas_v0(c, r); -+ -+ sb_r = bch2_sb_resize_replicas(&c->disk_sb, -+ DIV_ROUND_UP(bytes, sizeof(u64))); -+ if (!sb_r) -+ return -BCH_ERR_ENOSPC_sb_replicas; -+ -+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); -+ sb_r = bch2_sb_get_replicas(c->disk_sb.sb); -+ -+ memset(&sb_r->entries, 0, -+ vstruct_end(&sb_r->field) - -+ (void *) &sb_r->entries); -+ -+ dst = sb_r->entries; -+ for_each_cpu_replicas_entry(r, src) { -+ memcpy(dst, src, replicas_entry_bytes(src)); -+ -+ dst = replicas_entry_next(dst); -+ -+ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); -+ } -+ -+ return 0; -+} -+ -+static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, -+ struct bch_sb *sb, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ unsigned i, j; -+ -+ sort_cmp_size(cpu_r->entries, -+ cpu_r->nr, -+ cpu_r->entry_size, -+ memcmp, NULL); -+ -+ for (i = 0; i < cpu_r->nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(cpu_r, i); -+ -+ if (e->data_type >= BCH_DATA_NR) { -+ prt_printf(err, "invalid data type in entry "); -+ bch2_replicas_entry_to_text(err, e); -+ return -BCH_ERR_invalid_sb_replicas; -+ } -+ -+ if (!e->nr_devs) { -+ 
prt_printf(err, "no devices in entry "); -+ bch2_replicas_entry_to_text(err, e); -+ return -BCH_ERR_invalid_sb_replicas; -+ } -+ -+ if (e->nr_required > 1 && -+ e->nr_required >= e->nr_devs) { -+ prt_printf(err, "bad nr_required in entry "); -+ bch2_replicas_entry_to_text(err, e); -+ return -BCH_ERR_invalid_sb_replicas; -+ } -+ -+ for (j = 0; j < e->nr_devs; j++) -+ if (!bch2_dev_exists(sb, mi, e->devs[j])) { -+ prt_printf(err, "invalid device %u in entry ", e->devs[j]); -+ bch2_replicas_entry_to_text(err, e); -+ return -BCH_ERR_invalid_sb_replicas; -+ } -+ -+ if (i + 1 < cpu_r->nr) { -+ struct bch_replicas_entry *n = -+ cpu_replicas_entry(cpu_r, i + 1); -+ -+ BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); -+ -+ if (!memcmp(e, n, cpu_r->entry_size)) { -+ prt_printf(err, "duplicate replicas entry "); -+ bch2_replicas_entry_to_text(err, e); -+ return -BCH_ERR_invalid_sb_replicas; -+ } -+ } -+ } -+ -+ return 0; -+} -+ -+static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); -+ struct bch_replicas_cpu cpu_r; -+ int ret; -+ -+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r); -+ if (ret) -+ return ret; -+ -+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); -+ kfree(cpu_r.entries); -+ return ret; -+} -+ -+static void bch2_sb_replicas_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_replicas *r = field_to_type(f, replicas); -+ struct bch_replicas_entry *e; -+ bool first = true; -+ -+ for_each_replicas_entry(r, e) { -+ if (!first) -+ prt_printf(out, " "); -+ first = false; -+ -+ bch2_replicas_entry_to_text(out, e); -+ } -+ prt_newline(out); -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_replicas = { -+ .validate = bch2_sb_replicas_validate, -+ .to_text = bch2_sb_replicas_to_text, -+}; -+ -+static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); -+ struct bch_replicas_cpu cpu_r; -+ int ret; -+ -+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r); -+ if (ret) -+ return ret; -+ -+ ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); -+ kfree(cpu_r.entries); -+ return ret; -+} -+ -+static void bch2_sb_replicas_v0_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); -+ struct bch_replicas_entry_v0 *e; -+ bool first = true; -+ -+ for_each_replicas_entry(sb_r, e) { -+ if (!first) -+ prt_printf(out, " "); -+ first = false; -+ -+ bch2_replicas_entry_v0_to_text(out, e); -+ } -+ prt_newline(out); -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { -+ .validate = bch2_sb_replicas_v0_validate, -+ .to_text = bch2_sb_replicas_v0_to_text, -+}; -+ -+/* Query replicas: */ -+ -+bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, -+ unsigned flags, bool print) -+{ -+ struct bch_replicas_entry *e; -+ bool ret = true; -+ -+ percpu_down_read(&c->mark_lock); -+ for_each_cpu_replicas_entry(&c->replicas, e) { -+ unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; -+ bool metadata = e->data_type < BCH_DATA_user; -+ -+ if (e->data_type == BCH_DATA_cached) -+ continue; -+ -+ for (i = 0; i < e->nr_devs; i++) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); -+ -+ nr_online += test_bit(e->devs[i], devs.d); -+ nr_failed += ca->mi.state == 
BCH_MEMBER_STATE_failed; -+ } -+ -+ if (nr_failed == e->nr_devs) -+ continue; -+ -+ if (nr_online < e->nr_required) -+ dflags |= metadata -+ ? BCH_FORCE_IF_METADATA_LOST -+ : BCH_FORCE_IF_DATA_LOST; -+ -+ if (nr_online < e->nr_devs) -+ dflags |= metadata -+ ? BCH_FORCE_IF_METADATA_DEGRADED -+ : BCH_FORCE_IF_DATA_DEGRADED; -+ -+ if (dflags & ~flags) { -+ if (print) { -+ struct printbuf buf = PRINTBUF; -+ -+ bch2_replicas_entry_to_text(&buf, e); -+ bch_err(c, "insufficient devices online (%u) for replicas entry %s", -+ nr_online, buf.buf); -+ printbuf_exit(&buf); -+ } -+ ret = false; -+ break; -+ } -+ -+ } -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) -+{ -+ struct bch_sb_field_replicas *replicas; -+ struct bch_sb_field_replicas_v0 *replicas_v0; -+ unsigned i, data_has = 0; -+ -+ replicas = bch2_sb_get_replicas(sb); -+ replicas_v0 = bch2_sb_get_replicas_v0(sb); -+ -+ if (replicas) { -+ struct bch_replicas_entry *r; -+ -+ for_each_replicas_entry(replicas, r) -+ for (i = 0; i < r->nr_devs; i++) -+ if (r->devs[i] == dev) -+ data_has |= 1 << r->data_type; -+ } else if (replicas_v0) { -+ struct bch_replicas_entry_v0 *r; -+ -+ for_each_replicas_entry_v0(replicas_v0, r) -+ for (i = 0; i < r->nr_devs; i++) -+ if (r->devs[i] == dev) -+ data_has |= 1 << r->data_type; -+ } -+ -+ -+ return data_has; -+} -+ -+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned ret; -+ -+ mutex_lock(&c->sb_lock); -+ ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+void bch2_fs_replicas_exit(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ kfree(c->usage_scratch); -+ for (i = 0; i < ARRAY_SIZE(c->usage); i++) -+ free_percpu(c->usage[i]); -+ kfree(c->usage_base); -+ kfree(c->replicas.entries); -+ kfree(c->replicas_gc.entries); -+ -+ mempool_exit(&c->replicas_delta_pool); -+} -+ -+int bch2_fs_replicas_init(struct bch_fs *c) -+{ -+ bch2_journal_entry_res_resize(&c->journal, -+ &c->replicas_journal_res, -+ reserve_journal_replicas(c, &c->replicas)); -+ -+ return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, -+ REPLICAS_DELTA_LIST_MAX) ?: -+ replicas_table_update(c, &c->replicas); -+} -diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h -new file mode 100644 -index 000000000..4887675a8 ---- /dev/null -+++ b/fs/bcachefs/replicas.h -@@ -0,0 +1,91 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REPLICAS_H -+#define _BCACHEFS_REPLICAS_H -+ -+#include "bkey.h" -+#include "eytzinger.h" -+#include "replicas_types.h" -+ -+void bch2_replicas_entry_sort(struct bch_replicas_entry *); -+void bch2_replicas_entry_to_text(struct printbuf *, -+ struct bch_replicas_entry *); -+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); -+ -+static inline struct bch_replicas_entry * -+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) -+{ -+ return (void *) r->entries + r->entry_size * i; -+} -+ -+int bch2_replicas_entry_idx(struct bch_fs *, -+ struct bch_replicas_entry *); -+ -+void bch2_devlist_to_replicas(struct bch_replicas_entry *, -+ enum bch_data_type, -+ struct bch_devs_list); -+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); -+int bch2_mark_replicas(struct bch_fs *, -+ struct bch_replicas_entry *); -+ -+static inline struct replicas_delta * -+replicas_delta_next(struct replicas_delta *d) -+{ -+ return (void *) d + replicas_entry_bytes(&d->r) + 8; -+} -+ -+int 
bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); -+ -+void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -+ -+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, -+ unsigned dev) -+{ -+ e->data_type = BCH_DATA_cached; -+ e->nr_devs = 1; -+ e->nr_required = 1; -+ e->devs[0] = dev; -+} -+ -+bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, -+ unsigned, bool); -+ -+unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); -+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); -+ -+int bch2_replicas_gc_end(struct bch_fs *, int); -+int bch2_replicas_gc_start(struct bch_fs *, unsigned); -+int bch2_replicas_gc2(struct bch_fs *); -+ -+int bch2_replicas_set_usage(struct bch_fs *, -+ struct bch_replicas_entry *, -+ u64); -+ -+#define for_each_cpu_replicas_entry(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ -+ _i = (void *) (_i) + (_r)->entry_size) -+ -+/* iterate over superblock replicas - used by userspace tools: */ -+ -+#define replicas_entry_next(_i) \ -+ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) -+ -+#define for_each_replicas_entry(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ -+ (_i) = replicas_entry_next(_i)) -+ -+#define for_each_replicas_entry_v0(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ -+ (_i) = replicas_entry_next(_i)) -+ -+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; -+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; -+ -+void bch2_fs_replicas_exit(struct bch_fs *); -+int bch2_fs_replicas_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_REPLICAS_H */ -diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h -new file mode 100644 -index 000000000..5cfff489b ---- /dev/null -+++ b/fs/bcachefs/replicas_types.h -@@ -0,0 +1,27 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REPLICAS_TYPES_H -+#define _BCACHEFS_REPLICAS_TYPES_H -+ -+struct bch_replicas_cpu { -+ unsigned nr; -+ unsigned entry_size; -+ struct bch_replicas_entry *entries; -+}; -+ -+struct replicas_delta { -+ s64 delta; -+ struct bch_replicas_entry r; -+} __packed; -+ -+struct replicas_delta_list { -+ unsigned size; -+ unsigned used; -+ -+ struct {} memset_start; -+ u64 nr_inodes; -+ u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ struct {} memset_end; -+ struct replicas_delta d[0]; -+}; -+ -+#endif /* _BCACHEFS_REPLICAS_TYPES_H */ -diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c -new file mode 100644 -index 000000000..a3695e56a ---- /dev/null -+++ b/fs/bcachefs/sb-clean.c -@@ -0,0 +1,395 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "error.h" -+#include "journal_io.h" -+#include "replicas.h" -+#include "sb-clean.h" -+#include "super-io.h" -+ -+/* -+ * BCH_SB_FIELD_clean: -+ * -+ * Btree roots, and a few other things, are recovered from the journal after an -+ * unclean shutdown - but after a clean shutdown, to avoid having to read the -+ * journal, we can store them in the superblock. 
-+ * -+ * bch_sb_field_clean simply contains a list of journal entries, stored exactly -+ * as they would be in the journal: -+ */ -+ -+int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, -+ int write) -+{ -+ struct jset_entry *entry; -+ int ret; -+ -+ for (entry = clean->start; -+ entry < (struct jset_entry *) vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) { -+ ret = bch2_journal_entry_validate(c, NULL, entry, -+ le16_to_cpu(c->disk_sb.sb->version), -+ BCH_SB_BIG_ENDIAN(c->disk_sb.sb), -+ write); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static struct bkey_i *btree_root_find(struct bch_fs *c, -+ struct bch_sb_field_clean *clean, -+ struct jset *j, -+ enum btree_id id, unsigned *level) -+{ -+ struct bkey_i *k; -+ struct jset_entry *entry, *start, *end; -+ -+ if (clean) { -+ start = clean->start; -+ end = vstruct_end(&clean->field); -+ } else { -+ start = j->start; -+ end = vstruct_last(j); -+ } -+ -+ for (entry = start; entry < end; entry = vstruct_next(entry)) -+ if (entry->type == BCH_JSET_ENTRY_btree_root && -+ entry->btree_id == id) -+ goto found; -+ -+ return NULL; -+found: -+ if (!entry->u64s) -+ return ERR_PTR(-EINVAL); -+ -+ k = entry->start; -+ *level = entry->level; -+ return k; -+} -+ -+int bch2_verify_superblock_clean(struct bch_fs *c, -+ struct bch_sb_field_clean **cleanp, -+ struct jset *j) -+{ -+ unsigned i; -+ struct bch_sb_field_clean *clean = *cleanp; -+ struct printbuf buf1 = PRINTBUF; -+ struct printbuf buf2 = PRINTBUF; -+ int ret = 0; -+ -+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, -+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", -+ le64_to_cpu(clean->journal_seq), -+ le64_to_cpu(j->seq))) { -+ kfree(clean); -+ *cleanp = NULL; -+ return 0; -+ } -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct bkey_i *k1, *k2; -+ unsigned l1 = 0, l2 = 0; -+ -+ k1 = btree_root_find(c, clean, NULL, i, &l1); -+ k2 = btree_root_find(c, NULL, j, i, &l2); -+ -+ if (!k1 && !k2) -+ continue; -+ -+ printbuf_reset(&buf1); -+ printbuf_reset(&buf2); -+ -+ if (k1) -+ bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); -+ else -+ prt_printf(&buf1, "(none)"); -+ -+ if (k2) -+ bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); -+ else -+ prt_printf(&buf2, "(none)"); -+ -+ mustfix_fsck_err_on(!k1 || !k2 || -+ IS_ERR(k1) || -+ IS_ERR(k2) || -+ k1->k.u64s != k2->k.u64s || -+ memcmp(k1, k2, bkey_bytes(&k1->k)) || -+ l1 != l2, c, -+ "superblock btree root %u doesn't match journal after clean shutdown\n" -+ "sb: l=%u %s\n" -+ "journal: l=%u %s\n", i, -+ l1, buf1.buf, -+ l2, buf2.buf); -+ } -+fsck_err: -+ printbuf_exit(&buf2); -+ printbuf_exit(&buf1); -+ return ret; -+} -+ -+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *clean, *sb_clean; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); -+ -+ if (fsck_err_on(!sb_clean, c, -+ "superblock marked clean but clean section not present")) { -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->sb.clean = false; -+ mutex_unlock(&c->sb_lock); -+ return NULL; -+ } -+ -+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), -+ GFP_KERNEL); -+ if (!clean) { -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); -+ } -+ -+ ret = bch2_sb_clean_validate_late(c, clean, READ); -+ if (ret) { -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(ret); -+ } -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return clean; -+fsck_err: -+ 
mutex_unlock(&c->sb_lock); -+ return ERR_PTR(ret); -+} -+ -+static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -+{ -+ struct jset_entry *entry = *end; -+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); -+ -+ memset(entry, 0, u64s * sizeof(u64)); -+ /* -+ * The u64s field counts from the start of data, ignoring the shared -+ * fields. -+ */ -+ entry->u64s = cpu_to_le16(u64s - 1); -+ -+ *end = vstruct_next(*end); -+ return entry; -+} -+ -+void bch2_journal_super_entries_add_common(struct bch_fs *c, -+ struct jset_entry **end, -+ u64 journal_seq) -+{ -+ struct bch_dev *ca; -+ unsigned i, dev; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ if (!journal_seq) { -+ for (i = 0; i < ARRAY_SIZE(c->usage); i++) -+ bch2_fs_usage_acc_to_base(c, i); -+ } else { -+ bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u)), -+ struct jset_entry_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = BCH_FS_USAGE_inodes; -+ u->v = cpu_to_le64(c->usage_base->nr_inodes); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u)), -+ struct jset_entry_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = BCH_FS_USAGE_key_version; -+ u->v = cpu_to_le64(atomic64_read(&c->key_version)); -+ } -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) { -+ struct jset_entry_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u)), -+ struct jset_entry_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = BCH_FS_USAGE_reserved; -+ u->entry.level = i; -+ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ struct jset_entry_data_usage *u = -+ container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), -+ struct jset_entry_data_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_data_usage; -+ u->v = cpu_to_le64(c->usage_base->replicas[i]); -+ unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), -+ "embedded variable length struct"); -+ } -+ -+ for_each_member_device(ca, c, dev) { -+ unsigned b = sizeof(struct jset_entry_dev_usage) + -+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; -+ struct jset_entry_dev_usage *u = -+ container_of(jset_entry_init(end, b), -+ struct jset_entry_dev_usage, entry); -+ -+ u->entry.type = BCH_JSET_ENTRY_dev_usage; -+ u->dev = cpu_to_le32(dev); -+ u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); -+ -+ for (i = 0; i < BCH_DATA_NR; i++) { -+ u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); -+ u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); -+ u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); -+ } -+ } -+ -+ percpu_up_read(&c->mark_lock); -+ -+ for (i = 0; i < 2; i++) { -+ struct jset_entry_clock *clock = -+ container_of(jset_entry_init(end, sizeof(*clock)), -+ struct jset_entry_clock, entry); -+ -+ clock->entry.type = BCH_JSET_ENTRY_clock; -+ clock->rw = i; -+ clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); -+ } -+} -+ -+static int bch2_sb_clean_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_clean *clean = field_to_type(f, clean); -+ -+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) { -+ prt_printf(err, "wrong size (got %zu should be %zu)", -+ vstruct_bytes(&clean->field), sizeof(*clean)); -+ return 
-BCH_ERR_invalid_sb_clean; -+ } -+ -+ return 0; -+} -+ -+static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_clean *clean = field_to_type(f, clean); -+ struct jset_entry *entry; -+ -+ prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); -+ prt_newline(out); -+ prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); -+ prt_newline(out); -+ -+ for (entry = clean->start; -+ entry != vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) { -+ if (entry->type == BCH_JSET_ENTRY_btree_keys && -+ !entry->u64s) -+ continue; -+ -+ bch2_journal_entry_to_text(out, NULL, entry); -+ prt_newline(out); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_clean = { -+ .validate = bch2_sb_clean_validate, -+ .to_text = bch2_sb_clean_to_text, -+}; -+ -+int bch2_fs_mark_dirty(struct bch_fs *c) -+{ -+ int ret; -+ -+ /* -+ * Unconditionally write superblock, to verify it hasn't changed before -+ * we go rw: -+ */ -+ -+ mutex_lock(&c->sb_lock); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ -+ bch2_sb_maybe_downgrade(c); -+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); -+ -+ ret = bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+void bch2_fs_mark_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *sb_clean; -+ struct jset_entry *entry; -+ unsigned u64s; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ if (BCH_SB_CLEAN(c->disk_sb.sb)) -+ goto out; -+ -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); -+ -+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); -+ c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); -+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); -+ c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); -+ -+ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; -+ -+ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); -+ if (!sb_clean) { -+ bch_err(c, "error resizing superblock while setting filesystem clean"); -+ goto out; -+ } -+ -+ sb_clean->flags = 0; -+ sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); -+ -+ /* Trying to catch outstanding bug: */ -+ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); -+ -+ entry = sb_clean->start; -+ bch2_journal_super_entries_add_common(c, &entry, 0); -+ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); -+ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); -+ -+ memset(entry, 0, -+ vstruct_end(&sb_clean->field) - (void *) entry); -+ -+ /* -+ * this should be in the write path, and we should be validating every -+ * superblock section: -+ */ -+ ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); -+ if (ret) { -+ bch_err(c, "error writing marking filesystem clean: validate error"); -+ goto out; -+ } -+ -+ bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+} -diff --git a/fs/bcachefs/sb-clean.h b/fs/bcachefs/sb-clean.h -new file mode 100644 -index 000000000..71caef281 ---- /dev/null -+++ b/fs/bcachefs/sb-clean.h -@@ -0,0 +1,16 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SB_CLEAN_H -+#define _BCACHEFS_SB_CLEAN_H -+ -+int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); -+int bch2_verify_superblock_clean(struct bch_fs *, struct bch_sb_field_clean **, -+ struct jset *); -+struct bch_sb_field_clean *bch2_read_superblock_clean(struct bch_fs *); -+void 
bch2_journal_super_entries_add_common(struct bch_fs *, struct jset_entry **, u64); -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_clean; -+ -+int bch2_fs_mark_dirty(struct bch_fs *); -+void bch2_fs_mark_clean(struct bch_fs *); -+ -+#endif /* _BCACHEFS_SB_CLEAN_H */ -diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c -new file mode 100644 -index 000000000..16a2b3389 ---- /dev/null -+++ b/fs/bcachefs/sb-members.c -@@ -0,0 +1,173 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "disk_groups.h" -+#include "replicas.h" -+#include "sb-members.h" -+#include "super-io.h" -+ -+/* Code for bch_sb_field_members: */ -+ -+static int bch2_sb_members_validate(struct bch_sb *sb, -+ struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ struct bch_sb_field_members *mi = field_to_type(f, members); -+ unsigned i; -+ -+ if ((void *) (mi->members + sb->nr_devices) > -+ vstruct_end(&mi->field)) { -+ prt_printf(err, "too many devices for section size"); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ for (i = 0; i < sb->nr_devices; i++) { -+ struct bch_member *m = mi->members + i; -+ -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ if (le64_to_cpu(m->nbuckets) > LONG_MAX) { -+ prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", -+ i, le64_to_cpu(m->nbuckets), LONG_MAX); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ if (le64_to_cpu(m->nbuckets) - -+ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { -+ prt_printf(err, "device %u: not enough buckets (got %llu, max %u)", -+ i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ if (le16_to_cpu(m->bucket_size) < -+ le16_to_cpu(sb->block_size)) { -+ prt_printf(err, "device %u: bucket size %u smaller than block size %u", -+ i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ -+ if (le16_to_cpu(m->bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(sb)) { -+ prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", -+ i, le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); -+ return -BCH_ERR_invalid_sb_members; -+ } -+ } -+ -+ return 0; -+} -+ -+static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_members *mi = field_to_type(f, members); -+ struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); -+ unsigned i; -+ -+ for (i = 0; i < sb->nr_devices; i++) { -+ struct bch_member *m = mi->members + i; -+ unsigned data_have = bch2_sb_dev_has_data(sb, i); -+ u64 bucket_size = le16_to_cpu(m->bucket_size); -+ u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; -+ -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ prt_printf(out, "Device:"); -+ prt_tab(out); -+ prt_printf(out, "%u", i); -+ prt_newline(out); -+ -+ printbuf_indent_add(out, 2); -+ -+ prt_printf(out, "UUID:"); -+ prt_tab(out); -+ pr_uuid(out, m->uuid.b); -+ prt_newline(out); -+ -+ prt_printf(out, "Size:"); -+ prt_tab(out); -+ prt_units_u64(out, device_size << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "Bucket size:"); -+ prt_tab(out); -+ prt_units_u64(out, bucket_size << 9); -+ prt_newline(out); -+ -+ prt_printf(out, "First bucket:"); -+ prt_tab(out); -+ prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); -+ prt_newline(out); -+ -+ prt_printf(out, "Buckets:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); -+ prt_newline(out); -+ -+ prt_printf(out, "Last mount:"); -+ prt_tab(out); -+ if 
(m->last_mount) -+ pr_time(out, le64_to_cpu(m->last_mount)); -+ else -+ prt_printf(out, "(never)"); -+ prt_newline(out); -+ -+ prt_printf(out, "State:"); -+ prt_tab(out); -+ prt_printf(out, "%s", -+ BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR -+ ? bch2_member_states[BCH_MEMBER_STATE(m)] -+ : "unknown"); -+ prt_newline(out); -+ -+ prt_printf(out, "Label:"); -+ prt_tab(out); -+ if (BCH_MEMBER_GROUP(m)) { -+ unsigned idx = BCH_MEMBER_GROUP(m) - 1; -+ -+ if (idx < disk_groups_nr(gi)) -+ prt_printf(out, "%s (%u)", -+ gi->entries[idx].label, idx); -+ else -+ prt_printf(out, "(bad disk labels section)"); -+ } else { -+ prt_printf(out, "(none)"); -+ } -+ prt_newline(out); -+ -+ prt_printf(out, "Data allowed:"); -+ prt_tab(out); -+ if (BCH_MEMBER_DATA_ALLOWED(m)) -+ prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); -+ else -+ prt_printf(out, "(none)"); -+ prt_newline(out); -+ -+ prt_printf(out, "Has data:"); -+ prt_tab(out); -+ if (data_have) -+ prt_bitflags(out, bch2_data_types, data_have); -+ else -+ prt_printf(out, "(none)"); -+ prt_newline(out); -+ -+ prt_printf(out, "Discard:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); -+ prt_newline(out); -+ -+ prt_printf(out, "Freespace initialized:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); -+ prt_newline(out); -+ -+ printbuf_indent_sub(out, 2); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_members = { -+ .validate = bch2_sb_members_validate, -+ .to_text = bch2_sb_members_to_text, -+}; -diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h -new file mode 100644 -index 000000000..34e1cf604 ---- /dev/null -+++ b/fs/bcachefs/sb-members.h -@@ -0,0 +1,176 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SB_MEMBERS_H -+#define _BCACHEFS_SB_MEMBERS_H -+ -+static inline bool bch2_dev_is_online(struct bch_dev *ca) -+{ -+ return !percpu_ref_is_zero(&ca->io_ref); -+} -+ -+static inline bool bch2_dev_is_readable(struct bch_dev *ca) -+{ -+ return bch2_dev_is_online(ca) && -+ ca->mi.state != BCH_MEMBER_STATE_failed; -+} -+ -+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -+{ -+ if (!percpu_ref_tryget(&ca->io_ref)) -+ return false; -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_rw || -+ (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) -+ return true; -+ -+ percpu_ref_put(&ca->io_ref); -+ return false; -+} -+ -+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -+{ -+ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -+} -+ -+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs.nr; i++) -+ if (devs.devs[i] == dev) -+ return true; -+ -+ return false; -+} -+ -+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs->nr; i++) -+ if (devs->devs[i] == dev) { -+ array_remove_item(devs->devs, devs->nr, i); -+ return; -+ } -+} -+ -+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ if (!bch2_dev_list_has_dev(*devs, dev)) { -+ BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); -+ devs->devs[devs->nr++] = dev; -+ } -+} -+ -+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) -+{ -+ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; -+} -+ -+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, -+ const struct bch_devs_mask *mask) -+{ -+ struct bch_dev *ca = NULL; -+ -+ while ((*iter = 
mask -+ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) -+ : *iter) < c->sb.nr_devices && -+ !(ca = rcu_dereference_check(c->devs[*iter], -+ lockdep_is_held(&c->state_lock)))) -+ (*iter)++; -+ -+ return ca; -+} -+ -+#define for_each_member_device_rcu(ca, c, iter, mask) \ -+ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) -+ -+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ if ((ca = __bch2_next_dev(c, iter, NULL))) -+ percpu_ref_get(&ca->ref); -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+/* -+ * If you break early, you must drop your ref on the current device -+ */ -+#define for_each_member_device(ca, c, iter) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_dev(c, &(iter))); \ -+ percpu_ref_put(&ca->ref), (iter)++) -+ -+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, -+ unsigned *iter, -+ int state_mask) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ while ((ca = __bch2_next_dev(c, iter, NULL)) && -+ (!((1 << ca->mi.state) & state_mask) || -+ !percpu_ref_tryget(&ca->io_ref))) -+ (*iter)++; -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+#define __for_each_online_member(ca, c, iter, state_mask) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ -+ percpu_ref_put(&ca->io_ref), (iter)++) -+ -+#define for_each_online_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, ~0) -+ -+#define for_each_rw_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) -+ -+#define for_each_readable_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, \ -+ (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) -+ -+/* -+ * If a key exists that references a device, the device won't be going away and -+ * we can omit rcu_read_lock(): -+ */ -+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_check(c->devs[idx], 1); -+} -+ -+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_protected(c->devs[idx], -+ lockdep_is_held(&c->sb_lock) || -+ lockdep_is_held(&c->state_lock)); -+} -+ -+/* XXX kill, move to struct bch_fs */ -+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -+{ -+ struct bch_devs_mask devs; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ memset(&devs, 0, sizeof(devs)); -+ for_each_online_member(ca, c, i) -+ __set_bit(ca->dev_idx, devs.d); -+ return devs; -+} -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_members; -+ -+#endif /* _BCACHEFS_SB_MEMBERS_H */ -diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h -new file mode 100644 -index 000000000..c1860d816 ---- /dev/null -+++ b/fs/bcachefs/seqmutex.h -@@ -0,0 +1,48 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SEQMUTEX_H -+#define _BCACHEFS_SEQMUTEX_H -+ -+#include -+ -+struct seqmutex { -+ struct mutex lock; -+ u32 seq; -+}; -+ -+#define seqmutex_init(_lock) mutex_init(&(_lock)->lock) -+ -+static inline bool seqmutex_trylock(struct seqmutex *lock) -+{ -+ return mutex_trylock(&lock->lock); -+} -+ -+static inline void seqmutex_lock(struct seqmutex *lock) -+{ -+ mutex_lock(&lock->lock); -+} -+ -+static inline void seqmutex_unlock(struct seqmutex *lock) -+{ -+ lock->seq++; -+ mutex_unlock(&lock->lock); -+} -+ -+static inline u32 
seqmutex_seq(struct seqmutex *lock) -+{ -+ return lock->seq; -+} -+ -+static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) -+{ -+ if (lock->seq != seq || !mutex_trylock(&lock->lock)) -+ return false; -+ -+ if (lock->seq != seq) { -+ mutex_unlock(&lock->lock); -+ return false; -+ } -+ -+ return true; -+} -+ -+#endif /* _BCACHEFS_SEQMUTEX_H */ -diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c -new file mode 100644 -index 000000000..dc1a27cc3 ---- /dev/null -+++ b/fs/bcachefs/siphash.c -@@ -0,0 +1,173 @@ -+// SPDX-License-Identifier: BSD-3-Clause -+/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ -+ -+/*- -+ * Copyright (c) 2013 Andre Oppermann -+ * All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. The name of the author may not be used to endorse or promote -+ * products derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ */ -+ -+/* -+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d -+ * are the number of compression rounds and the number of finalization rounds. -+ * A compression round is identical to a finalization round and this round -+ * function is called SipRound. Given a 128-bit key k and a (possibly empty) -+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). -+ * -+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, -+ * by Jean-Philippe Aumasson and Daniel J. 
Bernstein, -+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa -+ * https://131002.net/siphash/siphash.pdf -+ * https://131002.net/siphash/ -+ */ -+ -+#include -+#include -+#include -+#include -+ -+#include "siphash.h" -+ -+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) -+{ -+ while (rounds--) { -+ ctx->v[0] += ctx->v[1]; -+ ctx->v[2] += ctx->v[3]; -+ ctx->v[1] = rol64(ctx->v[1], 13); -+ ctx->v[3] = rol64(ctx->v[3], 16); -+ -+ ctx->v[1] ^= ctx->v[0]; -+ ctx->v[3] ^= ctx->v[2]; -+ ctx->v[0] = rol64(ctx->v[0], 32); -+ -+ ctx->v[2] += ctx->v[1]; -+ ctx->v[0] += ctx->v[3]; -+ ctx->v[1] = rol64(ctx->v[1], 17); -+ ctx->v[3] = rol64(ctx->v[3], 21); -+ -+ ctx->v[1] ^= ctx->v[2]; -+ ctx->v[3] ^= ctx->v[0]; -+ ctx->v[2] = rol64(ctx->v[2], 32); -+ } -+} -+ -+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) -+{ -+ u64 m = get_unaligned_le64(ptr); -+ -+ ctx->v[3] ^= m; -+ SipHash_Rounds(ctx, rounds); -+ ctx->v[0] ^= m; -+} -+ -+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) -+{ -+ u64 k0, k1; -+ -+ k0 = le64_to_cpu(key->k0); -+ k1 = le64_to_cpu(key->k1); -+ -+ ctx->v[0] = 0x736f6d6570736575ULL ^ k0; -+ ctx->v[1] = 0x646f72616e646f6dULL ^ k1; -+ ctx->v[2] = 0x6c7967656e657261ULL ^ k0; -+ ctx->v[3] = 0x7465646279746573ULL ^ k1; -+ -+ memset(ctx->buf, 0, sizeof(ctx->buf)); -+ ctx->bytes = 0; -+} -+ -+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, -+ const void *src, size_t len) -+{ -+ const u8 *ptr = src; -+ size_t left, used; -+ -+ if (len == 0) -+ return; -+ -+ used = ctx->bytes % sizeof(ctx->buf); -+ ctx->bytes += len; -+ -+ if (used > 0) { -+ left = sizeof(ctx->buf) - used; -+ -+ if (len >= left) { -+ memcpy(&ctx->buf[used], ptr, left); -+ SipHash_CRounds(ctx, ctx->buf, rc); -+ len -= left; -+ ptr += left; -+ } else { -+ memcpy(&ctx->buf[used], ptr, len); -+ return; -+ } -+ } -+ -+ while (len >= sizeof(ctx->buf)) { -+ SipHash_CRounds(ctx, ptr, rc); -+ len -= sizeof(ctx->buf); -+ ptr += sizeof(ctx->buf); -+ } -+ -+ if (len > 0) -+ memcpy(&ctx->buf[used], ptr, len); -+} -+ -+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) -+{ -+ u64 r; -+ -+ r = SipHash_End(ctx, rc, rf); -+ -+ *((__le64 *) dst) = cpu_to_le64(r); -+} -+ -+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) -+{ -+ u64 r; -+ size_t left, used; -+ -+ used = ctx->bytes % sizeof(ctx->buf); -+ left = sizeof(ctx->buf) - used; -+ memset(&ctx->buf[used], 0, left - 1); -+ ctx->buf[7] = ctx->bytes; -+ -+ SipHash_CRounds(ctx, ctx->buf, rc); -+ ctx->v[2] ^= 0xff; -+ SipHash_Rounds(ctx, rf); -+ -+ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); -+ memset(ctx, 0, sizeof(*ctx)); -+ return r; -+} -+ -+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) -+{ -+ SIPHASH_CTX ctx; -+ -+ SipHash_Init(&ctx, key); -+ SipHash_Update(&ctx, rc, rf, src, len); -+ return SipHash_End(&ctx, rc, rf); -+} -diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h -new file mode 100644 -index 000000000..3dfaf34a4 ---- /dev/null -+++ b/fs/bcachefs/siphash.h -@@ -0,0 +1,87 @@ -+/* SPDX-License-Identifier: BSD-3-Clause */ -+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ -+/*- -+ * Copyright (c) 2013 Andre Oppermann -+ * All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. 
Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. The name of the author may not be used to endorse or promote -+ * products derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * $FreeBSD$ -+ */ -+ -+/* -+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) -+ * optimized for speed on short messages returning a 64bit hash/digest value. -+ * -+ * The number of rounds is defined during the initialization: -+ * SipHash24_Init() for the fast and resonable strong version -+ * SipHash48_Init() for the strong version (half as fast) -+ * -+ * struct SIPHASH_CTX ctx; -+ * SipHash24_Init(&ctx); -+ * SipHash_SetKey(&ctx, "16bytes long key"); -+ * SipHash_Update(&ctx, pointer_to_string, length_of_string); -+ * SipHash_Final(output, &ctx); -+ */ -+ -+#ifndef _SIPHASH_H_ -+#define _SIPHASH_H_ -+ -+#include -+ -+#define SIPHASH_BLOCK_LENGTH 8 -+#define SIPHASH_KEY_LENGTH 16 -+#define SIPHASH_DIGEST_LENGTH 8 -+ -+typedef struct _SIPHASH_CTX { -+ u64 v[4]; -+ u8 buf[SIPHASH_BLOCK_LENGTH]; -+ u32 bytes; -+} SIPHASH_CTX; -+ -+typedef struct { -+ __le64 k0; -+ __le64 k1; -+} SIPHASH_KEY; -+ -+void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); -+void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); -+u64 SipHash_End(SIPHASH_CTX *, int, int); -+void SipHash_Final(void *, SIPHASH_CTX *, int, int); -+u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); -+ -+#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) -+#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) -+#define SipHash24_End(_d) SipHash_End((_d), 2, 4) -+#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) -+#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) -+ -+#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) -+#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) -+#define SipHash48_End(_d) SipHash_End((_d), 4, 8) -+#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) -+#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) -+ -+#endif /* _SIPHASH_H_ */ -diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c -new file mode 100644 -index 000000000..14cffa68d ---- /dev/null -+++ b/fs/bcachefs/six.c -@@ -0,0 +1,918 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include "six.h" -+ -+#ifdef DEBUG -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) do {} while (0) -+#endif -+ -+#define 
six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) -+#define six_release(l, ip) lock_release(l, ip) -+ -+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); -+ -+#define SIX_LOCK_HELD_read_OFFSET 0 -+#define SIX_LOCK_HELD_read ~(~0U << 26) -+#define SIX_LOCK_HELD_intent (1U << 26) -+#define SIX_LOCK_HELD_write (1U << 27) -+#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) -+#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) -+#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) -+#define SIX_LOCK_NOSPIN (1U << 31) -+ -+struct six_lock_vals { -+ /* Value we add to the lock in order to take the lock: */ -+ u32 lock_val; -+ -+ /* If the lock has this value (used as a mask), taking the lock fails: */ -+ u32 lock_fail; -+ -+ /* Mask that indicates lock is held for this type: */ -+ u32 held_mask; -+ -+ /* Waitlist we wakeup when releasing the lock: */ -+ enum six_lock_type unlock_wakeup; -+}; -+ -+static const struct six_lock_vals l[] = { -+ [SIX_LOCK_read] = { -+ .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, -+ .lock_fail = SIX_LOCK_HELD_write, -+ .held_mask = SIX_LOCK_HELD_read, -+ .unlock_wakeup = SIX_LOCK_write, -+ }, -+ [SIX_LOCK_intent] = { -+ .lock_val = SIX_LOCK_HELD_intent, -+ .lock_fail = SIX_LOCK_HELD_intent, -+ .held_mask = SIX_LOCK_HELD_intent, -+ .unlock_wakeup = SIX_LOCK_intent, -+ }, -+ [SIX_LOCK_write] = { -+ .lock_val = SIX_LOCK_HELD_write, -+ .lock_fail = SIX_LOCK_HELD_read, -+ .held_mask = SIX_LOCK_HELD_write, -+ .unlock_wakeup = SIX_LOCK_read, -+ }, -+}; -+ -+static inline void six_set_bitmask(struct six_lock *lock, u32 mask) -+{ -+ if ((atomic_read(&lock->state) & mask) != mask) -+ atomic_or(mask, &lock->state); -+} -+ -+static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) -+{ -+ if (atomic_read(&lock->state) & mask) -+ atomic_and(~mask, &lock->state); -+} -+ -+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, -+ u32 old, struct task_struct *owner) -+{ -+ if (type != SIX_LOCK_intent) -+ return; -+ -+ if (!(old & SIX_LOCK_HELD_intent)) { -+ EBUG_ON(lock->owner); -+ lock->owner = owner; -+ } else { -+ EBUG_ON(lock->owner != current); -+ } -+} -+ -+static inline unsigned pcpu_read_count(struct six_lock *lock) -+{ -+ unsigned read_count = 0; -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ read_count += *per_cpu_ptr(lock->readers, cpu); -+ return read_count; -+} -+ -+/* -+ * __do_six_trylock() - main trylock routine -+ * -+ * Returns 1 on success, 0 on failure -+ * -+ * In percpu reader mode, a failed trylock may cause a spurious trylock failure -+ * for anoter thread taking the competing lock type, and we may havve to do a -+ * wakeup: when a wakeup is required, we return -1 - wakeup_type. -+ */ -+static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, -+ struct task_struct *task, bool try) -+{ -+ int ret; -+ u32 old; -+ -+ EBUG_ON(type == SIX_LOCK_write && lock->owner != task); -+ EBUG_ON(type == SIX_LOCK_write && -+ (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); -+ -+ /* -+ * Percpu reader mode: -+ * -+ * The basic idea behind this algorithm is that you can implement a lock -+ * between two threads without any atomics, just memory barriers: -+ * -+ * For two threads you'll need two variables, one variable for "thread a -+ * has the lock" and another for "thread b has the lock". 
-+ * -+ * To take the lock, a thread sets its variable indicating that it holds -+ * the lock, then issues a full memory barrier, then reads from the -+ * other thread's variable to check if the other thread thinks it has -+ * the lock. If we raced, we backoff and retry/sleep. -+ * -+ * Failure to take the lock may cause a spurious trylock failure in -+ * another thread, because we temporarily set the lock to indicate that -+ * we held it. This would be a problem for a thread in six_lock(), when -+ * they are calling trylock after adding themself to the waitlist and -+ * prior to sleeping. -+ * -+ * Therefore, if we fail to get the lock, and there were waiters of the -+ * type we conflict with, we will have to issue a wakeup. -+ * -+ * Since we may be called under wait_lock (and by the wakeup code -+ * itself), we return that the wakeup has to be done instead of doing it -+ * here. -+ */ -+ if (type == SIX_LOCK_read && lock->readers) { -+ preempt_disable(); -+ this_cpu_inc(*lock->readers); /* signal that we own lock */ -+ -+ smp_mb(); -+ -+ old = atomic_read(&lock->state); -+ ret = !(old & l[type].lock_fail); -+ -+ this_cpu_sub(*lock->readers, !ret); -+ preempt_enable(); -+ -+ if (!ret && (old & SIX_LOCK_WAITING_write)) -+ ret = -1 - SIX_LOCK_write; -+ } else if (type == SIX_LOCK_write && lock->readers) { -+ if (try) { -+ atomic_add(SIX_LOCK_HELD_write, &lock->state); -+ smp_mb__after_atomic(); -+ } -+ -+ ret = !pcpu_read_count(lock); -+ -+ if (try && !ret) { -+ old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); -+ if (old & SIX_LOCK_WAITING_read) -+ ret = -1 - SIX_LOCK_read; -+ } -+ } else { -+ old = atomic_read(&lock->state); -+ do { -+ ret = !(old & l[type].lock_fail); -+ if (!ret || (type == SIX_LOCK_write && !try)) { -+ smp_mb(); -+ break; -+ } -+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); -+ -+ EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); -+ } -+ -+ if (ret > 0) -+ six_set_owner(lock, type, old, task); -+ -+ EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && -+ (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); -+ -+ return ret; -+} -+ -+static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) -+{ -+ struct six_lock_waiter *w, *next; -+ struct task_struct *task; -+ bool saw_one; -+ int ret; -+again: -+ ret = 0; -+ saw_one = false; -+ raw_spin_lock(&lock->wait_lock); -+ -+ list_for_each_entry_safe(w, next, &lock->wait_list, list) { -+ if (w->lock_want != lock_type) -+ continue; -+ -+ if (saw_one && lock_type != SIX_LOCK_read) -+ goto unlock; -+ saw_one = true; -+ -+ ret = __do_six_trylock(lock, lock_type, w->task, false); -+ if (ret <= 0) -+ goto unlock; -+ -+ /* -+ * Similar to percpu_rwsem_wake_function(), we need to guard -+ * against the wakee noticing w->lock_acquired, returning, and -+ * then exiting before we do the wakeup: -+ */ -+ task = get_task_struct(w->task); -+ __list_del(w->list.prev, w->list.next); -+ /* -+ * The release barrier here ensures the ordering of the -+ * __list_del before setting w->lock_acquired; @w is on the -+ * stack of the thread doing the waiting and will be reused -+ * after it sees w->lock_acquired with no other locking: -+ * pairs with smp_load_acquire() in six_lock_slowpath() -+ */ -+ smp_store_release(&w->lock_acquired, true); -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+ -+ six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); -+unlock: -+ raw_spin_unlock(&lock->wait_lock); -+ -+ if (ret < 0) { -+ lock_type = -ret - 1; -+ goto 
again; -+ } -+} -+ -+__always_inline -+static void six_lock_wakeup(struct six_lock *lock, u32 state, -+ enum six_lock_type lock_type) -+{ -+ if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) -+ return; -+ -+ if (!(state & (SIX_LOCK_WAITING_read << lock_type))) -+ return; -+ -+ __six_lock_wakeup(lock, lock_type); -+} -+ -+__always_inline -+static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) -+{ -+ int ret; -+ -+ ret = __do_six_trylock(lock, type, current, try); -+ if (ret < 0) -+ __six_lock_wakeup(lock, -ret - 1); -+ -+ return ret > 0; -+} -+ -+/** -+ * six_trylock_ip - attempt to take a six lock without blocking -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * Return: true on success, false on failure. -+ */ -+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -+{ -+ if (!do_six_trylock(lock, type, true)) -+ return false; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_trylock_ip); -+ -+/** -+ * six_relock_ip - attempt to re-take a lock that was held previously -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @seq: lock sequence number obtained from six_lock_seq() while lock was -+ * held previously -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * Return: true on success, false on failure. -+ */ -+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq, unsigned long ip) -+{ -+ if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) -+ return false; -+ -+ if (six_lock_seq(lock) != seq) { -+ six_unlock_ip(lock, type, ip); -+ return false; -+ } -+ -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_relock_ip); -+ -+#ifdef CONFIG_LOCK_SPIN_ON_OWNER -+ -+static inline bool six_can_spin_on_owner(struct six_lock *lock) -+{ -+ struct task_struct *owner; -+ bool ret; -+ -+ if (need_resched()) -+ return false; -+ -+ rcu_read_lock(); -+ owner = READ_ONCE(lock->owner); -+ ret = !owner || owner_on_cpu(owner); -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool six_spin_on_owner(struct six_lock *lock, -+ struct task_struct *owner, -+ u64 end_time) -+{ -+ bool ret = true; -+ unsigned loop = 0; -+ -+ rcu_read_lock(); -+ while (lock->owner == owner) { -+ /* -+ * Ensure we emit the owner->on_cpu, dereference _after_ -+ * checking lock->owner still matches owner. If that fails, -+ * owner might point to freed memory. If it still matches, -+ * the rcu_read_lock() ensures the memory stays valid. -+ */ -+ barrier(); -+ -+ if (!owner_on_cpu(owner) || need_resched()) { -+ ret = false; -+ break; -+ } -+ -+ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { -+ six_set_bitmask(lock, SIX_LOCK_NOSPIN); -+ ret = false; -+ break; -+ } -+ -+ cpu_relax(); -+ } -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ struct task_struct *task = current; -+ u64 end_time; -+ -+ if (type == SIX_LOCK_write) -+ return false; -+ -+ preempt_disable(); -+ if (!six_can_spin_on_owner(lock)) -+ goto fail; -+ -+ if (!osq_lock(&lock->osq)) -+ goto fail; -+ -+ end_time = sched_clock() + 10 * NSEC_PER_USEC; -+ -+ while (1) { -+ struct task_struct *owner; -+ -+ /* -+ * If there's an owner, wait for it to either -+ * release the lock or go to sleep. 
-+ */ -+ owner = READ_ONCE(lock->owner); -+ if (owner && !six_spin_on_owner(lock, owner, end_time)) -+ break; -+ -+ if (do_six_trylock(lock, type, false)) { -+ osq_unlock(&lock->osq); -+ preempt_enable(); -+ return true; -+ } -+ -+ /* -+ * When there's no owner, we might have preempted between the -+ * owner acquiring the lock and setting the owner field. If -+ * we're an RT task that will live-lock because we won't let -+ * the owner complete. -+ */ -+ if (!owner && (need_resched() || rt_task(task))) -+ break; -+ -+ /* -+ * The cpu_relax() call is a compiler barrier which forces -+ * everything in this loop to be re-loaded. We don't need -+ * memory barriers as we'll eventually observe the right -+ * values at the cost of a few extra spins. -+ */ -+ cpu_relax(); -+ } -+ -+ osq_unlock(&lock->osq); -+fail: -+ preempt_enable(); -+ -+ /* -+ * If we fell out of the spin path because of need_resched(), -+ * reschedule now, before we try-lock again. This avoids getting -+ * scheduled out right after we obtained the lock. -+ */ -+ if (need_resched()) -+ schedule(); -+ -+ return false; -+} -+ -+#else /* CONFIG_LOCK_SPIN_ON_OWNER */ -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ return false; -+} -+ -+#endif -+ -+noinline -+static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip) -+{ -+ int ret = 0; -+ -+ if (type == SIX_LOCK_write) { -+ EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); -+ atomic_add(SIX_LOCK_HELD_write, &lock->state); -+ smp_mb__after_atomic(); -+ } -+ -+ trace_contention_begin(lock, 0); -+ lock_contended(&lock->dep_map, ip); -+ -+ if (six_optimistic_spin(lock, type)) -+ goto out; -+ -+ wait->task = current; -+ wait->lock_want = type; -+ wait->lock_acquired = false; -+ -+ raw_spin_lock(&lock->wait_lock); -+ six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); -+ /* -+ * Retry taking the lock after taking waitlist lock, in case we raced -+ * with an unlock: -+ */ -+ ret = __do_six_trylock(lock, type, current, false); -+ if (ret <= 0) { -+ wait->start_time = local_clock(); -+ -+ if (!list_empty(&lock->wait_list)) { -+ struct six_lock_waiter *last = -+ list_last_entry(&lock->wait_list, -+ struct six_lock_waiter, list); -+ -+ if (time_before_eq64(wait->start_time, last->start_time)) -+ wait->start_time = last->start_time + 1; -+ } -+ -+ list_add_tail(&wait->list, &lock->wait_list); -+ } -+ raw_spin_unlock(&lock->wait_lock); -+ -+ if (unlikely(ret > 0)) { -+ ret = 0; -+ goto out; -+ } -+ -+ if (unlikely(ret < 0)) { -+ __six_lock_wakeup(lock, -ret - 1); -+ ret = 0; -+ } -+ -+ while (1) { -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ -+ /* -+ * Ensures that writes to the waitlist entry happen after we see -+ * wait->lock_acquired: pairs with the smp_store_release in -+ * __six_lock_wakeup -+ */ -+ if (smp_load_acquire(&wait->lock_acquired)) -+ break; -+ -+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; -+ if (unlikely(ret)) { -+ bool acquired; -+ -+ /* -+ * If should_sleep_fn() returns an error, we are -+ * required to return that error even if we already -+ * acquired the lock - should_sleep_fn() might have -+ * modified external state (e.g. 
when the deadlock cycle -+ * detector in bcachefs issued a transaction restart) -+ */ -+ raw_spin_lock(&lock->wait_lock); -+ acquired = wait->lock_acquired; -+ if (!acquired) -+ list_del(&wait->list); -+ raw_spin_unlock(&lock->wait_lock); -+ -+ if (unlikely(acquired)) -+ do_six_unlock_type(lock, type); -+ break; -+ } -+ -+ schedule(); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+out: -+ if (ret && type == SIX_LOCK_write) { -+ six_clear_bitmask(lock, SIX_LOCK_HELD_write); -+ six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); -+ } -+ trace_contention_end(lock, 0); -+ -+ return ret; -+} -+ -+/** -+ * six_lock_ip_waiter - take a lock, with full waitlist interface -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @wait: pointer to wait object, which will be added to lock's waitlist -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * This is the most general six_lock() variant, with parameters to support full -+ * cycle detection for deadlock avoidance. -+ * -+ * The code calling this function must implement tracking of held locks, and the -+ * @wait object should be embedded into the struct that tracks held locks - -+ * which must also be accessible in a thread-safe way. -+ * -+ * @should_sleep_fn should invoke the cycle detector; it should walk each -+ * lock's waiters, and for each waiter recursively walk their held locks. -+ * -+ * When this function must block, @wait will be added to @lock's waitlist before -+ * calling trylock, and before calling @should_sleep_fn, and @wait will not be -+ * removed from the lock waitlist until the lock has been successfully acquired, -+ * or we abort. -+ * -+ * @wait.start_time will be monotonically increasing for any given waitlist, and -+ * thus may be used as a loop cursor. -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip) -+{ -+ int ret; -+ -+ wait->start_time = 0; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); -+ -+ ret = do_six_trylock(lock, type, true) ? 
0 -+ : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); -+ -+ if (ret && type != SIX_LOCK_write) -+ six_release(&lock->dep_map, ip); -+ if (!ret) -+ lock_acquired(&lock->dep_map, ip); -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(six_lock_ip_waiter); -+ -+__always_inline -+static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ u32 state; -+ -+ if (type == SIX_LOCK_intent) -+ lock->owner = NULL; -+ -+ if (type == SIX_LOCK_read && -+ lock->readers) { -+ smp_mb(); /* unlock barrier */ -+ this_cpu_dec(*lock->readers); -+ smp_mb(); /* between unlocking and checking for waiters */ -+ state = atomic_read(&lock->state); -+ } else { -+ u32 v = l[type].lock_val; -+ -+ if (type != SIX_LOCK_read) -+ v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; -+ -+ EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); -+ state = atomic_sub_return_release(v, &lock->state); -+ } -+ -+ six_lock_wakeup(lock, state, l[type].unlock_wakeup); -+} -+ -+/** -+ * six_unlock_ip - drop a six lock -+ * @lock: lock to unlock -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * When a lock is held multiple times (because six_lock_incement()) was used), -+ * this decrements the 'lock held' counter by one. -+ * -+ * For example: -+ * six_lock_read(&foo->lock); read count 1 -+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 -+ */ -+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) -+{ -+ EBUG_ON(type == SIX_LOCK_write && -+ !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); -+ EBUG_ON((type == SIX_LOCK_write || -+ type == SIX_LOCK_intent) && -+ lock->owner != current); -+ -+ if (type != SIX_LOCK_write) -+ six_release(&lock->dep_map, ip); -+ else -+ lock->seq++; -+ -+ if (type == SIX_LOCK_intent && -+ lock->intent_lock_recurse) { -+ --lock->intent_lock_recurse; -+ return; -+ } -+ -+ do_six_unlock_type(lock, type); -+} -+EXPORT_SYMBOL_GPL(six_unlock_ip); -+ -+/** -+ * six_lock_downgrade - convert an intent lock to a read lock -+ * @lock: lock to dowgrade -+ * -+ * @lock will have read count incremented and intent count decremented -+ */ -+void six_lock_downgrade(struct six_lock *lock) -+{ -+ six_lock_increment(lock, SIX_LOCK_read); -+ six_unlock_intent(lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_downgrade); -+ -+/** -+ * six_lock_tryupgrade - attempt to convert read lock to an intent lock -+ * @lock: lock to upgrade -+ * -+ * On success, @lock will have intent count incremented and read count -+ * decremented -+ * -+ * Return: true on success, false on failure -+ */ -+bool six_lock_tryupgrade(struct six_lock *lock) -+{ -+ u32 old = atomic_read(&lock->state), new; -+ -+ do { -+ new = old; -+ -+ if (new & SIX_LOCK_HELD_intent) -+ return false; -+ -+ if (!lock->readers) { -+ EBUG_ON(!(new & SIX_LOCK_HELD_read)); -+ new -= l[SIX_LOCK_read].lock_val; -+ } -+ -+ new |= SIX_LOCK_HELD_intent; -+ } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); -+ -+ if (lock->readers) -+ this_cpu_dec(*lock->readers); -+ -+ six_set_owner(lock, SIX_LOCK_intent, old, current); -+ -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_lock_tryupgrade); -+ -+/** -+ * six_trylock_convert - attempt to convert a held lock from one type to another -+ * @lock: lock to upgrade -+ * @from: SIX_LOCK_read or SIX_LOCK_intent -+ * @to: SIX_LOCK_read or SIX_LOCK_intent -+ * -+ * 
On success, @lock will have intent count incremented and read count -+ * decremented -+ * -+ * Return: true on success, false on failure -+ */ -+bool six_trylock_convert(struct six_lock *lock, -+ enum six_lock_type from, -+ enum six_lock_type to) -+{ -+ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); -+ -+ if (to == from) -+ return true; -+ -+ if (to == SIX_LOCK_read) { -+ six_lock_downgrade(lock); -+ return true; -+ } else { -+ return six_lock_tryupgrade(lock); -+ } -+} -+EXPORT_SYMBOL_GPL(six_trylock_convert); -+ -+/** -+ * six_lock_increment - increase held lock count on a lock that is already held -+ * @lock: lock to increment -+ * @type: SIX_LOCK_read or SIX_LOCK_intent -+ * -+ * @lock must already be held, with a lock type that is greater than or equal to -+ * @type -+ * -+ * A corresponding six_unlock_type() call will be required for @lock to be fully -+ * unlocked. -+ */ -+void six_lock_increment(struct six_lock *lock, enum six_lock_type type) -+{ -+ six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); -+ -+ /* XXX: assert already locked, and that we don't overflow: */ -+ -+ switch (type) { -+ case SIX_LOCK_read: -+ if (lock->readers) { -+ this_cpu_inc(*lock->readers); -+ } else { -+ EBUG_ON(!(atomic_read(&lock->state) & -+ (SIX_LOCK_HELD_read| -+ SIX_LOCK_HELD_intent))); -+ atomic_add(l[type].lock_val, &lock->state); -+ } -+ break; -+ case SIX_LOCK_intent: -+ EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); -+ lock->intent_lock_recurse++; -+ break; -+ case SIX_LOCK_write: -+ BUG(); -+ break; -+ } -+} -+EXPORT_SYMBOL_GPL(six_lock_increment); -+ -+/** -+ * six_lock_wakeup_all - wake up all waiters on @lock -+ * @lock: lock to wake up waiters for -+ * -+ * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then -+ * abort the lock operation. -+ * -+ * This function is never needed in a bug-free program; it's only useful in -+ * debug code, e.g. to determine if a cycle detector is at fault. -+ */ -+void six_lock_wakeup_all(struct six_lock *lock) -+{ -+ u32 state = atomic_read(&lock->state); -+ struct six_lock_waiter *w; -+ -+ six_lock_wakeup(lock, state, SIX_LOCK_read); -+ six_lock_wakeup(lock, state, SIX_LOCK_intent); -+ six_lock_wakeup(lock, state, SIX_LOCK_write); -+ -+ raw_spin_lock(&lock->wait_lock); -+ list_for_each_entry(w, &lock->wait_list, list) -+ wake_up_process(w->task); -+ raw_spin_unlock(&lock->wait_lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -+ -+/** -+ * six_lock_counts - return held lock counts, for each lock type -+ * @lock: lock to return counters for -+ * -+ * Return: the number of times a lock is held for read, intent and write. -+ */ -+struct six_lock_count six_lock_counts(struct six_lock *lock) -+{ -+ struct six_lock_count ret; -+ -+ ret.n[SIX_LOCK_read] = !lock->readers -+ ? atomic_read(&lock->state) & SIX_LOCK_HELD_read -+ : pcpu_read_count(lock); -+ ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + -+ lock->intent_lock_recurse; -+ ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); -+ -+ return ret; -+} -+EXPORT_SYMBOL_GPL(six_lock_counts); -+ -+/** -+ * six_lock_readers_add - directly manipulate reader count of a lock -+ * @lock: lock to add/subtract readers for -+ * @nr: reader count to add/subtract -+ * -+ * When an upper layer is implementing lock reentrency, we may have both read -+ * and intent locks on the same lock. 
-+ * -+ * When we need to take a write lock, the read locks will cause self-deadlock, -+ * because six locks themselves do not track which read locks are held by the -+ * current thread and which are held by a different thread - it does no -+ * per-thread tracking of held locks. -+ * -+ * The upper layer that is tracking held locks may however, if trylock() has -+ * failed, count up its own read locks, subtract them, take the write lock, and -+ * then re-add them. -+ * -+ * As in any other situation when taking a write lock, @lock must be held for -+ * intent one (or more) times, so @lock will never be left unlocked. -+ */ -+void six_lock_readers_add(struct six_lock *lock, int nr) -+{ -+ if (lock->readers) { -+ this_cpu_add(*lock->readers, nr); -+ } else { -+ EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); -+ /* reader count starts at bit 0 */ -+ atomic_add(nr, &lock->state); -+ } -+} -+EXPORT_SYMBOL_GPL(six_lock_readers_add); -+ -+/** -+ * six_lock_exit - release resources held by a lock prior to freeing -+ * @lock: lock to exit -+ * -+ * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is -+ * required to free the percpu read counts. -+ */ -+void six_lock_exit(struct six_lock *lock) -+{ -+ WARN_ON(lock->readers && pcpu_read_count(lock)); -+ WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); -+ -+ free_percpu(lock->readers); -+ lock->readers = NULL; -+} -+EXPORT_SYMBOL_GPL(six_lock_exit); -+ -+void __six_lock_init(struct six_lock *lock, const char *name, -+ struct lock_class_key *key, enum six_lock_init_flags flags) -+{ -+ atomic_set(&lock->state, 0); -+ raw_spin_lock_init(&lock->wait_lock); -+ INIT_LIST_HEAD(&lock->wait_list); -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); -+ lockdep_init_map(&lock->dep_map, name, key, 0); -+#endif -+ -+ /* -+ * Don't assume that we have real percpu variables available in -+ * userspace: -+ */ -+#ifdef __KERNEL__ -+ if (flags & SIX_LOCK_INIT_PCPU) { -+ /* -+ * We don't return an error here on memory allocation failure -+ * since percpu is an optimization, and locks will work with the -+ * same semantics in non-percpu mode: callers can check for -+ * failure if they wish by checking lock->readers, but generally -+ * will not want to treat it as an error. -+ */ -+ lock->readers = alloc_percpu(unsigned); -+ } -+#endif -+} -+EXPORT_SYMBOL_GPL(__six_lock_init); -diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h -new file mode 100644 -index 000000000..394da423c ---- /dev/null -+++ b/fs/bcachefs/six.h -@@ -0,0 +1,388 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef _LINUX_SIX_H -+#define _LINUX_SIX_H -+ -+/** -+ * DOC: SIX locks overview -+ * -+ * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores -+ * but with an additional state: read/shared, intent, exclusive/write -+ * -+ * The purpose of the intent state is to allow for greater concurrency on tree -+ * structures without deadlocking. In general, a read can't be upgraded to a -+ * write lock without deadlocking, so an operation that updates multiple nodes -+ * will have to take write locks for the full duration of the operation. -+ * -+ * But by adding an intent state, which is exclusive with other intent locks but -+ * not with readers, we can take intent locks at thte start of the operation, -+ * and then take write locks only for the actual update to each individual -+ * nodes, without deadlocking. 
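 *
 * As a sketch of that pattern (hypothetical nodes a and b, each embedding a
 * six_lock): intent locks are held across the whole operation, write locks
 * only around each individual update, so readers are excluded only briefly:
 *
 *   six_lock_intent(&a->lock);
 *   six_lock_intent(&b->lock);
 *
 *   six_lock_write(&a->lock);
 *   ... modify a ...
 *   six_unlock_write(&a->lock);
 *
 *   six_lock_write(&b->lock);
 *   ... modify b ...
 *   six_unlock_write(&b->lock);
 *
 *   six_unlock_intent(&b->lock);
 *   six_unlock_intent(&a->lock);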
-+ * -+ * Example usage: -+ * six_lock_read(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * -+ * An intent lock must be held before taking a write lock: -+ * six_lock_intent(&foo->lock); -+ * six_lock_write(&foo->lock); -+ * six_unlock_write(&foo->lock); -+ * six_unlock_intent(&foo->lock); -+ * -+ * Other operations: -+ * six_trylock_read() -+ * six_trylock_intent() -+ * six_trylock_write() -+ * -+ * six_lock_downgrade() convert from intent to read -+ * six_lock_tryupgrade() attempt to convert from read to intent, may fail -+ * -+ * There are also interfaces that take the lock type as an enum: -+ * -+ * six_lock_type(&foo->lock, SIX_LOCK_read); -+ * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) -+ * six_lock_type(&foo->lock, SIX_LOCK_write); -+ * six_unlock_type(&foo->lock, SIX_LOCK_write); -+ * six_unlock_type(&foo->lock, SIX_LOCK_intent); -+ * -+ * Lock sequence numbers - unlock(), relock(): -+ * -+ * Locks embed sequences numbers, which are incremented on write lock/unlock. -+ * This allows locks to be dropped and the retaken iff the state they protect -+ * hasn't changed; this makes it much easier to avoid holding locks while e.g. -+ * doing IO or allocating memory. -+ * -+ * Example usage: -+ * six_lock_read(&foo->lock); -+ * u32 seq = six_lock_seq(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * -+ * some_operation_that_may_block(); -+ * -+ * if (six_relock_read(&foo->lock, seq)) { ... } -+ * -+ * If the relock operation succeeds, it is as if the lock was never unlocked. -+ * -+ * Reentrancy: -+ * -+ * Six locks are not by themselves reentrent, but have counters for both the -+ * read and intent states that can be used to provide reentrency by an upper -+ * layer that tracks held locks. If a lock is known to already be held in the -+ * read or intent state, six_lock_increment() can be used to bump the "lock -+ * held in this state" counter, increasing the number of unlock calls that -+ * will be required to fully unlock it. -+ * -+ * Example usage: -+ * six_lock_read(&foo->lock); -+ * six_lock_increment(&foo->lock, SIX_LOCK_read); -+ * six_unlock_read(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * foo->lock is now fully unlocked. -+ * -+ * Since the intent state supercedes read, it's legal to increment the read -+ * counter when holding an intent lock, but not the reverse. -+ * -+ * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) -+ * is not legal. -+ * -+ * should_sleep_fn: -+ * -+ * There is a six_lock() variant that takes a function pointer that is called -+ * immediately prior to schedule() when blocking, and may return an error to -+ * abort. -+ * -+ * One possible use for this feature is when objects being locked are part of -+ * a cache and may reused, and lock ordering is based on a property of the -+ * object that will change when the object is reused - i.e. logical key order. -+ * -+ * If looking up an object in the cache may race with object reuse, and lock -+ * ordering is required to prevent deadlock, object reuse may change the -+ * correct lock order for that object and cause a deadlock. should_sleep_fn -+ * can be used to check if the object is still the object we want and avoid -+ * this deadlock. -+ * -+ * Wait list entry interface: -+ * -+ * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a -+ * wait list entry. 
By embedding six_lock_waiter into another object, and by -+ * traversing lock waitlists, it is then possible for an upper layer to -+ * implement full cycle detection for deadlock avoidance. -+ * -+ * should_sleep_fn should be used for invoking the cycle detector, walking the -+ * graph of held locks to check for a deadlock. The upper layer must track -+ * held locks for each thread, and each thread's held locks must be reachable -+ * from its six_lock_waiter object. -+ * -+ * six_lock_waiter() will add the wait object to the waitlist re-trying taking -+ * the lock, and before calling should_sleep_fn, and the wait object will not -+ * be removed from the waitlist until either the lock has been successfully -+ * acquired, or we aborted because should_sleep_fn returned an error. -+ * -+ * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will -+ * have timestamps in strictly ascending order - this is so the timestamp can -+ * be used as a cursor for lock graph traverse. -+ */ -+ -+#include -+#include -+#include -+#include -+ -+enum six_lock_type { -+ SIX_LOCK_read, -+ SIX_LOCK_intent, -+ SIX_LOCK_write, -+}; -+ -+struct six_lock { -+ atomic_t state; -+ u32 seq; -+ unsigned intent_lock_recurse; -+ struct task_struct *owner; -+ unsigned __percpu *readers; -+ struct optimistic_spin_queue osq; -+ raw_spinlock_t wait_lock; -+ struct list_head wait_list; -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ struct lockdep_map dep_map; -+#endif -+}; -+ -+struct six_lock_waiter { -+ struct list_head list; -+ struct task_struct *task; -+ enum six_lock_type lock_want; -+ bool lock_acquired; -+ u64 start_time; -+}; -+ -+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); -+ -+void six_lock_exit(struct six_lock *lock); -+ -+enum six_lock_init_flags { -+ SIX_LOCK_INIT_PCPU = 1U << 0, -+}; -+ -+void __six_lock_init(struct six_lock *lock, const char *name, -+ struct lock_class_key *key, enum six_lock_init_flags flags); -+ -+/** -+ * six_lock_init - initialize a six lock -+ * @lock: lock to initialize -+ * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU -+ */ -+#define six_lock_init(lock, flags) \ -+do { \ -+ static struct lock_class_key __key; \ -+ \ -+ __six_lock_init((lock), #lock, &__key, flags); \ -+} while (0) -+ -+/** -+ * six_lock_seq - obtain current lock sequence number -+ * @lock: six_lock to obtain sequence number for -+ * -+ * @lock should be held for read or intent, and not write -+ * -+ * By saving the lock sequence number, we can unlock @lock and then (typically -+ * after some blocking operation) attempt to relock it: the relock will succeed -+ * if the sequence number hasn't changed, meaning no write locks have been taken -+ * and state corresponding to what @lock protects is still valid. -+ */ -+static inline u32 six_lock_seq(const struct six_lock *lock) -+{ -+ return lock->seq; -+} -+ -+bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); -+ -+/** -+ * six_trylock_type - attempt to take a six lock without blocking -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * -+ * Return: true on success, false on failure. 
-+ */ -+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ return six_trylock_ip(lock, type, _THIS_IP_); -+} -+ -+int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip); -+ -+/** -+ * six_lock_waiter - take a lock, with full waitlist interface -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @wait: pointer to wait object, which will be added to lock's waitlist -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * -+ * This is a convenience wrapper around six_lock_ip_waiter(), see that function -+ * for full documentation. -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, -+ struct six_lock_waiter *wait, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); -+} -+ -+/** -+ * six_lock_ip - take a six lock lock -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p, -+ unsigned long ip) -+{ -+ struct six_lock_waiter wait; -+ -+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); -+} -+ -+/** -+ * six_lock_type - take a six lock lock -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @should_sleep_fn: callback run after adding to waitlist, immediately prior -+ * to scheduling -+ * @p: passed through to @should_sleep_fn -+ * -+ * Return: 0 on success, or the return code from @should_sleep_fn on failure. -+ */ -+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ struct six_lock_waiter wait; -+ -+ return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); -+} -+ -+bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq, unsigned long ip); -+ -+/** -+ * six_relock_type - attempt to re-take a lock that was held previously -+ * @lock: lock to take -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * @seq: lock sequence number obtained from six_lock_seq() while lock was -+ * held previously -+ * -+ * Return: true on success, false on failure. -+ */ -+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq) -+{ -+ return six_relock_ip(lock, type, seq, _THIS_IP_); -+} -+ -+void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); -+ -+/** -+ * six_unlock_type - drop a six lock -+ * @lock: lock to unlock -+ * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write -+ * -+ * When a lock is held multiple times (because six_lock_incement()) was used), -+ * this decrements the 'lock held' counter by one. 
-+ * -+ * For example: -+ * six_lock_read(&foo->lock); read count 1 -+ * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 -+ * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 -+ */ -+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ six_unlock_ip(lock, type, _THIS_IP_); -+} -+ -+#define __SIX_LOCK(type) \ -+static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ -+{ \ -+ return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ -+} \ -+ \ -+static inline bool six_trylock_##type(struct six_lock *lock) \ -+{ \ -+ return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -+} \ -+ \ -+static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ -+ struct six_lock_waiter *wait, \ -+ six_lock_should_sleep_fn should_sleep_fn, void *p,\ -+ unsigned long ip) \ -+{ \ -+ return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ -+} \ -+ \ -+static inline int six_lock_ip_##type(struct six_lock *lock, \ -+ six_lock_should_sleep_fn should_sleep_fn, void *p, \ -+ unsigned long ip) \ -+{ \ -+ return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ -+} \ -+ \ -+static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ -+{ \ -+ return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ -+} \ -+ \ -+static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ -+{ \ -+ return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ -+} \ -+ \ -+static inline int six_lock_##type(struct six_lock *lock, \ -+ six_lock_should_sleep_fn fn, void *p)\ -+{ \ -+ return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ -+} \ -+ \ -+static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ -+{ \ -+ six_unlock_ip(lock, SIX_LOCK_##type, ip); \ -+} \ -+ \ -+static inline void six_unlock_##type(struct six_lock *lock) \ -+{ \ -+ six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ -+} -+ -+__SIX_LOCK(read) -+__SIX_LOCK(intent) -+__SIX_LOCK(write) -+#undef __SIX_LOCK -+ -+void six_lock_downgrade(struct six_lock *); -+bool six_lock_tryupgrade(struct six_lock *); -+bool six_trylock_convert(struct six_lock *, enum six_lock_type, -+ enum six_lock_type); -+ -+void six_lock_increment(struct six_lock *, enum six_lock_type); -+ -+void six_lock_wakeup_all(struct six_lock *); -+ -+struct six_lock_count { -+ unsigned n[3]; -+}; -+ -+struct six_lock_count six_lock_counts(struct six_lock *); -+void six_lock_readers_add(struct six_lock *, int); -+ -+#endif /* _LINUX_SIX_H */ -diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c -new file mode 100644 -index 000000000..9da099114 ---- /dev/null -+++ b/fs/bcachefs/snapshot.c -@@ -0,0 +1,1687 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_buf.h" -+#include "btree_key_cache.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "errcode.h" -+#include "error.h" -+#include "fs.h" -+#include "snapshot.h" -+ -+#include -+ -+/* -+ * Snapshot trees: -+ * -+ * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they -+ * exist to provide a stable identifier for the whole lifetime of a snapshot -+ * tree. 
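 *
 * Note that the root snapshot ID itself is not stable: when the current root
 * node is deleted, bch2_snapshot_node_delete() repoints root_snapshot at the
 * surviving child (or deletes the tree key if no nodes remain), while the
 * snapshot_tree key's offset never changes.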
-+ */ -+ -+void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); -+ -+ prt_printf(out, "subvol %u root snapshot %u", -+ le32_to_cpu(t.v->master_subvol), -+ le32_to_cpu(t.v->root_snapshot)); -+} -+ -+int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || -+ bkey_lt(k.k->p, POS(0, 1))) { -+ prt_printf(err, "bad pos"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, -+ struct bch_snapshot_tree *s) -+{ -+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), -+ BTREE_ITER_WITH_UPDATES, snapshot_tree, s); -+ -+ if (bch2_err_matches(ret, ENOENT)) -+ ret = -BCH_ERR_ENOENT_snapshot_tree; -+ return ret; -+} -+ -+struct bkey_i_snapshot_tree * -+__bch2_snapshot_tree_create(struct btree_trans *trans) -+{ -+ struct btree_iter iter; -+ int ret = bch2_bkey_get_empty_slot(trans, &iter, -+ BTREE_ID_snapshot_trees, POS(0, U32_MAX)); -+ struct bkey_i_snapshot_tree *s_t; -+ -+ if (ret == -BCH_ERR_ENOSPC_btree_slot) -+ ret = -BCH_ERR_ENOSPC_snapshot_tree; -+ if (ret) -+ return ERR_PTR(ret); -+ -+ s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); -+ ret = PTR_ERR_OR_ZERO(s_t); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret ? ERR_PTR(ret) : s_t; -+} -+ -+static int bch2_snapshot_tree_create(struct btree_trans *trans, -+ u32 root_id, u32 subvol_id, u32 *tree_id) -+{ -+ struct bkey_i_snapshot_tree *n_tree = -+ __bch2_snapshot_tree_create(trans); -+ -+ if (IS_ERR(n_tree)) -+ return PTR_ERR(n_tree); -+ -+ n_tree->v.master_subvol = cpu_to_le32(subvol_id); -+ n_tree->v.root_snapshot = cpu_to_le32(root_id); -+ *tree_id = n_tree->k.p.offset; -+ return 0; -+} -+ -+/* Snapshot nodes: */ -+ -+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) -+{ -+ struct snapshot_table *t; -+ -+ rcu_read_lock(); -+ t = rcu_dereference(c->snapshots); -+ -+ while (id && id < ancestor) -+ id = __snapshot_t(t, id)->parent; -+ rcu_read_unlock(); -+ -+ return id == ancestor; -+} -+ -+static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) -+{ -+ const struct snapshot_t *s = __snapshot_t(t, id); -+ -+ if (s->skip[2] <= ancestor) -+ return s->skip[2]; -+ if (s->skip[1] <= ancestor) -+ return s->skip[1]; -+ if (s->skip[0] <= ancestor) -+ return s->skip[0]; -+ return s->parent; -+} -+ -+bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -+{ -+ struct snapshot_table *t; -+ bool ret; -+ -+ EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); -+ -+ rcu_read_lock(); -+ t = rcu_dereference(c->snapshots); -+ -+ while (id && id < ancestor - IS_ANCESTOR_BITMAP) -+ id = get_ancestor_below(t, id, ancestor); -+ -+ if (id && id < ancestor) { -+ ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor); -+ -+ EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor)); -+ } else { -+ ret = id == ancestor; -+ } -+ -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+struct snapshot_t_free_rcu { -+ struct rcu_head rcu; -+ struct snapshot_table *t; -+}; -+ -+static void snapshot_t_free_rcu(struct rcu_head *rcu) -+{ -+ struct snapshot_t_free_rcu *free_rcu = -+ container_of(rcu, struct snapshot_t_free_rcu, rcu); -+ -+ kvfree(free_rcu->t); -+ kfree(free_rcu); -+} -+ -+static noinline struct snapshot_t 
*__snapshot_t_mut(struct bch_fs *c, u32 id) -+{ -+ size_t idx = U32_MAX - id; -+ size_t new_size; -+ struct snapshot_table *new, *old; -+ -+ new_size = max(16UL, roundup_pow_of_two(idx + 1)); -+ -+ new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); -+ if (!new) -+ return NULL; -+ -+ old = rcu_dereference_protected(c->snapshots, true); -+ if (old) -+ memcpy(new->s, -+ rcu_dereference_protected(c->snapshots, true)->s, -+ sizeof(new->s[0]) * c->snapshot_table_size); -+ -+ rcu_assign_pointer(c->snapshots, new); -+ c->snapshot_table_size = new_size; -+ if (old) { -+ struct snapshot_t_free_rcu *rcu = -+ kmalloc(sizeof(*rcu), GFP_KERNEL|__GFP_NOFAIL); -+ -+ rcu->t = old; -+ call_rcu(&rcu->rcu, snapshot_t_free_rcu); -+ } -+ -+ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; -+} -+ -+static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) -+{ -+ size_t idx = U32_MAX - id; -+ -+ lockdep_assert_held(&c->snapshot_table_lock); -+ -+ if (likely(idx < c->snapshot_table_size)) -+ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; -+ -+ return __snapshot_t_mut(c, id); -+} -+ -+void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); -+ -+ prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", -+ BCH_SNAPSHOT_SUBVOL(s.v), -+ BCH_SNAPSHOT_DELETED(s.v), -+ le32_to_cpu(s.v->parent), -+ le32_to_cpu(s.v->children[0]), -+ le32_to_cpu(s.v->children[1]), -+ le32_to_cpu(s.v->subvol), -+ le32_to_cpu(s.v->tree)); -+ -+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth)) -+ prt_printf(out, " depth %u skiplist %u %u %u", -+ le32_to_cpu(s.v->depth), -+ le32_to_cpu(s.v->skip[0]), -+ le32_to_cpu(s.v->skip[1]), -+ le32_to_cpu(s.v->skip[2])); -+} -+ -+int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ struct bkey_s_c_snapshot s; -+ u32 i, id; -+ -+ if (bkey_gt(k.k->p, POS(0, U32_MAX)) || -+ bkey_lt(k.k->p, POS(0, 1))) { -+ prt_printf(err, "bad pos"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ s = bkey_s_c_to_snapshot(k); -+ -+ id = le32_to_cpu(s.v->parent); -+ if (id && id <= k.k->p.offset) { -+ prt_printf(err, "bad parent node (%u <= %llu)", -+ id, k.k->p.offset); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { -+ prt_printf(err, "children not normalized"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (s.v->children[0] && -+ s.v->children[0] == s.v->children[1]) { -+ prt_printf(err, "duplicate child nodes"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ for (i = 0; i < 2; i++) { -+ id = le32_to_cpu(s.v->children[i]); -+ -+ if (id >= k.k->p.offset) { -+ prt_printf(err, "bad child node (%u >= %llu)", -+ id, k.k->p.offset); -+ return -BCH_ERR_invalid_bkey; -+ } -+ } -+ -+ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { -+ if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || -+ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { -+ prt_printf(err, "skiplist not normalized"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { -+ id = le32_to_cpu(s.v->skip[i]); -+ -+ if ((id && !s.v->parent) || -+ (id && id <= k.k->p.offset)) { -+ prt_printf(err, "bad skiplist node %u", id); -+ return -BCH_ERR_invalid_bkey; -+ } -+ } -+ } -+ -+ return 0; -+} -+ -+static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -+{ -+ 
struct snapshot_t *t = snapshot_t_mut(c, id); -+ u32 parent = id; -+ -+ while ((parent = bch2_snapshot_parent_early(c, parent)) && -+ parent - id - 1 < IS_ANCESTOR_BITMAP) -+ __set_bit(parent - id - 1, t->is_ancestor); -+} -+ -+static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id) -+{ -+ mutex_lock(&c->snapshot_table_lock); -+ __set_is_ancestor_bitmap(c, id); -+ mutex_unlock(&c->snapshot_table_lock); -+} -+ -+int bch2_mark_snapshot(struct btree_trans *trans, -+ enum btree_id btree, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct snapshot_t *t; -+ u32 id = new.k->p.offset; -+ int ret = 0; -+ -+ mutex_lock(&c->snapshot_table_lock); -+ -+ t = snapshot_t_mut(c, id); -+ if (!t) { -+ ret = -BCH_ERR_ENOMEM_mark_snapshot; -+ goto err; -+ } -+ -+ if (new.k->type == KEY_TYPE_snapshot) { -+ struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); -+ -+ t->parent = le32_to_cpu(s.v->parent); -+ t->children[0] = le32_to_cpu(s.v->children[0]); -+ t->children[1] = le32_to_cpu(s.v->children[1]); -+ t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; -+ t->tree = le32_to_cpu(s.v->tree); -+ -+ if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) { -+ t->depth = le32_to_cpu(s.v->depth); -+ t->skip[0] = le32_to_cpu(s.v->skip[0]); -+ t->skip[1] = le32_to_cpu(s.v->skip[1]); -+ t->skip[2] = le32_to_cpu(s.v->skip[2]); -+ } else { -+ t->depth = 0; -+ t->skip[0] = 0; -+ t->skip[1] = 0; -+ t->skip[2] = 0; -+ } -+ -+ __set_is_ancestor_bitmap(c, id); -+ -+ if (BCH_SNAPSHOT_DELETED(s.v)) { -+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+ c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_delete_dead_snapshots); -+ } -+ } else { -+ memset(t, 0, sizeof(*t)); -+ } -+err: -+ mutex_unlock(&c->snapshot_table_lock); -+ return ret; -+} -+ -+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, -+ struct bch_snapshot *s) -+{ -+ return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), -+ BTREE_ITER_WITH_UPDATES, snapshot, s); -+} -+ -+int bch2_snapshot_live(struct btree_trans *trans, u32 id) -+{ -+ struct bch_snapshot v; -+ int ret; -+ -+ if (!id) -+ return 0; -+ -+ ret = bch2_snapshot_lookup(trans, id, &v); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(trans->c, "snapshot node %u not found", id); -+ if (ret) -+ return ret; -+ -+ return !BCH_SNAPSHOT_DELETED(&v); -+} -+ -+/* -+ * If @k is a snapshot with just one live child, it's part of a linear chain, -+ * which we consider to be an equivalence class: and then after snapshot -+ * deletion cleanup, there should only be a single key at a given position in -+ * this equivalence class. -+ * -+ * This sets the equivalence class of @k to be the child's equivalence class, if -+ * it's part of such a linear chain: this correctly sets equivalence classes on -+ * startup if we run leaf to root (i.e. in natural key order). 
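 *
 * For example (hypothetical IDs), take the linear chain 100 -> 90 -> 80, with
 * 80 the leaf: processing in ascending key order sets equiv(80) = 80, then
 * equiv(90) = equiv(80) = 80, then equiv(100) = equiv(90) = 80, so the whole
 * chain collapses to the equivalence class of its leaf-most node.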
-+ */ -+int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned i, nr_live = 0, live_idx = 0; -+ struct bkey_s_c_snapshot snap; -+ u32 id = k.k->p.offset, child[2]; -+ -+ if (k.k->type != KEY_TYPE_snapshot) -+ return 0; -+ -+ snap = bkey_s_c_to_snapshot(k); -+ -+ child[0] = le32_to_cpu(snap.v->children[0]); -+ child[1] = le32_to_cpu(snap.v->children[1]); -+ -+ for (i = 0; i < 2; i++) { -+ int ret = bch2_snapshot_live(trans, child[i]); -+ -+ if (ret < 0) -+ return ret; -+ -+ if (ret) -+ live_idx = i; -+ nr_live += ret; -+ } -+ -+ mutex_lock(&c->snapshot_table_lock); -+ -+ snapshot_t_mut(c, id)->equiv = nr_live == 1 -+ ? snapshot_t_mut(c, child[live_idx])->equiv -+ : id; -+ -+ mutex_unlock(&c->snapshot_table_lock); -+ -+ return 0; -+} -+ -+/* fsck: */ -+ -+static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) -+{ -+ return snapshot_t(c, id)->children[child]; -+} -+ -+static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) -+{ -+ return bch2_snapshot_child(c, id, 0); -+} -+ -+static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) -+{ -+ return bch2_snapshot_child(c, id, 1); -+} -+ -+static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) -+{ -+ u32 n, parent; -+ -+ n = bch2_snapshot_left_child(c, id); -+ if (n) -+ return n; -+ -+ while ((parent = bch2_snapshot_parent(c, id))) { -+ n = bch2_snapshot_right_child(c, parent); -+ if (n && n != id) -+ return n; -+ id = parent; -+ } -+ -+ return 0; -+} -+ -+static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) -+{ -+ u32 id = snapshot_root; -+ u32 subvol = 0, s; -+ -+ while (id) { -+ s = snapshot_t(c, id)->subvol; -+ -+ if (s && (!subvol || s < subvol)) -+ subvol = s; -+ -+ id = bch2_snapshot_tree_next(c, id); -+ } -+ -+ return subvol; -+} -+ -+static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, -+ u32 snapshot_root, u32 *subvol_id) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_subvolume s; -+ bool found = false; -+ int ret; -+ -+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, -+ 0, k, ret) { -+ if (k.k->type != KEY_TYPE_subvolume) -+ continue; -+ -+ s = bkey_s_c_to_subvolume(k); -+ if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) -+ continue; -+ if (!BCH_SUBVOLUME_SNAP(s.v)) { -+ *subvol_id = s.k->p.offset; -+ found = true; -+ break; -+ } -+ } -+ -+ bch2_trans_iter_exit(trans, &iter); -+ -+ if (!ret && !found) { -+ struct bkey_i_subvolume *s; -+ -+ *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); -+ -+ s = bch2_bkey_get_mut_typed(trans, &iter, -+ BTREE_ID_subvolumes, POS(0, *subvol_id), -+ 0, subvolume); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (ret) -+ return ret; -+ -+ SET_BCH_SUBVOLUME_SNAP(&s->v, false); -+ } -+ -+ return ret; -+} -+ -+static int check_snapshot_tree(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c_snapshot_tree st; -+ struct bch_snapshot s; -+ struct bch_subvolume subvol; -+ struct printbuf buf = PRINTBUF; -+ u32 root_id; -+ int ret; -+ -+ if (k.k->type != KEY_TYPE_snapshot_tree) -+ return 0; -+ -+ st = bkey_s_c_to_snapshot_tree(k); -+ root_id = le32_to_cpu(st.v->root_snapshot); -+ -+ ret = bch2_snapshot_lookup(trans, root_id, &s); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ goto err; -+ -+ if (fsck_err_on(ret || -+ root_id != bch2_snapshot_root(c, root_id) || -+ 
st.k->p.offset != le32_to_cpu(s.tree), -+ c, -+ "snapshot tree points to missing/incorrect snapshot:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { -+ ret = bch2_btree_delete_at(trans, iter, 0); -+ goto err; -+ } -+ -+ ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), -+ false, 0, &subvol); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ goto err; -+ -+ if (fsck_err_on(ret, c, -+ "snapshot tree points to missing subvolume:\n %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || -+ fsck_err_on(!bch2_snapshot_is_ancestor_early(c, -+ le32_to_cpu(subvol.snapshot), -+ root_id), c, -+ "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || -+ fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, -+ "snapshot tree points to snapshot subvolume:\n %s", -+ (printbuf_reset(&buf), -+ bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { -+ struct bkey_i_snapshot_tree *u; -+ u32 subvol_id; -+ -+ ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); -+ if (ret) -+ goto err; -+ -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.master_subvol = cpu_to_le32(subvol_id); -+ st = snapshot_tree_i_to_s_c(u); -+ } -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+/* -+ * For each snapshot_tree, make sure it points to the root of a snapshot tree -+ * and that snapshot entry points back to it, or delete it. -+ * -+ * And, make sure it points to a subvolume within that snapshot tree, or correct -+ * it to point to the oldest subvolume within that snapshot tree. -+ */ -+int bch2_check_snapshot_trees(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ ret = bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_snapshot_trees, POS_MIN, -+ BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_snapshot_tree(&trans, &iter, k))); -+ -+ if (ret) -+ bch_err(c, "error %i checking snapshot trees", ret); -+ return ret; -+} -+ -+/* -+ * Look up snapshot tree for @tree_id and find root, -+ * make sure @snap_id is a descendent: -+ */ -+static int snapshot_tree_ptr_good(struct btree_trans *trans, -+ u32 snap_id, u32 tree_id) -+{ -+ struct bch_snapshot_tree s_t; -+ int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); -+ -+ if (bch2_err_matches(ret, ENOENT)) -+ return 0; -+ if (ret) -+ return ret; -+ -+ return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); -+} -+ -+u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *s; -+ -+ if (!id) -+ return 0; -+ -+ rcu_read_lock(); -+ s = snapshot_t(c, id); -+ if (s->parent) -+ id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) -+{ -+ unsigned i; -+ -+ for (i = 0; i < 3; i++) -+ if (!s.parent) { -+ if (s.skip[i]) -+ return false; -+ } else { -+ if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i]))) -+ return false; -+ } -+ -+ return true; -+} -+ -+/* -+ * snapshot_tree pointer was incorrect: look up root snapshot node, make sure -+ * its snapshot_tree pointer is correct (allocate new one if necessary), then -+ * update this node's pointer to root node's pointer: -+ */ 
-+static int snapshot_tree_ptr_repair(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ struct bch_snapshot *s) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter root_iter; -+ struct bch_snapshot_tree s_t; -+ struct bkey_s_c_snapshot root; -+ struct bkey_i_snapshot *u; -+ u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; -+ int ret; -+ -+ root = bch2_bkey_get_iter_typed(trans, &root_iter, -+ BTREE_ID_snapshots, POS(0, root_id), -+ BTREE_ITER_WITH_UPDATES, snapshot); -+ ret = bkey_err(root); -+ if (ret) -+ goto err; -+ -+ tree_id = le32_to_cpu(root.v->tree); -+ -+ ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); -+ if (ret && !bch2_err_matches(ret, ENOENT)) -+ return ret; -+ -+ if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { -+ u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u) ?: -+ bch2_snapshot_tree_create(trans, root_id, -+ bch2_snapshot_tree_oldest_subvol(c, root_id), -+ &tree_id); -+ if (ret) -+ goto err; -+ -+ u->v.tree = cpu_to_le32(tree_id); -+ if (k.k->p.offset == root_id) -+ *s = u->v; -+ } -+ -+ if (k.k->p.offset != root_id) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.tree = cpu_to_le32(tree_id); -+ *s = u->v; -+ } -+err: -+ bch2_trans_iter_exit(trans, &root_iter); -+ return ret; -+} -+ -+static int check_snapshot(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_snapshot s; -+ struct bch_subvolume subvol; -+ struct bch_snapshot v; -+ struct bkey_i_snapshot *u; -+ u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); -+ u32 real_depth; -+ struct printbuf buf = PRINTBUF; -+ bool should_have_subvol; -+ u32 i, id; -+ int ret = 0; -+ -+ if (k.k->type != KEY_TYPE_snapshot) -+ return 0; -+ -+ memset(&s, 0, sizeof(s)); -+ memcpy(&s, k.v, bkey_val_bytes(k.k)); -+ -+ id = le32_to_cpu(s.parent); -+ if (id) { -+ ret = bch2_snapshot_lookup(trans, id, &v); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(c, "snapshot with nonexistent parent:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ if (ret) -+ goto err; -+ -+ if (le32_to_cpu(v.children[0]) != k.k->p.offset && -+ le32_to_cpu(v.children[1]) != k.k->p.offset) { -+ bch_err(c, "snapshot parent %u missing pointer to child %llu", -+ id, k.k->p.offset); -+ ret = -EINVAL; -+ goto err; -+ } -+ } -+ -+ for (i = 0; i < 2 && s.children[i]; i++) { -+ id = le32_to_cpu(s.children[i]); -+ -+ ret = bch2_snapshot_lookup(trans, id, &v); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(c, "snapshot node %llu has nonexistent child %u", -+ k.k->p.offset, id); -+ if (ret) -+ goto err; -+ -+ if (le32_to_cpu(v.parent) != k.k->p.offset) { -+ bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", -+ id, le32_to_cpu(v.parent), k.k->p.offset); -+ ret = -EINVAL; -+ goto err; -+ } -+ } -+ -+ should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && -+ !BCH_SNAPSHOT_DELETED(&s); -+ -+ if (should_have_subvol) { -+ id = le32_to_cpu(s.subvol); -+ ret = bch2_subvolume_get(trans, id, 0, false, &subvol); -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(c, "snapshot points to nonexistent subvolume:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); -+ if (ret) -+ goto err; -+ -+ if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { -+ bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", -+ k.k->p.offset); -+ ret = -EINVAL; -+ goto 
err; -+ } -+ } else { -+ if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.subvol = 0; -+ s = u->v; -+ } -+ } -+ -+ ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); -+ if (ret < 0) -+ goto err; -+ -+ if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { -+ ret = snapshot_tree_ptr_repair(trans, iter, k, &s); -+ if (ret) -+ goto err; -+ } -+ ret = 0; -+ -+ real_depth = bch2_snapshot_depth(c, parent_id); -+ -+ if (le32_to_cpu(s.depth) != real_depth && -+ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || -+ fsck_err(c, "snapshot with incorrect depth field, should be %u:\n %s", -+ real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ u->v.depth = cpu_to_le32(real_depth); -+ s = u->v; -+ } -+ -+ ret = snapshot_skiplist_good(trans, k.k->p.offset, s); -+ if (ret < 0) -+ goto err; -+ -+ if (!ret && -+ (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || -+ fsck_err(c, "snapshot with bad skiplist field:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ goto err; -+ -+ for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) -+ u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id)); -+ -+ bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32); -+ s = u->v; -+ } -+ ret = 0; -+err: -+fsck_err: -+ printbuf_exit(&buf); -+ return ret; -+} -+ -+int bch2_check_snapshots(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ /* -+ * We iterate backwards as checking/fixing the depth field requires that -+ * the parent's depth already be correct: -+ */ -+ ret = bch2_trans_run(c, -+ for_each_btree_key_reverse_commit(&trans, iter, -+ BTREE_ID_snapshots, POS_MAX, -+ BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_snapshot(&trans, &iter, k))); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* -+ * Mark a snapshot as deleted, for future cleanup: -+ */ -+int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) -+{ -+ struct btree_iter iter; -+ struct bkey_i_snapshot *s; -+ int ret = 0; -+ -+ s = bch2_bkey_get_mut_typed(trans, &iter, -+ BTREE_ID_snapshots, POS(0, id), -+ 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (unlikely(ret)) { -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), -+ trans->c, "missing snapshot %u", id); -+ return ret; -+ } -+ -+ /* already deleted? 
*/ -+ if (BCH_SNAPSHOT_DELETED(&s->v)) -+ goto err; -+ -+ SET_BCH_SNAPSHOT_DELETED(&s->v, true); -+ SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); -+ s->v.subvol = 0; -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) -+{ -+ if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1])) -+ swap(s->children[0], s->children[1]); -+} -+ -+int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; -+ struct btree_iter c_iter = (struct btree_iter) { NULL }; -+ struct btree_iter tree_iter = (struct btree_iter) { NULL }; -+ struct bkey_s_c_snapshot s; -+ u32 parent_id, child_id; -+ unsigned i; -+ int ret = 0; -+ -+ s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), -+ BTREE_ITER_INTENT, snapshot); -+ ret = bkey_err(s); -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, -+ "missing snapshot %u", id); -+ -+ if (ret) -+ goto err; -+ -+ BUG_ON(s.v->children[1]); -+ -+ parent_id = le32_to_cpu(s.v->parent); -+ child_id = le32_to_cpu(s.v->children[0]); -+ -+ if (parent_id) { -+ struct bkey_i_snapshot *parent; -+ -+ parent = bch2_bkey_get_mut_typed(trans, &p_iter, -+ BTREE_ID_snapshots, POS(0, parent_id), -+ 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(parent); -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, -+ "missing snapshot %u", parent_id); -+ if (unlikely(ret)) -+ goto err; -+ -+ /* find entry in parent->children for node being deleted */ -+ for (i = 0; i < 2; i++) -+ if (le32_to_cpu(parent->v.children[i]) == id) -+ break; -+ -+ if (bch2_fs_inconsistent_on(i == 2, c, -+ "snapshot %u missing child pointer to %u", -+ parent_id, id)) -+ goto err; -+ -+ parent->v.children[i] = le32_to_cpu(child_id); -+ -+ normalize_snapshot_child_pointers(&parent->v); -+ } -+ -+ if (child_id) { -+ struct bkey_i_snapshot *child; -+ -+ child = bch2_bkey_get_mut_typed(trans, &c_iter, -+ BTREE_ID_snapshots, POS(0, child_id), -+ 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(child); -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, -+ "missing snapshot %u", child_id); -+ if (unlikely(ret)) -+ goto err; -+ -+ child->v.parent = cpu_to_le32(parent_id); -+ -+ if (!child->v.parent) { -+ child->v.skip[0] = 0; -+ child->v.skip[1] = 0; -+ child->v.skip[2] = 0; -+ } -+ } -+ -+ if (!parent_id) { -+ /* -+ * We're deleting the root of a snapshot tree: update the -+ * snapshot_tree entry to point to the new root, or delete it if -+ * this is the last snapshot ID in this tree: -+ */ -+ struct bkey_i_snapshot_tree *s_t; -+ -+ BUG_ON(s.v->children[1]); -+ -+ s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, -+ BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), -+ 0, snapshot_tree); -+ ret = PTR_ERR_OR_ZERO(s_t); -+ if (ret) -+ goto err; -+ -+ if (s.v->children[0]) { -+ s_t->v.root_snapshot = s.v->children[0]; -+ } else { -+ s_t->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&s_t->k, 0); -+ } -+ } -+ -+ ret = bch2_btree_delete_at(trans, &iter, 0); -+err: -+ bch2_trans_iter_exit(trans, &tree_iter); -+ bch2_trans_iter_exit(trans, &p_iter); -+ bch2_trans_iter_exit(trans, &c_iter); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_i_snapshot *n; -+ struct bkey_s_c k; -+ 
unsigned i, j; -+ u32 depth = bch2_snapshot_depth(c, parent); -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, -+ POS_MIN, BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ for (i = 0; i < nr_snapids; i++) { -+ k = bch2_btree_iter_prev_slot(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!k.k || !k.k->p.offset) { -+ ret = -BCH_ERR_ENOSPC_snapshot_create; -+ goto err; -+ } -+ -+ n = bch2_bkey_alloc(trans, &iter, 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (ret) -+ goto err; -+ -+ n->v.flags = 0; -+ n->v.parent = cpu_to_le32(parent); -+ n->v.subvol = cpu_to_le32(snapshot_subvols[i]); -+ n->v.tree = cpu_to_le32(tree); -+ n->v.depth = cpu_to_le32(depth); -+ -+ for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) -+ n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent)); -+ -+ bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32); -+ SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); -+ -+ ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, -+ bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); -+ if (ret) -+ goto err; -+ -+ new_snapids[i] = iter.pos.offset; -+ } -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/* -+ * Create new snapshot IDs as children of an existing snapshot ID: -+ */ -+static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ struct btree_iter iter; -+ struct bkey_i_snapshot *n_parent; -+ int ret = 0; -+ -+ n_parent = bch2_bkey_get_mut_typed(trans, &iter, -+ BTREE_ID_snapshots, POS(0, parent), -+ 0, snapshot); -+ ret = PTR_ERR_OR_ZERO(n_parent); -+ if (unlikely(ret)) { -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(trans->c, "snapshot %u not found", parent); -+ return ret; -+ } -+ -+ if (n_parent->v.children[0] || n_parent->v.children[1]) { -+ bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), -+ new_snapids, snapshot_subvols, nr_snapids); -+ if (ret) -+ goto err; -+ -+ n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); -+ n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); -+ n_parent->v.subvol = 0; -+ SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+/* -+ * Create a snapshot node that is the root of a new tree: -+ */ -+static int bch2_snapshot_node_create_tree(struct btree_trans *trans, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ struct bkey_i_snapshot_tree *n_tree; -+ int ret; -+ -+ n_tree = __bch2_snapshot_tree_create(trans); -+ ret = PTR_ERR_OR_ZERO(n_tree) ?: -+ create_snapids(trans, 0, n_tree->k.p.offset, -+ new_snapids, snapshot_subvols, nr_snapids); -+ if (ret) -+ return ret; -+ -+ n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); -+ n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); -+ return 0; -+} -+ -+int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, -+ u32 *new_snapids, -+ u32 *snapshot_subvols, -+ unsigned nr_snapids) -+{ -+ BUG_ON((parent == 0) != (nr_snapids == 1)); -+ BUG_ON((parent != 0) != (nr_snapids == 2)); -+ -+ return parent -+ ? 
bch2_snapshot_node_create_children(trans, parent, -+ new_snapids, snapshot_subvols, nr_snapids) -+ : bch2_snapshot_node_create_tree(trans, -+ new_snapids, snapshot_subvols, nr_snapids); -+ -+} -+ -+/* -+ * If we have an unlinked inode in an internal snapshot node, and the inode -+ * really has been deleted in all child snapshots, how does this get cleaned up? -+ * -+ * first there is the problem of how keys that have been overwritten in all -+ * child snapshots get deleted (unimplemented?), but inodes may perhaps be -+ * special? -+ * -+ * also: unlinked inode in internal snapshot appears to not be getting deleted -+ * correctly if inode doesn't exist in leaf snapshots -+ * -+ * solution: -+ * -+ * for a key in an interior snapshot node that needs work to be done that -+ * requires it to be mutated: iterate over all descendent leaf nodes and copy -+ * that key to snapshot leaf nodes, where we can mutate it -+ */ -+ -+static int snapshot_delete_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ snapshot_id_list *deleted, -+ snapshot_id_list *equiv_seen, -+ struct bpos *last_pos) -+{ -+ struct bch_fs *c = trans->c; -+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); -+ -+ if (!bkey_eq(k.k->p, *last_pos)) -+ equiv_seen->nr = 0; -+ *last_pos = k.k->p; -+ -+ if (snapshot_list_has_id(deleted, k.k->p.snapshot) || -+ snapshot_list_has_id(equiv_seen, equiv)) { -+ return bch2_btree_delete_at(trans, iter, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ } else { -+ return snapshot_list_add(c, equiv_seen, equiv); -+ } -+} -+ -+static int move_key_to_correct_snapshot(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); -+ -+ /* -+ * When we have a linear chain of snapshot nodes, we consider -+ * those to form an equivalence class: we're going to collapse -+ * them all down to a single node, and keep the leaf-most node - -+ * which has the same id as the equivalence class id. -+ * -+ * If there are multiple keys in different snapshots at the same -+ * position, we're only going to keep the one in the newest -+ * snapshot - the rest have been overwritten and are redundant, -+ * and for the key we're going to keep we need to move it to the -+ * equivalance class ID if it's not there already. -+ */ -+ if (equiv != k.k->p.snapshot) { -+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); -+ struct btree_iter new_iter; -+ int ret; -+ -+ ret = PTR_ERR_OR_ZERO(new); -+ if (ret) -+ return ret; -+ -+ new->k.p.snapshot = equiv; -+ -+ bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p, -+ BTREE_ITER_ALL_SNAPSHOTS| -+ BTREE_ITER_CACHED| -+ BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_iter_traverse(&new_iter) ?: -+ bch2_trans_update(trans, &new_iter, new, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: -+ bch2_btree_delete_at(trans, iter, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -+ bch2_trans_iter_exit(trans, &new_iter); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* -+ * For a given snapshot, if it doesn't have a subvolume that points to it, and -+ * it doesn't have child snapshot nodes - it's now redundant and we can mark it -+ * as deleted. 
-+ */ -+static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_snapshot snap; -+ u32 children[2]; -+ int ret; -+ -+ if (k.k->type != KEY_TYPE_snapshot) -+ return 0; -+ -+ snap = bkey_s_c_to_snapshot(k); -+ if (BCH_SNAPSHOT_DELETED(snap.v) || -+ BCH_SNAPSHOT_SUBVOL(snap.v)) -+ return 0; -+ -+ children[0] = le32_to_cpu(snap.v->children[0]); -+ children[1] = le32_to_cpu(snap.v->children[1]); -+ -+ ret = bch2_snapshot_live(trans, children[0]) ?: -+ bch2_snapshot_live(trans, children[1]); -+ if (ret < 0) -+ return ret; -+ -+ if (!ret) -+ return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); -+ return 0; -+} -+ -+static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, -+ snapshot_id_list *skip) -+{ -+ rcu_read_lock(); -+ while (n--) { -+ do { -+ id = __bch2_snapshot_parent(c, id); -+ } while (snapshot_list_has_id(skip, id)); -+ } -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, -+ struct btree_iter *iter, struct bkey_s_c k, -+ snapshot_id_list *deleted) -+{ -+ struct bch_fs *c = trans->c; -+ u32 nr_deleted_ancestors = 0; -+ struct bkey_i_snapshot *s; -+ u32 *i; -+ int ret; -+ -+ if (k.k->type != KEY_TYPE_snapshot) -+ return 0; -+ -+ if (snapshot_list_has_id(deleted, k.k->p.offset)) -+ return 0; -+ -+ s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (ret) -+ return ret; -+ -+ darray_for_each(*deleted, i) -+ nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i); -+ -+ if (!nr_deleted_ancestors) -+ return 0; -+ -+ le32_add_cpu(&s->v.depth, -nr_deleted_ancestors); -+ -+ if (!s->v.depth) { -+ s->v.skip[0] = 0; -+ s->v.skip[1] = 0; -+ s->v.skip[2] = 0; -+ } else { -+ u32 depth = le32_to_cpu(s->v.depth); -+ u32 parent = bch2_snapshot_parent(c, s->k.p.offset); -+ -+ for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) { -+ u32 id = le32_to_cpu(s->v.skip[j]); -+ -+ if (snapshot_list_has_id(deleted, id)) { -+ id = depth > 1 -+ ? 
bch2_snapshot_nth_parent_skip(c, -+ parent, -+ get_random_u32_below(depth - 1), -+ deleted) -+ : parent; -+ s->v.skip[j] = cpu_to_le32(id); -+ } -+ } -+ -+ bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32); -+ } -+ -+ return bch2_trans_update(trans, iter, &s->k_i, 0); -+} -+ -+int bch2_delete_dead_snapshots(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_snapshot snap; -+ snapshot_id_list deleted = { 0 }; -+ snapshot_id_list deleted_interior = { 0 }; -+ u32 *i, id; -+ int ret = 0; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) { -+ ret = bch2_fs_read_write_early(c); -+ if (ret) { -+ bch_err(c, "error deleleting dead snapshots: error going rw: %s", bch2_err_str(ret)); -+ return ret; -+ } -+ } -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ /* -+ * For every snapshot node: If we have no live children and it's not -+ * pointed to by a subvolume, delete it: -+ */ -+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, -+ NULL, NULL, 0, -+ bch2_delete_redundant_snapshot(&trans, &iter, k)); -+ if (ret) { -+ bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, -+ bch2_snapshot_set_equiv(&trans, k)); -+ if (ret) { -+ bch_err(c, "error in bch2_snapshots_set_equiv: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_snapshot) -+ continue; -+ -+ snap = bkey_s_c_to_snapshot(k); -+ if (BCH_SNAPSHOT_DELETED(snap.v)) { -+ ret = snapshot_list_add(c, &deleted, k.k->p.offset); -+ if (ret) -+ break; -+ } -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ if (ret) { -+ bch_err_msg(c, ret, "walking snapshots"); -+ goto err; -+ } -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ struct bpos last_pos = POS_MIN; -+ snapshot_id_list equiv_seen = { 0 }; -+ struct disk_reservation res = { 0 }; -+ -+ if (!btree_type_has_snapshots(id)) -+ continue; -+ -+ ret = for_each_btree_key_commit(&trans, iter, -+ id, POS_MIN, -+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, -+ &res, NULL, BTREE_INSERT_NOFAIL, -+ snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?: -+ for_each_btree_key_commit(&trans, iter, -+ id, POS_MIN, -+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, -+ &res, NULL, BTREE_INSERT_NOFAIL, -+ move_key_to_correct_snapshot(&trans, &iter, k)); -+ -+ bch2_disk_reservation_put(c, &res); -+ darray_exit(&equiv_seen); -+ -+ if (ret) { -+ bch_err_msg(c, ret, "deleting keys from dying snapshots"); -+ goto err; -+ } -+ } -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, ret) { -+ u32 snapshot = k.k->p.offset; -+ u32 equiv = bch2_snapshot_equiv(c, snapshot); -+ -+ if (equiv != snapshot) -+ snapshot_list_add(c, &deleted_interior, snapshot); -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ /* -+ * Fixing children of deleted snapshots can't be done completely -+ * atomically, if we crash between here and when we delete the interior -+ * nodes some depth fields will be off: -+ */ -+ ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, POS_MIN, -+ BTREE_ITER_INTENT, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL, -+ bch2_fix_child_of_deleted_snapshot(&trans, &iter, k, &deleted_interior)); -+ if (ret) -+ goto err; -+ -+ darray_for_each(deleted, i) { -+ ret = commit_do(&trans, NULL, NULL, 0, -+ bch2_snapshot_node_delete(&trans, *i)); -+ if (ret) { -+ 
bch_err_msg(c, ret, "deleting snapshot %u", *i); -+ goto err; -+ } -+ } -+ -+ darray_for_each(deleted_interior, i) { -+ ret = commit_do(&trans, NULL, NULL, 0, -+ bch2_snapshot_node_delete(&trans, *i)); -+ if (ret) { -+ bch_err_msg(c, ret, "deleting snapshot %u", *i); -+ goto err; -+ } -+ } -+ -+ clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+err: -+ darray_exit(&deleted_interior); -+ darray_exit(&deleted); -+ bch2_trans_exit(&trans); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+void bch2_delete_dead_snapshots_work(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); -+ -+ if (test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) -+ bch2_delete_dead_snapshots(c); -+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -+} -+ -+void bch2_delete_dead_snapshots_async(struct bch_fs *c) -+{ -+ if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && -+ !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) -+ bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); -+} -+ -+int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, -+ struct btree_trans_commit_hook *h) -+{ -+ struct bch_fs *c = trans->c; -+ -+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+ -+ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) -+ return 0; -+ -+ bch2_delete_dead_snapshots_async(c); -+ return 0; -+} -+ -+int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, -+ enum btree_id id, -+ struct bpos pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_iter_init(trans, &iter, id, pos, -+ BTREE_ITER_NOT_EXTENTS| -+ BTREE_ITER_ALL_SNAPSHOTS); -+ while (1) { -+ k = bch2_btree_iter_prev(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ if (!k.k) -+ break; -+ -+ if (!bkey_eq(pos, k.k->p)) -+ break; -+ -+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { -+ ret = 1; -+ break; -+ } -+ } -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+} -+ -+static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *s = snapshot_t(c, id); -+ -+ return s->children[1] ?: s->children[0]; -+} -+ -+static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id) -+{ -+ u32 child; -+ -+ while ((child = bch2_snapshot_smallest_child(c, id))) -+ id = child; -+ return id; -+} -+ -+static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans, -+ enum btree_id btree, -+ struct bkey_s_c interior_k, -+ u32 leaf_id, struct bpos *new_min_pos) -+{ -+ struct btree_iter iter; -+ struct bpos pos = interior_k.k->p; -+ struct bkey_s_c k; -+ struct bkey_i *new; -+ int ret; -+ -+ pos.snapshot = leaf_id; -+ -+ bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek_slot(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto out; -+ -+ /* key already overwritten in this snapshot? 
*/ -+ if (k.k->p.snapshot != interior_k.k->p.snapshot) -+ goto out; -+ -+ if (bpos_eq(*new_min_pos, POS_MIN)) { -+ *new_min_pos = k.k->p; -+ new_min_pos->snapshot = leaf_id; -+ } -+ -+ new = bch2_bkey_make_mut_noupdate(trans, interior_k); -+ ret = PTR_ERR_OR_ZERO(new); -+ if (ret) -+ goto out; -+ -+ new->k.p.snapshot = leaf_id; -+ ret = bch2_trans_update(trans, &iter, new, 0); -+out: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans, -+ enum btree_id btree, -+ struct bkey_s_c k, -+ struct bpos *new_min_pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_buf sk; -+ int ret; -+ -+ bch2_bkey_buf_init(&sk); -+ bch2_bkey_buf_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ *new_min_pos = POS_MIN; -+ -+ for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot); -+ id < k.k->p.snapshot; -+ id++) { -+ if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) || -+ !bch2_snapshot_is_leaf(c, id)) -+ continue; -+ -+ ret = commit_do(trans, NULL, NULL, 0, -+ bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos)); -+ if (ret) -+ break; -+ } -+ -+ bch2_bkey_buf_exit(&sk, c); -+ return ret; -+} -+ -+int bch2_snapshots_read(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ ret = bch2_trans_run(c, -+ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, -+ bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: -+ bch2_snapshot_set_equiv(&trans, k)) ?: -+ for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, -+ POS_MIN, 0, k, -+ (set_is_ancestor_bitmap(c, k.k->p.offset), 0))); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+void bch2_fs_snapshots_exit(struct bch_fs *c) -+{ -+ kfree(rcu_dereference_protected(c->snapshots, true)); -+} -diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h -new file mode 100644 -index 000000000..dabc9b9d9 ---- /dev/null -+++ b/fs/bcachefs/snapshot.h -@@ -0,0 +1,272 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SNAPSHOT_H -+#define _BCACHEFS_SNAPSHOT_H -+ -+enum bkey_invalid_flags; -+ -+void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+ -+#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ -+ .key_invalid = bch2_snapshot_tree_invalid, \ -+ .val_to_text = bch2_snapshot_tree_to_text, \ -+ .min_val_size = 8, \ -+}) -+ -+struct bkey_i_snapshot_tree *__bch2_snapshot_tree_create(struct btree_trans *); -+ -+int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); -+ -+void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, -+ struct bkey_s_c, struct bkey_s_c, unsigned); -+ -+#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ -+ .key_invalid = bch2_snapshot_invalid, \ -+ .val_to_text = bch2_snapshot_to_text, \ -+ .atomic_trigger = bch2_mark_snapshot, \ -+ .min_val_size = 24, \ -+}) -+ -+static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) -+{ -+ return &t->s[U32_MAX - id]; -+} -+ -+static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) -+{ -+ return __snapshot_t(rcu_dereference(c->snapshots), id); -+} -+ 
-+static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = snapshot_t(c, id)->tree; -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -+{ -+ return snapshot_t(c, id)->parent; -+} -+ -+static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = __bch2_snapshot_parent_early(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ u32 parent = snapshot_t(c, id)->parent; -+ -+ if (parent && -+ snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) -+ panic("id %u depth=%u parent %u depth=%u\n", -+ id, snapshot_t(c, id)->depth, -+ parent, snapshot_t(c, parent)->depth); -+ -+ return parent; -+#else -+ return snapshot_t(c, id)->parent; -+#endif -+} -+ -+static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = __bch2_snapshot_parent(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) -+{ -+ rcu_read_lock(); -+ while (n--) -+ id = __bch2_snapshot_parent(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); -+ -+static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) -+{ -+ u32 parent; -+ -+ rcu_read_lock(); -+ while ((parent = __bch2_snapshot_parent(c, id))) -+ id = parent; -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) -+{ -+ return snapshot_t(c, id)->equiv; -+} -+ -+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) -+{ -+ rcu_read_lock(); -+ id = __bch2_snapshot_equiv(c, id); -+ rcu_read_unlock(); -+ -+ return id; -+} -+ -+static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) -+{ -+ return id == bch2_snapshot_equiv(c, id); -+} -+ -+static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *s; -+ bool ret; -+ -+ rcu_read_lock(); -+ s = snapshot_t(c, id); -+ ret = s->children[0]; -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) -+{ -+ return !bch2_snapshot_is_internal_node(c, id); -+} -+ -+static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *s; -+ u32 parent = __bch2_snapshot_parent(c, id); -+ -+ if (!parent) -+ return 0; -+ -+ s = snapshot_t(c, __bch2_snapshot_parent(c, id)); -+ if (id == s->children[0]) -+ return s->children[1]; -+ if (id == s->children[1]) -+ return s->children[0]; -+ return 0; -+} -+ -+static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) -+{ -+ u32 depth; -+ -+ rcu_read_lock(); -+ depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; -+ rcu_read_unlock(); -+ -+ return depth; -+} -+ -+bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); -+ -+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -+{ -+ return id == ancestor -+ ? 
true -+ : __bch2_snapshot_is_ancestor(c, id, ancestor); -+} -+ -+static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) -+{ -+ const struct snapshot_t *t; -+ bool ret; -+ -+ rcu_read_lock(); -+ t = snapshot_t(c, id); -+ ret = (t->children[0]|t->children[1]) != 0; -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) -+{ -+ u32 *i; -+ -+ darray_for_each(*s, i) -+ if (*i == id) -+ return true; -+ return false; -+} -+ -+static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) -+{ -+ u32 *i; -+ -+ darray_for_each(*s, i) -+ if (bch2_snapshot_is_ancestor(c, id, *i)) -+ return true; -+ return false; -+} -+ -+static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) -+{ -+ int ret; -+ -+ BUG_ON(snapshot_list_has_id(s, id)); -+ ret = darray_push(s, id); -+ if (ret) -+ bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); -+ return ret; -+} -+ -+int bch2_snapshot_lookup(struct btree_trans *trans, u32 id, -+ struct bch_snapshot *s); -+int bch2_snapshot_get_subvol(struct btree_trans *, u32, -+ struct bch_subvolume *); -+int bch2_snapshot_live(struct btree_trans *trans, u32 id); -+int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k); -+ -+/* only exported for tests: */ -+int bch2_snapshot_node_create(struct btree_trans *, u32, -+ u32 *, u32 *, unsigned); -+ -+int bch2_check_snapshot_trees(struct bch_fs *); -+int bch2_check_snapshots(struct bch_fs *); -+ -+int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); -+int bch2_delete_dead_snapshots_hook(struct btree_trans *, -+ struct btree_trans_commit_hook *); -+void bch2_delete_dead_snapshots_work(struct work_struct *); -+ -+int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); -+ -+static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, -+ enum btree_id id, -+ struct bpos pos) -+{ -+ if (!btree_type_has_snapshots(id) || -+ bch2_snapshot_is_leaf(trans->c, pos.snapshot)) -+ return 0; -+ -+ return __bch2_key_has_snapshot_overwrites(trans, id, pos); -+} -+ -+int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id, -+ struct bkey_s_c, struct bpos *); -+ -+int bch2_snapshots_read(struct bch_fs *); -+void bch2_fs_snapshots_exit(struct bch_fs *); -+ -+#endif /* _BCACHEFS_SNAPSHOT_H */ -diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h -new file mode 100644 -index 000000000..ae21a8cca ---- /dev/null -+++ b/fs/bcachefs/str_hash.h -@@ -0,0 +1,370 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_STR_HASH_H -+#define _BCACHEFS_STR_HASH_H -+ -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "checksum.h" -+#include "error.h" -+#include "inode.h" -+#include "siphash.h" -+#include "subvolume.h" -+#include "super.h" -+ -+#include -+#include -+#include -+ -+static inline enum bch_str_hash_type -+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) -+{ -+ switch (opt) { -+ case BCH_STR_HASH_OPT_crc32c: -+ return BCH_STR_HASH_crc32c; -+ case BCH_STR_HASH_OPT_crc64: -+ return BCH_STR_HASH_crc64; -+ case BCH_STR_HASH_OPT_siphash: -+ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) -+ ? BCH_STR_HASH_siphash -+ : BCH_STR_HASH_siphash_old; -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_hash_info { -+ u8 type; -+ /* -+ * For crc32 or crc64 string hashes the first key value of -+ * the siphash_key (k0) is used as the key. 
-+ */ -+ SIPHASH_KEY siphash_key; -+}; -+ -+static inline struct bch_hash_info -+bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) -+{ -+ /* XXX ick */ -+ struct bch_hash_info info = { -+ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & -+ ~(~0U << INODE_STR_HASH_BITS), -+ .siphash_key = { .k0 = bi->bi_hash_seed } -+ }; -+ -+ if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { -+ SHASH_DESC_ON_STACK(desc, c->sha256); -+ u8 digest[SHA256_DIGEST_SIZE]; -+ -+ desc->tfm = c->sha256; -+ -+ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, -+ sizeof(bi->bi_hash_seed), digest); -+ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); -+ } -+ -+ return info; -+} -+ -+struct bch_str_hash_ctx { -+ union { -+ u32 crc32c; -+ u64 crc64; -+ SIPHASH_CTX siphash; -+ }; -+}; -+ -+static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_crc32c: -+ ctx->crc32c = crc32c(~0, &info->siphash_key.k0, -+ sizeof(info->siphash_key.k0)); -+ break; -+ case BCH_STR_HASH_crc64: -+ ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, -+ sizeof(info->siphash_key.k0)); -+ break; -+ case BCH_STR_HASH_siphash_old: -+ case BCH_STR_HASH_siphash: -+ SipHash24_Init(&ctx->siphash, &info->siphash_key); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info, -+ const void *data, size_t len) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_crc32c: -+ ctx->crc32c = crc32c(ctx->crc32c, data, len); -+ break; -+ case BCH_STR_HASH_crc64: -+ ctx->crc64 = crc64_be(ctx->crc64, data, len); -+ break; -+ case BCH_STR_HASH_siphash_old: -+ case BCH_STR_HASH_siphash: -+ SipHash24_Update(&ctx->siphash, data, len); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_crc32c: -+ return ctx->crc32c; -+ case BCH_STR_HASH_crc64: -+ return ctx->crc64 >> 1; -+ case BCH_STR_HASH_siphash_old: -+ case BCH_STR_HASH_siphash: -+ return SipHash24_End(&ctx->siphash) >> 1; -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_hash_desc { -+ enum btree_id btree_id; -+ u8 key_type; -+ -+ u64 (*hash_key)(const struct bch_hash_info *, const void *); -+ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); -+ bool (*cmp_key)(struct bkey_s_c, const void *); -+ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); -+ bool (*is_visible)(subvol_inum inum, struct bkey_s_c); -+}; -+ -+static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) -+{ -+ return k.k->type == desc.key_type && -+ (!desc.is_visible || -+ !inum.inum || -+ desc.is_visible(inum, k)); -+} -+ -+static __always_inline int -+bch2_hash_lookup(struct btree_trans *trans, -+ struct btree_iter *iter, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ subvol_inum inum, const void *key, -+ unsigned flags) -+{ -+ struct bkey_s_c k; -+ u32 snapshot; -+ int ret; -+ -+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); -+ if (ret) -+ return ret; -+ -+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, -+ SPOS(inum.inum, desc.hash_key(info, key), snapshot), -+ POS(inum.inum, U64_MAX), -+ BTREE_ITER_SLOTS|flags, k, ret) { -+ if (is_visible_key(desc, inum, k)) { -+ if (!desc.cmp_key(k, key)) -+ return 0; -+ } else if (k.k->type == KEY_TYPE_hash_whiteout) { 
-+ ; -+ } else { -+ /* hole, not found */ -+ break; -+ } -+ } -+ bch2_trans_iter_exit(trans, iter); -+ -+ return ret ?: -BCH_ERR_ENOENT_str_hash_lookup; -+} -+ -+static __always_inline int -+bch2_hash_hole(struct btree_trans *trans, -+ struct btree_iter *iter, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ subvol_inum inum, const void *key) -+{ -+ struct bkey_s_c k; -+ u32 snapshot; -+ int ret; -+ -+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); -+ if (ret) -+ return ret; -+ -+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, -+ SPOS(inum.inum, desc.hash_key(info, key), snapshot), -+ POS(inum.inum, U64_MAX), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) -+ if (!is_visible_key(desc, inum, k)) -+ return 0; -+ bch2_trans_iter_exit(trans, iter); -+ -+ return ret ?: -BCH_ERR_ENOSPC_str_hash_create; -+} -+ -+static __always_inline -+int bch2_hash_needs_whiteout(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ struct btree_iter *start) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_copy_iter(&iter, start); -+ -+ bch2_btree_iter_advance(&iter); -+ -+ for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { -+ if (k.k->type != desc.key_type && -+ k.k->type != KEY_TYPE_hash_whiteout) -+ break; -+ -+ if (k.k->type == desc.key_type && -+ desc.hash_bkey(info, k) <= start->pos.offset) { -+ ret = 1; -+ break; -+ } -+ } -+ -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static __always_inline -+int bch2_hash_set_snapshot(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ subvol_inum inum, u32 snapshot, -+ struct bkey_i *insert, -+ int flags, -+ int update_flags) -+{ -+ struct btree_iter iter, slot = { NULL }; -+ struct bkey_s_c k; -+ bool found = false; -+ int ret; -+ -+ for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, -+ SPOS(insert->k.p.inode, -+ desc.hash_bkey(info, bkey_i_to_s_c(insert)), -+ snapshot), -+ POS(insert->k.p.inode, U64_MAX), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (is_visible_key(desc, inum, k)) { -+ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) -+ goto found; -+ -+ /* hash collision: */ -+ continue; -+ } -+ -+ if (!slot.path && -+ !(flags & BCH_HASH_SET_MUST_REPLACE)) -+ bch2_trans_copy_iter(&slot, &iter); -+ -+ if (k.k->type != KEY_TYPE_hash_whiteout) -+ goto not_found; -+ } -+ -+ if (!ret) -+ ret = -BCH_ERR_ENOSPC_str_hash_create; -+out: -+ bch2_trans_iter_exit(trans, &slot); -+ bch2_trans_iter_exit(trans, &iter); -+ -+ return ret; -+found: -+ found = true; -+not_found: -+ -+ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { -+ ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; -+ } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { -+ ret = -EEXIST; -+ } else { -+ if (!found && slot.path) -+ swap(iter, slot); -+ -+ insert->k.p = iter.pos; -+ ret = bch2_trans_update(trans, &iter, insert, 0); -+ } -+ -+ goto out; -+} -+ -+static __always_inline -+int bch2_hash_set(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ subvol_inum inum, -+ struct bkey_i *insert, int flags) -+{ -+ u32 snapshot; -+ int ret; -+ -+ ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); -+ if (ret) -+ return ret; -+ -+ insert->k.p.inode = inum.inum; -+ -+ return bch2_hash_set_snapshot(trans, desc, info, inum, -+ snapshot, insert, flags, 0); -+} -+ -+static __always_inline -+int 
bch2_hash_delete_at(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ struct btree_iter *iter, -+ unsigned update_flags) -+{ -+ struct bkey_i *delete; -+ int ret; -+ -+ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); -+ ret = PTR_ERR_OR_ZERO(delete); -+ if (ret) -+ return ret; -+ -+ ret = bch2_hash_needs_whiteout(trans, desc, info, iter); -+ if (ret < 0) -+ return ret; -+ -+ bkey_init(&delete->k); -+ delete->k.p = iter->pos; -+ delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; -+ -+ return bch2_trans_update(trans, iter, delete, update_flags); -+} -+ -+static __always_inline -+int bch2_hash_delete(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ subvol_inum inum, const void *key) -+{ -+ struct btree_iter iter; -+ int ret; -+ -+ ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, -+ BTREE_ITER_INTENT); -+ if (ret) -+ return ret; -+ -+ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+#endif /* _BCACHEFS_STR_HASH_H */ -diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c -new file mode 100644 -index 000000000..0214a98de ---- /dev/null -+++ b/fs/bcachefs/subvolume.c -@@ -0,0 +1,451 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_key_cache.h" -+#include "btree_update.h" -+#include "errcode.h" -+#include "error.h" -+#include "fs.h" -+#include "snapshot.h" -+#include "subvolume.h" -+ -+#include -+ -+static int bch2_subvolume_delete(struct btree_trans *, u32); -+ -+static int check_subvol(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c_subvolume subvol; -+ struct bch_snapshot snapshot; -+ unsigned snapid; -+ int ret = 0; -+ -+ if (k.k->type != KEY_TYPE_subvolume) -+ return 0; -+ -+ subvol = bkey_s_c_to_subvolume(k); -+ snapid = le32_to_cpu(subvol.v->snapshot); -+ ret = bch2_snapshot_lookup(trans, snapid, &snapshot); -+ -+ if (bch2_err_matches(ret, ENOENT)) -+ bch_err(c, "subvolume %llu points to nonexistent snapshot %u", -+ k.k->p.offset, snapid); -+ if (ret) -+ return ret; -+ -+ if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { -+ bch2_fs_lazy_rw(c); -+ -+ ret = bch2_subvolume_delete(trans, iter->pos.offset); -+ if (ret) -+ bch_err(c, "error deleting subvolume %llu: %s", -+ iter->pos.offset, bch2_err_str(ret)); -+ return ret ?: -BCH_ERR_transaction_restart_nested; -+ } -+ -+ if (!BCH_SUBVOLUME_SNAP(subvol.v)) { -+ u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); -+ u32 snapshot_tree; -+ struct bch_snapshot_tree st; -+ -+ rcu_read_lock(); -+ snapshot_tree = snapshot_t(c, snapshot_root)->tree; -+ rcu_read_unlock(); -+ -+ ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); -+ -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, -+ "%s: snapshot tree %u not found", __func__, snapshot_tree); -+ -+ if (ret) -+ return ret; -+ -+ if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c, -+ "subvolume %llu is not set as snapshot but is not master subvolume", -+ k.k->p.offset)) { -+ struct bkey_i_subvolume *s = -+ bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (ret) -+ return ret; -+ -+ SET_BCH_SUBVOLUME_SNAP(&s->v, true); -+ } -+ } -+ -+fsck_err: -+ return ret; -+} -+ -+int bch2_check_subvols(struct bch_fs *c) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret; 
-+ -+ ret = bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, -+ check_subvol(&trans, &iter, k))); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+/* Subvolumes: */ -+ -+int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) -+{ -+ if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || -+ bkey_gt(k.k->p, SUBVOL_POS_MAX)) { -+ prt_printf(err, "invalid pos"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); -+ -+ prt_printf(out, "root %llu snapshot id %u", -+ le64_to_cpu(s.v->inode), -+ le32_to_cpu(s.v->snapshot)); -+ -+ if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) -+ prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); -+} -+ -+static __always_inline int -+bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, -+ bool inconsistent_if_not_found, -+ int iter_flags, -+ struct bch_subvolume *s) -+{ -+ int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), -+ iter_flags, subvolume, s); -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && -+ inconsistent_if_not_found, -+ trans->c, "missing subvolume %u", subvol); -+ return ret; -+} -+ -+int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, -+ bool inconsistent_if_not_found, -+ int iter_flags, -+ struct bch_subvolume *s) -+{ -+ return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); -+} -+ -+int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, -+ struct bch_subvolume *subvol) -+{ -+ struct bch_snapshot snap; -+ -+ return bch2_snapshot_lookup(trans, snapshot, &snap) ?: -+ bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); -+} -+ -+int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, -+ u32 *snapid) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c_subvolume subvol; -+ int ret; -+ -+ subvol = bch2_bkey_get_iter_typed(trans, &iter, -+ BTREE_ID_subvolumes, POS(0, subvolid), -+ BTREE_ITER_CACHED|BTREE_ITER_WITH_UPDATES, -+ subvolume); -+ ret = bkey_err(subvol); -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, -+ "missing subvolume %u", subvolid); -+ -+ if (likely(!ret)) -+ *snapid = le32_to_cpu(subvol.v->snapshot); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int bch2_subvolume_reparent(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, -+ u32 old_parent, u32 new_parent) -+{ -+ struct bkey_i_subvolume *s; -+ int ret; -+ -+ if (k.k->type != KEY_TYPE_subvolume) -+ return 0; -+ -+ if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && -+ le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) -+ return 0; -+ -+ s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (ret) -+ return ret; -+ -+ s->v.parent = cpu_to_le32(new_parent); -+ return 0; -+} -+ -+/* -+ * Separate from the snapshot tree in the snapshots btree, we record the tree -+ * structure of how snapshot subvolumes were created - the parent subvolume of -+ * each snapshot subvolume. 
-+ * -+ * When a subvolume is deleted, we scan for child subvolumes and reparant them, -+ * to avoid dangling references: -+ */ -+static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_subvolume s; -+ -+ return lockrestart_do(trans, -+ bch2_subvolume_get(trans, subvolid_to_delete, true, -+ BTREE_ITER_CACHED, &s)) ?: -+ for_each_btree_key_commit(trans, iter, -+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, -+ NULL, NULL, BTREE_INSERT_NOFAIL, -+ bch2_subvolume_reparent(trans, &iter, k, -+ subvolid_to_delete, le32_to_cpu(s.parent))); -+} -+ -+/* -+ * Delete subvolume, mark snapshot ID as deleted, queue up snapshot -+ * deletion/cleanup: -+ */ -+static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c_subvolume subvol; -+ struct btree_trans_commit_hook *h; -+ u32 snapid; -+ int ret = 0; -+ -+ subvol = bch2_bkey_get_iter_typed(trans, &iter, -+ BTREE_ID_subvolumes, POS(0, subvolid), -+ BTREE_ITER_CACHED|BTREE_ITER_INTENT, -+ subvolume); -+ ret = bkey_err(subvol); -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, -+ "missing subvolume %u", subvolid); -+ if (ret) -+ return ret; -+ -+ snapid = le32_to_cpu(subvol.v->snapshot); -+ -+ ret = bch2_btree_delete_at(trans, &iter, 0); -+ if (ret) -+ goto err; -+ -+ ret = bch2_snapshot_node_set_deleted(trans, snapid); -+ if (ret) -+ goto err; -+ -+ h = bch2_trans_kmalloc(trans, sizeof(*h)); -+ ret = PTR_ERR_OR_ZERO(h); -+ if (ret) -+ goto err; -+ -+ h->fn = bch2_delete_dead_snapshots_hook; -+ bch2_trans_commit_hook(trans, h); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) -+{ -+ return bch2_subvolumes_reparent(trans, subvolid) ?: -+ commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, -+ __bch2_subvolume_delete(trans, subvolid)); -+} -+ -+static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, -+ snapshot_wait_for_pagecache_and_delete_work); -+ snapshot_id_list s; -+ u32 *id; -+ int ret = 0; -+ -+ while (!ret) { -+ mutex_lock(&c->snapshots_unlinked_lock); -+ s = c->snapshots_unlinked; -+ darray_init(&c->snapshots_unlinked); -+ mutex_unlock(&c->snapshots_unlinked_lock); -+ -+ if (!s.nr) -+ break; -+ -+ bch2_evict_subvolume_inodes(c, &s); -+ -+ for (id = s.data; id < s.data + s.nr; id++) { -+ ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id)); -+ if (ret) { -+ bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret)); -+ break; -+ } -+ } -+ -+ darray_exit(&s); -+ } -+ -+ bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); -+} -+ -+struct subvolume_unlink_hook { -+ struct btree_trans_commit_hook h; -+ u32 subvol; -+}; -+ -+static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, -+ struct btree_trans_commit_hook *_h) -+{ -+ struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); -+ struct bch_fs *c = trans->c; -+ int ret = 0; -+ -+ mutex_lock(&c->snapshots_unlinked_lock); -+ if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) -+ ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); -+ mutex_unlock(&c->snapshots_unlinked_lock); -+ -+ if (ret) -+ return ret; -+ -+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) -+ return -EROFS; -+ -+ if 
(!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) -+ bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); -+ return 0; -+} -+ -+int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) -+{ -+ struct btree_iter iter; -+ struct bkey_i_subvolume *n; -+ struct subvolume_unlink_hook *h; -+ int ret = 0; -+ -+ h = bch2_trans_kmalloc(trans, sizeof(*h)); -+ ret = PTR_ERR_OR_ZERO(h); -+ if (ret) -+ return ret; -+ -+ h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; -+ h->subvol = subvolid; -+ bch2_trans_commit_hook(trans, &h->h); -+ -+ n = bch2_bkey_get_mut_typed(trans, &iter, -+ BTREE_ID_subvolumes, POS(0, subvolid), -+ BTREE_ITER_CACHED, subvolume); -+ ret = PTR_ERR_OR_ZERO(n); -+ if (unlikely(ret)) { -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, -+ "missing subvolume %u", subvolid); -+ return ret; -+ } -+ -+ SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+int bch2_subvolume_create(struct btree_trans *trans, u64 inode, -+ u32 src_subvolid, -+ u32 *new_subvolid, -+ u32 *new_snapshotid, -+ bool ro) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; -+ struct bkey_i_subvolume *new_subvol = NULL; -+ struct bkey_i_subvolume *src_subvol = NULL; -+ u32 parent = 0, new_nodes[2], snapshot_subvols[2]; -+ int ret = 0; -+ -+ ret = bch2_bkey_get_empty_slot(trans, &dst_iter, -+ BTREE_ID_subvolumes, POS(0, U32_MAX)); -+ if (ret == -BCH_ERR_ENOSPC_btree_slot) -+ ret = -BCH_ERR_ENOSPC_subvolume_create; -+ if (ret) -+ return ret; -+ -+ snapshot_subvols[0] = dst_iter.pos.offset; -+ snapshot_subvols[1] = src_subvolid; -+ -+ if (src_subvolid) { -+ /* Creating a snapshot: */ -+ -+ src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, -+ BTREE_ID_subvolumes, POS(0, src_subvolid), -+ BTREE_ITER_CACHED, subvolume); -+ ret = PTR_ERR_OR_ZERO(src_subvol); -+ if (unlikely(ret)) { -+ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, -+ "subvolume %u not found", src_subvolid); -+ goto err; -+ } -+ -+ parent = le32_to_cpu(src_subvol->v.snapshot); -+ } -+ -+ ret = bch2_snapshot_node_create(trans, parent, new_nodes, -+ snapshot_subvols, -+ src_subvolid ? 
2 : 1); -+ if (ret) -+ goto err; -+ -+ if (src_subvolid) { -+ src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); -+ ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); -+ if (ret) -+ goto err; -+ } -+ -+ new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume); -+ ret = PTR_ERR_OR_ZERO(new_subvol); -+ if (ret) -+ goto err; -+ -+ new_subvol->v.flags = 0; -+ new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); -+ new_subvol->v.inode = cpu_to_le64(inode); -+ new_subvol->v.parent = cpu_to_le32(src_subvolid); -+ new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); -+ new_subvol->v.otime.hi = 0; -+ -+ SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); -+ SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); -+ -+ *new_subvolid = new_subvol->k.p.offset; -+ *new_snapshotid = new_nodes[0]; -+err: -+ bch2_trans_iter_exit(trans, &src_iter); -+ bch2_trans_iter_exit(trans, &dst_iter); -+ return ret; -+} -+ -+int bch2_fs_subvolumes_init(struct bch_fs *c) -+{ -+ INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); -+ INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, -+ bch2_subvolume_wait_for_pagecache_and_delete); -+ mutex_init(&c->snapshots_unlinked_lock); -+ return 0; -+} -diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h -new file mode 100644 -index 000000000..8d4c50f4c ---- /dev/null -+++ b/fs/bcachefs/subvolume.h -@@ -0,0 +1,35 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUBVOLUME_H -+#define _BCACHEFS_SUBVOLUME_H -+ -+#include "darray.h" -+#include "subvolume_types.h" -+ -+enum bkey_invalid_flags; -+ -+int bch2_check_subvols(struct bch_fs *); -+ -+int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, -+ unsigned, struct printbuf *); -+void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ -+ .key_invalid = bch2_subvolume_invalid, \ -+ .val_to_text = bch2_subvolume_to_text, \ -+ .min_val_size = 16, \ -+}) -+ -+int bch2_subvolume_get(struct btree_trans *, unsigned, -+ bool, int, struct bch_subvolume *); -+int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); -+ -+int bch2_delete_dead_snapshots(struct bch_fs *); -+void bch2_delete_dead_snapshots_async(struct bch_fs *); -+ -+int bch2_subvolume_unlink(struct btree_trans *, u32); -+int bch2_subvolume_create(struct btree_trans *, u64, u32, -+ u32 *, u32 *, bool); -+ -+int bch2_fs_subvolumes_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_SUBVOLUME_H */ -diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h -new file mode 100644 -index 000000000..86833445a ---- /dev/null -+++ b/fs/bcachefs/subvolume_types.h -@@ -0,0 +1,31 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUBVOLUME_TYPES_H -+#define _BCACHEFS_SUBVOLUME_TYPES_H -+ -+#include "darray.h" -+ -+typedef DARRAY(u32) snapshot_id_list; -+ -+#define IS_ANCESTOR_BITMAP 128 -+ -+struct snapshot_t { -+ u32 parent; -+ u32 skip[3]; -+ u32 depth; -+ u32 children[2]; -+ u32 subvol; /* Nonzero only if a subvolume points to this node: */ -+ u32 tree; -+ u32 equiv; -+ unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; -+}; -+ -+struct snapshot_table { -+ struct snapshot_t s[0]; -+}; -+ -+typedef struct { -+ u32 subvol; -+ u64 inum; -+} subvol_inum; -+ -+#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ -diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c -new file mode 100644 -index 000000000..f01883e78 ---- /dev/null -+++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1265 @@ -+// 
SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "checksum.h" -+#include "counters.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "io.h" -+#include "journal.h" -+#include "journal_sb.h" -+#include "journal_seq_blacklist.h" -+#include "recovery.h" -+#include "replicas.h" -+#include "quota.h" -+#include "sb-clean.h" -+#include "sb-members.h" -+#include "super-io.h" -+#include "super.h" -+#include "trace.h" -+#include "vstructs.h" -+ -+#include -+#include -+ -+static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { -+}; -+ -+struct bch2_metadata_version { -+ u16 version; -+ const char *name; -+ u64 recovery_passes; -+}; -+ -+static const struct bch2_metadata_version bch2_metadata_versions[] = { -+#define x(n, v, _recovery_passes) { \ -+ .version = v, \ -+ .name = #n, \ -+ .recovery_passes = _recovery_passes, \ -+}, -+ BCH_METADATA_VERSIONS() -+#undef x -+}; -+ -+void bch2_version_to_text(struct printbuf *out, unsigned v) -+{ -+ const char *str = "(unknown version)"; -+ -+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) -+ if (bch2_metadata_versions[i].version == v) { -+ str = bch2_metadata_versions[i].name; -+ break; -+ } -+ -+ prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); -+} -+ -+unsigned bch2_latest_compatible_version(unsigned v) -+{ -+ if (!BCH_VERSION_MAJOR(v)) -+ return v; -+ -+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) -+ if (bch2_metadata_versions[i].version > v && -+ BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == -+ BCH_VERSION_MAJOR(v)) -+ v = bch2_metadata_versions[i].version; -+ -+ return v; -+} -+ -+u64 bch2_upgrade_recovery_passes(struct bch_fs *c, -+ unsigned old_version, -+ unsigned new_version) -+{ -+ u64 ret = 0; -+ -+ for (const struct bch2_metadata_version *i = bch2_metadata_versions; -+ i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); -+ i++) -+ if (i->version > old_version && i->version <= new_version) { -+ if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) -+ ret |= bch2_fsck_recovery_passes(); -+ ret |= i->recovery_passes; -+ } -+ -+ return ret &= ~RECOVERY_PASS_ALL_FSCK; -+} -+ -+const char * const bch2_sb_fields[] = { -+#define x(name, nr) #name, -+ BCH_SB_FIELDS() -+#undef x -+ NULL -+}; -+ -+static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, -+ struct printbuf *); -+ -+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, -+ enum bch_sb_field_type type) -+{ -+ struct bch_sb_field *f; -+ -+ /* XXX: need locking around superblock to access optional fields */ -+ -+ vstruct_for_each(sb, f) -+ if (le32_to_cpu(f->type) == type) -+ return f; -+ return NULL; -+} -+ -+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, -+ struct bch_sb_field *f, -+ unsigned u64s) -+{ -+ unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; -+ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; -+ -+ BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); -+ -+ if (!f && !u64s) { -+ /* nothing to do: */ -+ } else if (!f) { -+ f = vstruct_last(sb->sb); -+ memset(f, 0, sizeof(u64) * u64s); -+ f->u64s = cpu_to_le32(u64s); -+ f->type = 0; -+ } else { -+ void *src, *dst; -+ -+ src = vstruct_end(f); -+ -+ if (u64s) { -+ f->u64s = cpu_to_le32(u64s); -+ dst = vstruct_end(f); -+ } else { -+ dst = f; -+ } -+ -+ memmove(dst, src, vstruct_end(sb->sb) - src); -+ -+ if (dst > src) -+ memset(src, 0, dst - src); -+ } -+ -+ sb->sb->u64s = cpu_to_le32(sb_u64s); -+ -+ return u64s ? 
f : NULL; -+} -+ -+void bch2_sb_field_delete(struct bch_sb_handle *sb, -+ enum bch_sb_field_type type) -+{ -+ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); -+ -+ if (f) -+ __bch2_sb_field_resize(sb, f, 0); -+} -+ -+/* Superblock realloc/free: */ -+ -+void bch2_free_super(struct bch_sb_handle *sb) -+{ -+ kfree(sb->bio); -+ if (!IS_ERR_OR_NULL(sb->bdev)) -+ blkdev_put(sb->bdev, sb->holder); -+ kfree(sb->holder); -+ -+ kfree(sb->sb); -+ memset(sb, 0, sizeof(*sb)); -+} -+ -+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) -+{ -+ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); -+ size_t new_buffer_size; -+ struct bch_sb *new_sb; -+ struct bio *bio; -+ -+ if (sb->bdev) -+ new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); -+ -+ new_buffer_size = roundup_pow_of_two(new_bytes); -+ -+ if (sb->sb && sb->buffer_size >= new_buffer_size) -+ return 0; -+ -+ if (sb->have_layout) { -+ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; -+ -+ if (new_bytes > max_bytes) { -+ pr_err("%pg: superblock too big: want %zu but have %llu", -+ sb->bdev, new_bytes, max_bytes); -+ return -BCH_ERR_ENOSPC_sb; -+ } -+ } -+ -+ if (sb->buffer_size >= new_buffer_size && sb->sb) -+ return 0; -+ -+ if (dynamic_fault("bcachefs:add:super_realloc")) -+ return -BCH_ERR_ENOMEM_sb_realloc_injected; -+ -+ new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); -+ if (!new_sb) -+ return -BCH_ERR_ENOMEM_sb_buf_realloc; -+ -+ sb->sb = new_sb; -+ -+ if (sb->have_bio) { -+ unsigned nr_bvecs = buf_pages(sb->sb, new_buffer_size); -+ -+ bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); -+ if (!bio) -+ return -BCH_ERR_ENOMEM_sb_bio_realloc; -+ -+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); -+ -+ kfree(sb->bio); -+ sb->bio = bio; -+ } -+ -+ sb->buffer_size = new_buffer_size; -+ -+ return 0; -+} -+ -+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, -+ enum bch_sb_field_type type, -+ unsigned u64s) -+{ -+ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); -+ ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; -+ ssize_t d = -old_u64s + u64s; -+ -+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) -+ return NULL; -+ -+ if (sb->fs_sb) { -+ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ /* XXX: we're not checking that offline device have enough space */ -+ -+ for_each_online_member(ca, c, i) { -+ struct bch_sb_handle *sb = &ca->disk_sb; -+ -+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { -+ percpu_ref_put(&ca->ref); -+ return NULL; -+ } -+ } -+ } -+ -+ f = bch2_sb_field_get(sb->sb, type); -+ f = __bch2_sb_field_resize(sb, f, u64s); -+ if (f) -+ f->type = cpu_to_le32(type); -+ return f; -+} -+ -+/* Superblock validate: */ -+ -+static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) -+{ -+ u64 offset, prev_offset, max_sectors; -+ unsigned i; -+ -+ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); -+ -+ if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && -+ !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { -+ prt_printf(out, "Not a bcachefs superblock layout"); -+ return -BCH_ERR_invalid_sb_layout; -+ } -+ -+ if (layout->layout_type != 0) { -+ prt_printf(out, "Invalid superblock layout type %u", -+ layout->layout_type); -+ return -BCH_ERR_invalid_sb_layout_type; -+ } -+ -+ if (!layout->nr_superblocks) { -+ prt_printf(out, "Invalid superblock layout: no superblocks"); -+ return -BCH_ERR_invalid_sb_layout_nr_superblocks; -+ } -+ -+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { -+ prt_printf(out, "Invalid superblock layout: too many superblocks"); -+ return -BCH_ERR_invalid_sb_layout_nr_superblocks; -+ } -+ -+ max_sectors = 1 << layout->sb_max_size_bits; -+ -+ prev_offset = le64_to_cpu(layout->sb_offset[0]); -+ -+ for (i = 1; i < layout->nr_superblocks; i++) { -+ offset = le64_to_cpu(layout->sb_offset[i]); -+ -+ if (offset < prev_offset + max_sectors) { -+ prt_printf(out, "Invalid superblock layout: superblocks overlap\n" -+ " (sb %u ends at %llu next starts at %llu", -+ i - 1, prev_offset + max_sectors, offset); -+ return -BCH_ERR_invalid_sb_layout_superblocks_overlap; -+ } -+ prev_offset = offset; -+ } -+ -+ return 0; -+} -+ -+static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) -+{ -+ u16 version = le16_to_cpu(sb->version); -+ u16 version_min = le16_to_cpu(sb->version_min); -+ -+ if (!bch2_version_compatible(version)) { -+ prt_str(out, "Unsupported superblock version "); -+ bch2_version_to_text(out, version); -+ prt_str(out, " (min "); -+ bch2_version_to_text(out, bcachefs_metadata_version_min); -+ prt_str(out, ", max "); -+ bch2_version_to_text(out, bcachefs_metadata_version_current); -+ prt_str(out, ")"); -+ return -BCH_ERR_invalid_sb_version; -+ } -+ -+ if (!bch2_version_compatible(version_min)) { -+ prt_str(out, "Unsupported superblock version_min "); -+ bch2_version_to_text(out, version_min); -+ prt_str(out, " (min "); -+ bch2_version_to_text(out, bcachefs_metadata_version_min); -+ prt_str(out, ", max "); -+ bch2_version_to_text(out, bcachefs_metadata_version_current); -+ prt_str(out, ")"); -+ return -BCH_ERR_invalid_sb_version; -+ } -+ -+ if (version_min > version) { -+ prt_str(out, "Bad minimum version "); -+ bch2_version_to_text(out, version_min); -+ prt_str(out, ", greater than version field "); -+ bch2_version_to_text(out, version); -+ return -BCH_ERR_invalid_sb_version; -+ } -+ -+ return 0; -+} -+ -+static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, -+ int rw) -+{ -+ 
struct bch_sb *sb = disk_sb->sb; -+ struct bch_sb_field *f; -+ struct bch_sb_field_members *mi; -+ enum bch_opt_id opt_id; -+ u16 block_size; -+ int ret; -+ -+ ret = bch2_sb_compatible(sb, out); -+ if (ret) -+ return ret; -+ -+ if (sb->features[1] || -+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { -+ prt_printf(out, "Filesystem has incompatible features"); -+ return -BCH_ERR_invalid_sb_features; -+ } -+ -+ block_size = le16_to_cpu(sb->block_size); -+ -+ if (block_size > PAGE_SECTORS) { -+ prt_printf(out, "Block size too big (got %u, max %u)", -+ block_size, PAGE_SECTORS); -+ return -BCH_ERR_invalid_sb_block_size; -+ } -+ -+ if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { -+ prt_printf(out, "Bad user UUID (got zeroes)"); -+ return -BCH_ERR_invalid_sb_uuid; -+ } -+ -+ if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { -+ prt_printf(out, "Bad intenal UUID (got zeroes)"); -+ return -BCH_ERR_invalid_sb_uuid; -+ } -+ -+ if (!sb->nr_devices || -+ sb->nr_devices > BCH_SB_MEMBERS_MAX) { -+ prt_printf(out, "Bad number of member devices %u (max %u)", -+ sb->nr_devices, BCH_SB_MEMBERS_MAX); -+ return -BCH_ERR_invalid_sb_too_many_members; -+ } -+ -+ if (sb->dev_idx >= sb->nr_devices) { -+ prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", -+ sb->dev_idx, sb->nr_devices); -+ return -BCH_ERR_invalid_sb_dev_idx; -+ } -+ -+ if (!sb->time_precision || -+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { -+ prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", -+ le32_to_cpu(sb->time_precision), NSEC_PER_SEC); -+ return -BCH_ERR_invalid_sb_time_precision; -+ } -+ -+ if (rw == READ) { -+ /* -+ * Been seeing a bug where these are getting inexplicably -+ * zeroed, so we're now validating them, but we have to be -+ * careful not to preven people's filesystems from mounting: -+ */ -+ if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) -+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); -+ if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) -+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); -+ -+ if (!BCH_SB_VERSION_UPGRADE_COMPLETE(sb)) -+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(sb, le16_to_cpu(sb->version)); -+ } -+ -+ for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { -+ const struct bch_option *opt = bch2_opt_table + opt_id; -+ -+ if (opt->get_sb != BCH2_NO_SB_OPT) { -+ u64 v = bch2_opt_from_sb(sb, opt_id); -+ -+ prt_printf(out, "Invalid option "); -+ ret = bch2_opt_validate(opt, v, out); -+ if (ret) -+ return ret; -+ -+ printbuf_reset(out); -+ } -+ } -+ -+ /* validate layout */ -+ ret = validate_sb_layout(&sb->layout, out); -+ if (ret) -+ return ret; -+ -+ vstruct_for_each(sb, f) { -+ if (!f->u64s) { -+ prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", -+ le32_to_cpu(f->type)); -+ return -BCH_ERR_invalid_sb_field_size; -+ } -+ -+ if (vstruct_next(f) > vstruct_last(sb)) { -+ prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", -+ le32_to_cpu(f->type)); -+ return -BCH_ERR_invalid_sb_field_size; -+ } -+ } -+ -+ /* members must be validated first: */ -+ mi = bch2_sb_get_members(sb); -+ if (!mi) { -+ prt_printf(out, "Invalid superblock: member info area missing"); -+ return -BCH_ERR_invalid_sb_members_missing; -+ } -+ -+ ret = bch2_sb_field_validate(sb, &mi->field, out); -+ if (ret) -+ return ret; -+ -+ vstruct_for_each(sb, f) { -+ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) -+ continue; -+ -+ ret = bch2_sb_field_validate(sb, f, out); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* device open: */ -+ -+static void 
bch2_sb_update(struct bch_fs *c) -+{ -+ struct bch_sb *src = c->disk_sb.sb; -+ struct bch_sb_field_members *mi = bch2_sb_get_members(src); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ c->sb.uuid = src->uuid; -+ c->sb.user_uuid = src->user_uuid; -+ c->sb.version = le16_to_cpu(src->version); -+ c->sb.version_min = le16_to_cpu(src->version_min); -+ c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src); -+ c->sb.nr_devices = src->nr_devices; -+ c->sb.clean = BCH_SB_CLEAN(src); -+ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); -+ -+ c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); -+ c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; -+ -+ /* XXX this is wrong, we need a 96 or 128 bit integer type */ -+ c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), -+ c->sb.nsec_per_time_unit); -+ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); -+ -+ c->sb.features = le64_to_cpu(src->features[0]); -+ c->sb.compat = le64_to_cpu(src->compat[0]); -+ -+ for_each_member_device(ca, c, i) -+ ca->mi = bch2_mi_to_cpu(mi->members + i); -+} -+ -+static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) -+{ -+ struct bch_sb_field *src_f, *dst_f; -+ struct bch_sb *dst = dst_handle->sb; -+ unsigned i; -+ -+ dst->version = src->version; -+ dst->version_min = src->version_min; -+ dst->seq = src->seq; -+ dst->uuid = src->uuid; -+ dst->user_uuid = src->user_uuid; -+ memcpy(dst->label, src->label, sizeof(dst->label)); -+ -+ dst->block_size = src->block_size; -+ dst->nr_devices = src->nr_devices; -+ -+ dst->time_base_lo = src->time_base_lo; -+ dst->time_base_hi = src->time_base_hi; -+ dst->time_precision = src->time_precision; -+ -+ memcpy(dst->flags, src->flags, sizeof(dst->flags)); -+ memcpy(dst->features, src->features, sizeof(dst->features)); -+ memcpy(dst->compat, src->compat, sizeof(dst->compat)); -+ -+ for (i = 0; i < BCH_SB_FIELD_NR; i++) { -+ int d; -+ -+ if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) -+ continue; -+ -+ src_f = bch2_sb_field_get(src, i); -+ dst_f = bch2_sb_field_get(dst, i); -+ -+ d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - -+ (dst_f ? le32_to_cpu(dst_f->u64s) : 0); -+ if (d > 0) { -+ int ret = bch2_sb_realloc(dst_handle, -+ le32_to_cpu(dst_handle->sb->u64s) + d); -+ -+ if (ret) -+ return ret; -+ -+ dst = dst_handle->sb; -+ dst_f = bch2_sb_field_get(dst, i); -+ } -+ -+ dst_f = __bch2_sb_field_resize(dst_handle, dst_f, -+ src_f ? 
le32_to_cpu(src_f->u64s) : 0); -+ -+ if (src_f) -+ memcpy(dst_f, src_f, vstruct_bytes(src_f)); -+ } -+ -+ return 0; -+} -+ -+int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) -+{ -+ int ret; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ ret = bch2_sb_realloc(&c->disk_sb, 0) ?: -+ __copy_super(&c->disk_sb, src) ?: -+ bch2_sb_replicas_to_cpu_replicas(c) ?: -+ bch2_sb_disk_groups_to_cpu(c); -+ if (ret) -+ return ret; -+ -+ bch2_sb_update(c); -+ return 0; -+} -+ -+int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) -+{ -+ return __copy_super(&ca->disk_sb, c->disk_sb.sb); -+} -+ -+/* read superblock: */ -+ -+static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) -+{ -+ struct bch_csum csum; -+ size_t bytes; -+ int ret; -+reread: -+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); -+ sb->bio->bi_iter.bi_sector = offset; -+ bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); -+ -+ ret = submit_bio_wait(sb->bio); -+ if (ret) { -+ prt_printf(err, "IO error: %i", ret); -+ return ret; -+ } -+ -+ if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && -+ !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { -+ prt_printf(err, "Not a bcachefs superblock"); -+ return -BCH_ERR_invalid_sb_magic; -+ } -+ -+ ret = bch2_sb_compatible(sb->sb, err); -+ if (ret) -+ return ret; -+ -+ bytes = vstruct_bytes(sb->sb); -+ -+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { -+ prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", -+ bytes, 512UL << sb->sb->layout.sb_max_size_bits); -+ return -BCH_ERR_invalid_sb_too_big; -+ } -+ -+ if (bytes > sb->buffer_size) { -+ ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)); -+ if (ret) -+ return ret; -+ goto reread; -+ } -+ -+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { -+ prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); -+ return -BCH_ERR_invalid_sb_csum_type; -+ } -+ -+ /* XXX: verify MACs */ -+ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), -+ null_nonce(), sb->sb); -+ -+ if (bch2_crc_cmp(csum, sb->sb->csum)) { -+ prt_printf(err, "bad checksum"); -+ return -BCH_ERR_invalid_sb_csum; -+ } -+ -+ sb->seq = le64_to_cpu(sb->sb->seq); -+ -+ return 0; -+} -+ -+int bch2_read_super(const char *path, struct bch_opts *opts, -+ struct bch_sb_handle *sb) -+{ -+ u64 offset = opt_get(*opts, sb); -+ struct bch_sb_layout layout; -+ struct printbuf err = PRINTBUF; -+ __le64 *i; -+ int ret; -+#ifndef __KERNEL__ -+retry: -+#endif -+ memset(sb, 0, sizeof(*sb)); -+ sb->mode = BLK_OPEN_READ; -+ sb->have_bio = true; -+ sb->holder = kmalloc(1, GFP_KERNEL); -+ if (!sb->holder) -+ return -ENOMEM; -+ -+#ifndef __KERNEL__ -+ if (opt_get(*opts, direct_io) == false) -+ sb->mode |= FMODE_BUFFERED; -+#endif -+ -+ if (!opt_get(*opts, noexcl)) -+ sb->mode |= BLK_OPEN_EXCL; -+ -+ if (!opt_get(*opts, nochanges)) -+ sb->mode |= BLK_OPEN_WRITE; -+ -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); -+ if (IS_ERR(sb->bdev) && -+ PTR_ERR(sb->bdev) == -EACCES && -+ opt_get(*opts, read_only)) { -+ sb->mode &= ~BLK_OPEN_WRITE; -+ -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops); -+ if (!IS_ERR(sb->bdev)) -+ opt_set(*opts, nochanges, true); -+ } -+ -+ if (IS_ERR(sb->bdev)) { -+ ret = PTR_ERR(sb->bdev); -+ goto out; -+ } -+ -+ ret = bch2_sb_realloc(sb, 0); -+ if (ret) { -+ prt_printf(&err, "error allocating memory for superblock"); -+ goto err; -+ } -+ -+ if (bch2_fs_init_fault("read_super")) { -+ prt_printf(&err, "dynamic fault"); -+ ret = -EFAULT; 
-+ goto err; -+ } -+ -+ ret = read_one_super(sb, offset, &err); -+ if (!ret) -+ goto got_super; -+ -+ if (opt_defined(*opts, sb)) -+ goto err; -+ -+ printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", -+ path, err.buf); -+ printbuf_reset(&err); -+ -+ /* -+ * Error reading primary superblock - read location of backup -+ * superblocks: -+ */ -+ bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); -+ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; -+ /* -+ * use sb buffer to read layout, since sb buffer is page aligned but -+ * layout won't be: -+ */ -+ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); -+ -+ ret = submit_bio_wait(sb->bio); -+ if (ret) { -+ prt_printf(&err, "IO error: %i", ret); -+ goto err; -+ } -+ -+ memcpy(&layout, sb->sb, sizeof(layout)); -+ ret = validate_sb_layout(&layout, &err); -+ if (ret) -+ goto err; -+ -+ for (i = layout.sb_offset; -+ i < layout.sb_offset + layout.nr_superblocks; i++) { -+ offset = le64_to_cpu(*i); -+ -+ if (offset == opt_get(*opts, sb)) -+ continue; -+ -+ ret = read_one_super(sb, offset, &err); -+ if (!ret) -+ goto got_super; -+ } -+ -+ goto err; -+ -+got_super: -+ if (le16_to_cpu(sb->sb->block_size) << 9 < -+ bdev_logical_block_size(sb->bdev) && -+ opt_get(*opts, direct_io)) { -+#ifndef __KERNEL__ -+ opt_set(*opts, direct_io, false); -+ bch2_free_super(sb); -+ goto retry; -+#endif -+ prt_printf(&err, "block size (%u) smaller than device block size (%u)", -+ le16_to_cpu(sb->sb->block_size) << 9, -+ bdev_logical_block_size(sb->bdev)); -+ ret = -BCH_ERR_block_size_too_small; -+ goto err; -+ } -+ -+ ret = 0; -+ sb->have_layout = true; -+ -+ ret = bch2_sb_validate(sb, &err, READ); -+ if (ret) { -+ printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", -+ path, err.buf); -+ goto err_no_print; -+ } -+out: -+ printbuf_exit(&err); -+ return ret; -+err: -+ printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", -+ path, err.buf); -+err_no_print: -+ bch2_free_super(sb); -+ goto out; -+} -+ -+/* write superblock: */ -+ -+static void write_super_endio(struct bio *bio) -+{ -+ struct bch_dev *ca = bio->bi_private; -+ -+ /* XXX: return errors directly */ -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", -+ bch2_blk_status_to_str(bio->bi_status))) -+ ca->sb_write_error = 1; -+ -+ closure_put(&ca->fs->sb_write); -+ percpu_ref_put(&ca->io_ref); -+} -+ -+static void read_back_super(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bch_sb *sb = ca->disk_sb.sb; -+ struct bio *bio = ca->disk_sb.bio; -+ -+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); -+ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); -+ bio->bi_end_io = write_super_endio; -+ bio->bi_private = ca; -+ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); -+ -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], -+ bio_sectors(bio)); -+ -+ percpu_ref_get(&ca->io_ref); -+ closure_bio_submit(bio, &c->sb_write); -+} -+ -+static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) -+{ -+ struct bch_sb *sb = ca->disk_sb.sb; -+ struct bio *bio = ca->disk_sb.bio; -+ -+ sb->offset = sb->layout.sb_offset[idx]; -+ -+ SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); -+ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), -+ null_nonce(), sb); -+ -+ bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); -+ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); -+ bio->bi_end_io = write_super_endio; -+ bio->bi_private = ca; -+ 
bch2_bio_map(bio, sb, -+ roundup((size_t) vstruct_bytes(sb), -+ bdev_logical_block_size(ca->disk_sb.bdev))); -+ -+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], -+ bio_sectors(bio)); -+ -+ percpu_ref_get(&ca->io_ref); -+ closure_bio_submit(bio, &c->sb_write); -+} -+ -+int bch2_write_super(struct bch_fs *c) -+{ -+ struct closure *cl = &c->sb_write; -+ struct bch_dev *ca; -+ struct printbuf err = PRINTBUF; -+ unsigned i, sb = 0, nr_wrote; -+ struct bch_devs_mask sb_written; -+ bool wrote, can_mount_without_written, can_mount_with_written; -+ unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; -+ int ret = 0; -+ -+ trace_and_count(c, write_super, c, _RET_IP_); -+ -+ if (c->opts.very_degraded) -+ degraded_flags |= BCH_FORCE_IF_LOST; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ closure_init_stack(cl); -+ memset(&sb_written, 0, sizeof(sb_written)); -+ -+ /* Make sure we're using the new magic numbers: */ -+ c->disk_sb.sb->magic = BCHFS_MAGIC; -+ c->disk_sb.sb->layout.magic = BCHFS_MAGIC; -+ -+ le64_add_cpu(&c->disk_sb.sb->seq, 1); -+ -+ if (test_bit(BCH_FS_ERROR, &c->flags)) -+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); -+ if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) -+ SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); -+ -+ SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); -+ -+ bch2_sb_counters_from_cpu(c); -+ -+ for_each_online_member(ca, c, i) -+ bch2_sb_from_fs(c, ca); -+ -+ for_each_online_member(ca, c, i) { -+ printbuf_reset(&err); -+ -+ ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); -+ if (ret) { -+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); -+ percpu_ref_put(&ca->io_ref); -+ goto out; -+ } -+ } -+ -+ if (c->opts.nochanges) -+ goto out; -+ -+ /* -+ * Defer writing the superblock until filesystem initialization is -+ * complete - don't write out a partly initialized superblock: -+ */ -+ if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) -+ goto out; -+ -+ for_each_online_member(ca, c, i) { -+ __set_bit(ca->dev_idx, sb_written.d); -+ ca->sb_write_error = 0; -+ } -+ -+ for_each_online_member(ca, c, i) -+ read_back_super(c, ca); -+ closure_sync(cl); -+ -+ for_each_online_member(ca, c, i) { -+ if (ca->sb_write_error) -+ continue; -+ -+ if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { -+ bch2_fs_fatal_error(c, -+ "Superblock write was silently dropped! 
(seq %llu expected %llu)", -+ le64_to_cpu(ca->sb_read_scratch->seq), -+ ca->disk_sb.seq); -+ percpu_ref_put(&ca->io_ref); -+ ret = -BCH_ERR_erofs_sb_err; -+ goto out; -+ } -+ -+ if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { -+ bch2_fs_fatal_error(c, -+ "Superblock modified by another process (seq %llu expected %llu)", -+ le64_to_cpu(ca->sb_read_scratch->seq), -+ ca->disk_sb.seq); -+ percpu_ref_put(&ca->io_ref); -+ ret = -BCH_ERR_erofs_sb_err; -+ goto out; -+ } -+ } -+ -+ do { -+ wrote = false; -+ for_each_online_member(ca, c, i) -+ if (!ca->sb_write_error && -+ sb < ca->disk_sb.sb->layout.nr_superblocks) { -+ write_one_super(c, ca, sb); -+ wrote = true; -+ } -+ closure_sync(cl); -+ sb++; -+ } while (wrote); -+ -+ for_each_online_member(ca, c, i) { -+ if (ca->sb_write_error) -+ __clear_bit(ca->dev_idx, sb_written.d); -+ else -+ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); -+ } -+ -+ nr_wrote = dev_mask_nr(&sb_written); -+ -+ can_mount_with_written = -+ bch2_have_enough_devs(c, sb_written, degraded_flags, false); -+ -+ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) -+ sb_written.d[i] = ~sb_written.d[i]; -+ -+ can_mount_without_written = -+ bch2_have_enough_devs(c, sb_written, degraded_flags, false); -+ -+ /* -+ * If we would be able to mount _without_ the devices we successfully -+ * wrote superblocks to, we weren't able to write to enough devices: -+ * -+ * Exception: if we can mount without the successes because we haven't -+ * written anything (new filesystem), we continue if we'd be able to -+ * mount with the devices we did successfully write to: -+ */ -+ if (bch2_fs_fatal_err_on(!nr_wrote || -+ !can_mount_with_written || -+ (can_mount_without_written && -+ !can_mount_with_written), c, -+ "Unable to write superblock to sufficient devices (from %ps)", -+ (void *) _RET_IP_)) -+ ret = -1; -+out: -+ /* Make new options visible after they're persistent: */ -+ bch2_sb_update(c); -+ printbuf_exit(&err); -+ return ret; -+} -+ -+void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) -+{ -+ mutex_lock(&c->sb_lock); -+ if (!(c->sb.features & (1ULL << feat))) { -+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); -+ -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+} -+ -+/* Downgrade if superblock is at a higher version than currently supported: */ -+void bch2_sb_maybe_downgrade(struct bch_fs *c) -+{ -+ lockdep_assert_held(&c->sb_lock); -+ -+ /* -+ * Downgrade, if superblock is at a higher version than currently -+ * supported: -+ */ -+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) -+ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); -+ if (c->sb.version > bcachefs_metadata_version_current) -+ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); -+ if (c->sb.version_min > bcachefs_metadata_version_current) -+ c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); -+ c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); -+} -+ -+void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) -+{ -+ lockdep_assert_held(&c->sb_lock); -+ -+ c->disk_sb.sb->version = cpu_to_le16(new_version); -+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); -+} -+ -+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { -+#define x(f, nr) \ -+ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, -+ BCH_SB_FIELDS() -+#undef x -+}; -+ -+static const struct bch_sb_field_ops bch2_sb_field_null_ops; -+ -+static const struct 
bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) -+{ -+ return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) -+ ? bch2_sb_field_ops[type] -+ : &bch2_sb_field_null_ops; -+} -+ -+static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, -+ struct printbuf *err) -+{ -+ unsigned type = le32_to_cpu(f->type); -+ struct printbuf field_err = PRINTBUF; -+ const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); -+ int ret; -+ -+ ret = ops->validate ? ops->validate(sb, f, &field_err) : 0; -+ if (ret) { -+ prt_printf(err, "Invalid superblock section %s: %s", -+ bch2_sb_fields[type], field_err.buf); -+ prt_newline(err); -+ bch2_sb_field_to_text(err, sb, f); -+ } -+ -+ printbuf_exit(&field_err); -+ return ret; -+} -+ -+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ unsigned type = le32_to_cpu(f->type); -+ const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); -+ -+ if (!out->nr_tabstops) -+ printbuf_tabstop_push(out, 32); -+ -+ if (type < BCH_SB_FIELD_NR) -+ prt_printf(out, "%s", bch2_sb_fields[type]); -+ else -+ prt_printf(out, "(unknown field %u)", type); -+ -+ prt_printf(out, " (size %zu):", vstruct_bytes(f)); -+ prt_newline(out); -+ -+ if (ops->to_text) { -+ printbuf_indent_add(out, 2); -+ ops->to_text(out, sb, f); -+ printbuf_indent_sub(out, 2); -+ } -+} -+ -+void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) -+{ -+ unsigned i; -+ -+ prt_printf(out, "Type: %u", l->layout_type); -+ prt_newline(out); -+ -+ prt_str(out, "Superblock max size: "); -+ prt_units_u64(out, 512 << l->sb_max_size_bits); -+ prt_newline(out); -+ -+ prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); -+ prt_newline(out); -+ -+ prt_str(out, "Offsets: "); -+ for (i = 0; i < l->nr_superblocks; i++) { -+ if (i) -+ prt_str(out, ", "); -+ prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); -+ } -+ prt_newline(out); -+} -+ -+void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, -+ bool print_layout, unsigned fields) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_sb_field *f; -+ u64 fields_have = 0; -+ unsigned nr_devices = 0; -+ -+ if (!out->nr_tabstops) -+ printbuf_tabstop_push(out, 44); -+ -+ mi = bch2_sb_get_members(sb); -+ if (mi) { -+ struct bch_member *m; -+ -+ for (m = mi->members; -+ m < mi->members + sb->nr_devices; -+ m++) -+ nr_devices += bch2_member_exists(m); -+ } -+ -+ prt_printf(out, "External UUID:"); -+ prt_tab(out); -+ pr_uuid(out, sb->user_uuid.b); -+ prt_newline(out); -+ -+ prt_printf(out, "Internal UUID:"); -+ prt_tab(out); -+ pr_uuid(out, sb->uuid.b); -+ prt_newline(out); -+ -+ prt_str(out, "Device index:"); -+ prt_tab(out); -+ prt_printf(out, "%u", sb->dev_idx); -+ prt_newline(out); -+ -+ prt_str(out, "Label:"); -+ prt_tab(out); -+ prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); -+ prt_newline(out); -+ -+ prt_str(out, "Version:"); -+ prt_tab(out); -+ bch2_version_to_text(out, le16_to_cpu(sb->version)); -+ prt_newline(out); -+ -+ prt_str(out, "Version upgrade complete:"); -+ prt_tab(out); -+ bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); -+ prt_newline(out); -+ -+ prt_printf(out, "Oldest version on disk:"); -+ prt_tab(out); -+ bch2_version_to_text(out, le16_to_cpu(sb->version_min)); -+ prt_newline(out); -+ -+ prt_printf(out, "Created:"); -+ prt_tab(out); -+ if (sb->time_base_lo) -+ pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); -+ else -+ prt_printf(out, "(not set)"); -+ prt_newline(out); -+ -+ prt_printf(out, 
"Sequence number:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", le64_to_cpu(sb->seq)); -+ prt_newline(out); -+ -+ prt_printf(out, "Superblock size:"); -+ prt_tab(out); -+ prt_printf(out, "%zu", vstruct_bytes(sb)); -+ prt_newline(out); -+ -+ prt_printf(out, "Clean:"); -+ prt_tab(out); -+ prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); -+ prt_newline(out); -+ -+ prt_printf(out, "Devices:"); -+ prt_tab(out); -+ prt_printf(out, "%u", nr_devices); -+ prt_newline(out); -+ -+ prt_printf(out, "Sections:"); -+ vstruct_for_each(sb, f) -+ fields_have |= 1 << le32_to_cpu(f->type); -+ prt_tab(out); -+ prt_bitflags(out, bch2_sb_fields, fields_have); -+ prt_newline(out); -+ -+ prt_printf(out, "Features:"); -+ prt_tab(out); -+ prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); -+ prt_newline(out); -+ -+ prt_printf(out, "Compat features:"); -+ prt_tab(out); -+ prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); -+ prt_newline(out); -+ -+ prt_newline(out); -+ prt_printf(out, "Options:"); -+ prt_newline(out); -+ printbuf_indent_add(out, 2); -+ { -+ enum bch_opt_id id; -+ -+ for (id = 0; id < bch2_opts_nr; id++) { -+ const struct bch_option *opt = bch2_opt_table + id; -+ -+ if (opt->get_sb != BCH2_NO_SB_OPT) { -+ u64 v = bch2_opt_from_sb(sb, id); -+ -+ prt_printf(out, "%s:", opt->attr.name); -+ prt_tab(out); -+ bch2_opt_to_text(out, NULL, sb, opt, v, -+ OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); -+ prt_newline(out); -+ } -+ } -+ } -+ -+ printbuf_indent_sub(out, 2); -+ -+ if (print_layout) { -+ prt_newline(out); -+ prt_printf(out, "layout:"); -+ prt_newline(out); -+ printbuf_indent_add(out, 2); -+ bch2_sb_layout_to_text(out, &sb->layout); -+ printbuf_indent_sub(out, 2); -+ } -+ -+ vstruct_for_each(sb, f) -+ if (fields & (1 << le32_to_cpu(f->type))) { -+ prt_newline(out); -+ bch2_sb_field_to_text(out, sb, f); -+ } -+} -diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h -new file mode 100644 -index 000000000..d51c0a195 ---- /dev/null -+++ b/fs/bcachefs/super-io.h -@@ -0,0 +1,133 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_IO_H -+#define _BCACHEFS_SUPER_IO_H -+ -+#include "extents.h" -+#include "eytzinger.h" -+#include "super_types.h" -+#include "super.h" -+ -+#include -+ -+static inline bool bch2_version_compatible(u16 version) -+{ -+ return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && -+ version >= bcachefs_metadata_version_min; -+} -+ -+void bch2_version_to_text(struct printbuf *, unsigned); -+unsigned bch2_latest_compatible_version(unsigned); -+ -+u64 bch2_upgrade_recovery_passes(struct bch_fs *c, -+ unsigned, -+ unsigned); -+ -+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); -+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, -+ enum bch_sb_field_type, unsigned); -+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); -+ -+#define field_to_type(_f, _name) \ -+ container_of_or_null(_f, struct bch_sb_field_##_name, field) -+ -+#define x(_name, _nr) \ -+static inline struct bch_sb_field_##_name * \ -+bch2_sb_get_##_name(struct bch_sb *sb) \ -+{ \ -+ return field_to_type(bch2_sb_field_get(sb, \ -+ BCH_SB_FIELD_##_name), _name); \ -+} \ -+ \ -+static inline struct bch_sb_field_##_name * \ -+bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ -+{ \ -+ return field_to_type(bch2_sb_field_resize(sb, \ -+ BCH_SB_FIELD_##_name, u64s), _name); \ -+} -+ -+BCH_SB_FIELDS() -+#undef x -+ -+extern const char * const bch2_sb_fields[]; -+ -+struct 
bch_sb_field_ops { -+ int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); -+ void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); -+}; -+ -+static inline __le64 bch2_sb_magic(struct bch_fs *c) -+{ -+ __le64 ret; -+ -+ memcpy(&ret, &c->sb.uuid, sizeof(ret)); -+ return ret; -+} -+ -+static inline __u64 jset_magic(struct bch_fs *c) -+{ -+ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); -+} -+ -+static inline __u64 bset_magic(struct bch_fs *c) -+{ -+ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); -+} -+ -+int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); -+int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); -+ -+void bch2_free_super(struct bch_sb_handle *); -+int bch2_sb_realloc(struct bch_sb_handle *, unsigned); -+ -+int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); -+int bch2_write_super(struct bch_fs *); -+void __bch2_check_set_feature(struct bch_fs *, unsigned); -+ -+static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) -+{ -+ if (!(c->sb.features & (1ULL << feat))) -+ __bch2_check_set_feature(c, feat); -+} -+ -+/* BCH_SB_FIELD_members: */ -+ -+static inline bool bch2_member_exists(struct bch_member *m) -+{ -+ return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); -+} -+ -+static inline bool bch2_dev_exists(struct bch_sb *sb, -+ struct bch_sb_field_members *mi, -+ unsigned dev) -+{ -+ return dev < sb->nr_devices && -+ bch2_member_exists(&mi->members[dev]); -+} -+ -+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) -+{ -+ return (struct bch_member_cpu) { -+ .nbuckets = le64_to_cpu(mi->nbuckets), -+ .first_bucket = le16_to_cpu(mi->first_bucket), -+ .bucket_size = le16_to_cpu(mi->bucket_size), -+ .group = BCH_MEMBER_GROUP(mi), -+ .state = BCH_MEMBER_STATE(mi), -+ .discard = BCH_MEMBER_DISCARD(mi), -+ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), -+ .durability = BCH_MEMBER_DURABILITY(mi) -+ ? BCH_MEMBER_DURABILITY(mi) - 1 -+ : 1, -+ .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), -+ .valid = bch2_member_exists(mi), -+ }; -+} -+ -+void bch2_sb_maybe_downgrade(struct bch_fs *); -+void bch2_sb_upgrade(struct bch_fs *, unsigned); -+ -+void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, -+ struct bch_sb_field *); -+void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); -+void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); -+ -+#endif /* _BCACHEFS_SUPER_IO_H */ -diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c -new file mode 100644 -index 000000000..604248659 ---- /dev/null -+++ b/fs/bcachefs/super.c -@@ -0,0 +1,2015 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcachefs setup/teardown code, and some metadata io - read a superblock and -+ * figure out what to do with it. -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "bkey_sort.h" -+#include "btree_cache.h" -+#include "btree_gc.h" -+#include "btree_journal_iter.h" -+#include "btree_key_cache.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "btree_write_buffer.h" -+#include "buckets_waiting_for_journal.h" -+#include "chardev.h" -+#include "checksum.h" -+#include "clock.h" -+#include "compress.h" -+#include "counters.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "errcode.h" -+#include "error.h" -+#include "fs.h" -+#include "fs-io.h" -+#include "fs-io-buffered.h" -+#include "fs-io-direct.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "move.h" -+#include "migrate.h" -+#include "movinggc.h" -+#include "nocow_locking.h" -+#include "quota.h" -+#include "rebalance.h" -+#include "recovery.h" -+#include "replicas.h" -+#include "sb-clean.h" -+#include "snapshot.h" -+#include "subvolume.h" -+#include "super.h" -+#include "super-io.h" -+#include "sysfs.h" -+#include "trace.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+MODULE_LICENSE("GPL"); -+MODULE_AUTHOR("Kent Overstreet "); -+ -+#define KTYPE(type) \ -+static const struct attribute_group type ## _group = { \ -+ .attrs = type ## _files \ -+}; \ -+ \ -+static const struct attribute_group *type ## _groups[] = { \ -+ &type ## _group, \ -+ NULL \ -+}; \ -+ \ -+static const struct kobj_type type ## _ktype = { \ -+ .release = type ## _release, \ -+ .sysfs_ops = &type ## _sysfs_ops, \ -+ .default_groups = type ## _groups \ -+} -+ -+static void bch2_fs_release(struct kobject *); -+static void bch2_dev_release(struct kobject *); -+static void bch2_fs_counters_release(struct kobject *k) -+{ -+} -+ -+static void bch2_fs_internal_release(struct kobject *k) -+{ -+} -+ -+static void bch2_fs_opts_dir_release(struct kobject *k) -+{ -+} -+ -+static void bch2_fs_time_stats_release(struct kobject *k) -+{ -+} -+ -+KTYPE(bch2_fs); -+KTYPE(bch2_fs_counters); -+KTYPE(bch2_fs_internal); -+KTYPE(bch2_fs_opts_dir); -+KTYPE(bch2_fs_time_stats); -+KTYPE(bch2_dev); -+ -+static struct kset *bcachefs_kset; -+static LIST_HEAD(bch_fs_list); -+static DEFINE_MUTEX(bch_fs_list_lock); -+ -+DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); -+ -+static void bch2_dev_free(struct bch_dev *); -+static int bch2_dev_alloc(struct bch_fs *, unsigned); -+static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); -+static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); -+ -+struct bch_fs *bch2_dev_to_fs(dev_t dev) -+{ -+ struct bch_fs *c; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ mutex_lock(&bch_fs_list_lock); -+ rcu_read_lock(); -+ -+ list_for_each_entry(c, &bch_fs_list, list) -+ for_each_member_device_rcu(ca, c, i, NULL) -+ if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { -+ closure_get(&c->cl); -+ goto found; -+ } -+ c = NULL; -+found: -+ rcu_read_unlock(); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return c; -+} -+ -+static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) -+{ -+ struct bch_fs *c; -+ -+ lockdep_assert_held(&bch_fs_list_lock); -+ -+ list_for_each_entry(c, &bch_fs_list, list) -+ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid))) -+ return c; -+ -+ return NULL; -+} -+ -+struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) -+{ -+ struct bch_fs *c; -+ -+ 
mutex_lock(&bch_fs_list_lock); -+ c = __bch2_uuid_to_fs(uuid); -+ if (c) -+ closure_get(&c->cl); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return c; -+} -+ -+static void bch2_dev_usage_journal_reserve(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i, nr = 0, u64s = -+ ((sizeof(struct jset_entry_dev_usage) + -+ sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / -+ sizeof(u64); -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, NULL) -+ nr++; -+ rcu_read_unlock(); -+ -+ bch2_journal_entry_res_resize(&c->journal, -+ &c->dev_usage_journal_res, u64s * nr); -+} -+ -+/* Filesystem RO/RW: */ -+ -+/* -+ * For startup/shutdown of RW stuff, the dependencies are: -+ * -+ * - foreground writes depend on copygc and rebalance (to free up space) -+ * -+ * - copygc and rebalance depend on mark and sweep gc (they actually probably -+ * don't because they either reserve ahead of time or don't block if -+ * allocations fail, but allocations can require mark and sweep gc to run -+ * because of generation number wraparound) -+ * -+ * - all of the above depends on the allocator threads -+ * -+ * - allocator depends on the journal (when it rewrites prios and gens) -+ */ -+ -+static void __bch2_fs_read_only(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i, clean_passes = 0; -+ u64 seq = 0; -+ -+ bch2_fs_ec_stop(c); -+ bch2_open_buckets_stop(c, NULL, true); -+ bch2_rebalance_stop(c); -+ bch2_copygc_stop(c); -+ bch2_gc_thread_stop(c); -+ bch2_fs_ec_flush(c); -+ -+ bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", -+ journal_cur_seq(&c->journal)); -+ -+ do { -+ clean_passes++; -+ -+ if (bch2_btree_interior_updates_flush(c) || -+ bch2_journal_flush_all_pins(&c->journal) || -+ bch2_btree_flush_all_writes(c) || -+ seq != atomic64_read(&c->journal.seq)) { -+ seq = atomic64_read(&c->journal.seq); -+ clean_passes = 0; -+ } -+ } while (clean_passes < 2); -+ -+ bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", -+ journal_cur_seq(&c->journal)); -+ -+ if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && -+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) -+ set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); -+ bch2_fs_journal_stop(&c->journal); -+ -+ /* -+ * After stopping journal: -+ */ -+ for_each_member_device(ca, c, i) -+ bch2_dev_allocator_remove(c, ca); -+} -+ -+#ifndef BCH_WRITE_REF_DEBUG -+static void bch2_writes_disabled(struct percpu_ref *writes) -+{ -+ struct bch_fs *c = container_of(writes, struct bch_fs, writes); -+ -+ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); -+ wake_up(&bch2_read_only_wait); -+} -+#endif -+ -+void bch2_fs_read_only(struct bch_fs *c) -+{ -+ if (!test_bit(BCH_FS_RW, &c->flags)) { -+ bch2_journal_reclaim_stop(&c->journal); -+ return; -+ } -+ -+ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -+ -+ /* -+ * Block new foreground-end write operations from starting - any new -+ * writes will return -EROFS: -+ */ -+ set_bit(BCH_FS_GOING_RO, &c->flags); -+#ifndef BCH_WRITE_REF_DEBUG -+ percpu_ref_kill(&c->writes); -+#else -+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) -+ bch2_write_ref_put(c, i); -+#endif -+ -+ /* -+ * If we're not doing an emergency shutdown, we want to wait on -+ * outstanding writes to complete so they don't see spurious errors due -+ * to shutting down the allocator: -+ * -+ * If we are doing an emergency shutdown outstanding writes may -+ * hang until we shutdown the allocator so we don't want to wait -+ * on outstanding writes before shutting everything down - but -+ 
* we do need to wait on them before returning and signalling -+ * that going RO is complete: -+ */ -+ wait_event(bch2_read_only_wait, -+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || -+ test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); -+ -+ __bch2_fs_read_only(c); -+ -+ wait_event(bch2_read_only_wait, -+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -+ -+ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); -+ clear_bit(BCH_FS_GOING_RO, &c->flags); -+ -+ if (!bch2_journal_error(&c->journal) && -+ !test_bit(BCH_FS_ERROR, &c->flags) && -+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && -+ test_bit(BCH_FS_STARTED, &c->flags) && -+ test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && -+ !c->opts.norecovery) { -+ BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); -+ BUG_ON(atomic_read(&c->btree_cache.dirty)); -+ BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); -+ BUG_ON(c->btree_write_buffer.state.nr); -+ -+ bch_verbose(c, "marking filesystem clean"); -+ bch2_fs_mark_clean(c); -+ } -+ -+ clear_bit(BCH_FS_RW, &c->flags); -+} -+ -+static void bch2_fs_read_only_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, read_only_work); -+ -+ down_write(&c->state_lock); -+ bch2_fs_read_only(c); -+ up_write(&c->state_lock); -+} -+ -+static void bch2_fs_read_only_async(struct bch_fs *c) -+{ -+ queue_work(system_long_wq, &c->read_only_work); -+} -+ -+bool bch2_fs_emergency_read_only(struct bch_fs *c) -+{ -+ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); -+ -+ bch2_journal_halt(&c->journal); -+ bch2_fs_read_only_async(c); -+ -+ wake_up(&bch2_read_only_wait); -+ return ret; -+} -+ -+static int bch2_fs_read_write_late(struct bch_fs *c) -+{ -+ int ret; -+ -+ /* -+ * Data move operations can't run until after check_snapshots has -+ * completed, and bch2_snapshot_is_ancestor() is available. 
-+ * -+ * Ideally we'd start copygc/rebalance earlier instead of waiting for -+ * all of recovery/fsck to complete: -+ */ -+ ret = bch2_copygc_start(c); -+ if (ret) { -+ bch_err(c, "error starting copygc thread"); -+ return ret; -+ } -+ -+ ret = bch2_rebalance_start(c); -+ if (ret) { -+ bch_err(c, "error starting rebalance thread"); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int __bch2_fs_read_write(struct bch_fs *c, bool early) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret; -+ -+ if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { -+ bch_err(c, "cannot go rw, unfixed btree errors"); -+ return -BCH_ERR_erofs_unfixed_errors; -+ } -+ -+ if (test_bit(BCH_FS_RW, &c->flags)) -+ return 0; -+ -+ if (c->opts.norecovery) -+ return -BCH_ERR_erofs_norecovery; -+ -+ /* -+ * nochanges is used for fsck -n mode - we have to allow going rw -+ * during recovery for that to work: -+ */ -+ if (c->opts.nochanges && (!early || c->opts.read_only)) -+ return -BCH_ERR_erofs_nochanges; -+ -+ bch_info(c, "going read-write"); -+ -+ ret = bch2_fs_mark_dirty(c); -+ if (ret) -+ goto err; -+ -+ clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); -+ -+ /* -+ * First journal write must be a flush write: after a clean shutdown we -+ * don't read the journal, so the first journal write may end up -+ * overwriting whatever was there previously, and there must always be -+ * at least one non-flush write in the journal or recovery will fail: -+ */ -+ set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); -+ -+ for_each_rw_member(ca, c, i) -+ bch2_dev_allocator_add(c, ca); -+ bch2_recalc_capacity(c); -+ -+ ret = bch2_gc_thread_start(c); -+ if (ret) { -+ bch_err(c, "error starting gc thread"); -+ return ret; -+ } -+ -+ if (!early) { -+ ret = bch2_fs_read_write_late(c); -+ if (ret) -+ goto err; -+ } -+ -+#ifndef BCH_WRITE_REF_DEBUG -+ percpu_ref_reinit(&c->writes); -+#else -+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { -+ BUG_ON(atomic_long_read(&c->writes[i])); -+ atomic_long_inc(&c->writes[i]); -+ } -+#endif -+ set_bit(BCH_FS_RW, &c->flags); -+ set_bit(BCH_FS_WAS_RW, &c->flags); -+ -+ bch2_do_discards(c); -+ bch2_do_invalidates(c); -+ bch2_do_stripe_deletes(c); -+ bch2_do_pending_node_rewrites(c); -+ return 0; -+err: -+ __bch2_fs_read_only(c); -+ return ret; -+} -+ -+int bch2_fs_read_write(struct bch_fs *c) -+{ -+ return __bch2_fs_read_write(c, false); -+} -+ -+int bch2_fs_read_write_early(struct bch_fs *c) -+{ -+ lockdep_assert_held(&c->state_lock); -+ -+ return __bch2_fs_read_write(c, true); -+} -+ -+/* Filesystem startup/shutdown: */ -+ -+static void __bch2_fs_free(struct bch_fs *c) -+{ -+ unsigned i; -+ int cpu; -+ -+ for (i = 0; i < BCH_TIME_STAT_NR; i++) -+ bch2_time_stats_exit(&c->times[i]); -+ -+ bch2_free_pending_node_rewrites(c); -+ bch2_fs_counters_exit(c); -+ bch2_fs_snapshots_exit(c); -+ bch2_fs_quota_exit(c); -+ bch2_fs_fs_io_direct_exit(c); -+ bch2_fs_fs_io_buffered_exit(c); -+ bch2_fs_fsio_exit(c); -+ bch2_fs_ec_exit(c); -+ bch2_fs_encryption_exit(c); -+ bch2_fs_io_exit(c); -+ bch2_fs_buckets_waiting_for_journal_exit(c); -+ bch2_fs_btree_interior_update_exit(c); -+ bch2_fs_btree_iter_exit(c); -+ bch2_fs_btree_key_cache_exit(&c->btree_key_cache); -+ bch2_fs_btree_cache_exit(c); -+ bch2_fs_replicas_exit(c); -+ bch2_fs_journal_exit(&c->journal); -+ bch2_io_clock_exit(&c->io_clock[WRITE]); -+ bch2_io_clock_exit(&c->io_clock[READ]); -+ bch2_fs_compress_exit(c); -+ bch2_journal_keys_free(&c->journal_keys); -+ bch2_journal_entries_free(c); -+ bch2_fs_btree_write_buffer_exit(c); -+ 
percpu_free_rwsem(&c->mark_lock); -+ free_percpu(c->online_reserved); -+ -+ if (c->btree_paths_bufs) -+ for_each_possible_cpu(cpu) -+ kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); -+ -+ darray_exit(&c->btree_roots_extra); -+ free_percpu(c->btree_paths_bufs); -+ free_percpu(c->pcpu); -+ mempool_exit(&c->large_bkey_pool); -+ mempool_exit(&c->btree_bounce_pool); -+ bioset_exit(&c->btree_bio); -+ mempool_exit(&c->fill_iter); -+#ifndef BCH_WRITE_REF_DEBUG -+ percpu_ref_exit(&c->writes); -+#endif -+ kfree(rcu_dereference_protected(c->disk_groups, 1)); -+ kfree(c->journal_seq_blacklist_table); -+ kfree(c->unused_inode_hints); -+ -+ if (c->write_ref_wq) -+ destroy_workqueue(c->write_ref_wq); -+ if (c->io_complete_wq) -+ destroy_workqueue(c->io_complete_wq); -+ if (c->copygc_wq) -+ destroy_workqueue(c->copygc_wq); -+ if (c->btree_io_complete_wq) -+ destroy_workqueue(c->btree_io_complete_wq); -+ if (c->btree_update_wq) -+ destroy_workqueue(c->btree_update_wq); -+ -+ bch2_free_super(&c->disk_sb); -+ kvpfree(c, sizeof(*c)); -+ module_put(THIS_MODULE); -+} -+ -+static void bch2_fs_release(struct kobject *kobj) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ -+ __bch2_fs_free(c); -+} -+ -+void __bch2_fs_stop(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ bch_verbose(c, "shutting down"); -+ -+ set_bit(BCH_FS_STOPPING, &c->flags); -+ -+ cancel_work_sync(&c->journal_seq_blacklist_gc_work); -+ -+ down_write(&c->state_lock); -+ bch2_fs_read_only(c); -+ up_write(&c->state_lock); -+ -+ for_each_member_device(ca, c, i) -+ if (ca->kobj.state_in_sysfs && -+ ca->disk_sb.bdev) -+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); -+ -+ if (c->kobj.state_in_sysfs) -+ kobject_del(&c->kobj); -+ -+ bch2_fs_debug_exit(c); -+ bch2_fs_chardev_exit(c); -+ -+ kobject_put(&c->counters_kobj); -+ kobject_put(&c->time_stats); -+ kobject_put(&c->opts_dir); -+ kobject_put(&c->internal); -+ -+ /* btree prefetch might have kicked off reads in the background: */ -+ bch2_btree_flush_all_reads(c); -+ -+ for_each_member_device(ca, c, i) -+ cancel_work_sync(&ca->io_error_work); -+ -+ cancel_work_sync(&c->read_only_work); -+} -+ -+void bch2_fs_free(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ BUG_ON(!test_bit(BCH_FS_STOPPING, &c->flags)); -+ -+ mutex_lock(&bch_fs_list_lock); -+ list_del(&c->list); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ closure_sync(&c->cl); -+ closure_debug_destroy(&c->cl); -+ -+ for (i = 0; i < c->sb.nr_devices; i++) { -+ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); -+ -+ if (ca) { -+ bch2_free_super(&ca->disk_sb); -+ bch2_dev_free(ca); -+ } -+ } -+ -+ bch_verbose(c, "shutdown complete"); -+ -+ kobject_put(&c->kobj); -+} -+ -+void bch2_fs_stop(struct bch_fs *c) -+{ -+ __bch2_fs_stop(c); -+ bch2_fs_free(c); -+} -+ -+static int bch2_fs_online(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; -+ -+ lockdep_assert_held(&bch_fs_list_lock); -+ -+ if (__bch2_uuid_to_fs(c->sb.uuid)) { -+ bch_err(c, "filesystem UUID already open"); -+ return -EINVAL; -+ } -+ -+ ret = bch2_fs_chardev_init(c); -+ if (ret) { -+ bch_err(c, "error creating character device"); -+ return ret; -+ } -+ -+ bch2_fs_debug_init(c); -+ -+ ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: -+ kobject_add(&c->internal, &c->kobj, "internal") ?: -+ kobject_add(&c->opts_dir, &c->kobj, "options") ?: -+ kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: -+ kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: -+ 
bch2_opts_create_sysfs_files(&c->opts_dir); -+ if (ret) { -+ bch_err(c, "error creating sysfs objects"); -+ return ret; -+ } -+ -+ down_write(&c->state_lock); -+ -+ for_each_member_device(ca, c, i) { -+ ret = bch2_dev_sysfs_online(c, ca); -+ if (ret) { -+ bch_err(c, "error creating sysfs objects"); -+ percpu_ref_put(&ca->ref); -+ goto err; -+ } -+ } -+ -+ BUG_ON(!list_empty(&c->list)); -+ list_add(&c->list, &bch_fs_list); -+err: -+ up_write(&c->state_lock); -+ return ret; -+} -+ -+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_fs *c; -+ struct printbuf name = PRINTBUF; -+ unsigned i, iter_size; -+ int ret = 0; -+ -+ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); -+ if (!c) { -+ c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); -+ goto out; -+ } -+ -+ __module_get(THIS_MODULE); -+ -+ closure_init(&c->cl, NULL); -+ -+ c->kobj.kset = bcachefs_kset; -+ kobject_init(&c->kobj, &bch2_fs_ktype); -+ kobject_init(&c->internal, &bch2_fs_internal_ktype); -+ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); -+ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); -+ kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); -+ -+ c->minor = -1; -+ c->disk_sb.fs_sb = true; -+ -+ init_rwsem(&c->state_lock); -+ mutex_init(&c->sb_lock); -+ mutex_init(&c->replicas_gc_lock); -+ mutex_init(&c->btree_root_lock); -+ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); -+ -+ init_rwsem(&c->gc_lock); -+ mutex_init(&c->gc_gens_lock); -+ -+ for (i = 0; i < BCH_TIME_STAT_NR; i++) -+ bch2_time_stats_init(&c->times[i]); -+ -+ bch2_fs_copygc_init(c); -+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); -+ bch2_fs_btree_interior_update_init_early(c); -+ bch2_fs_allocator_background_init(c); -+ bch2_fs_allocator_foreground_init(c); -+ bch2_fs_rebalance_init(c); -+ bch2_fs_quota_init(c); -+ bch2_fs_ec_init_early(c); -+ bch2_fs_move_init(c); -+ -+ INIT_LIST_HEAD(&c->list); -+ -+ mutex_init(&c->usage_scratch_lock); -+ -+ mutex_init(&c->bio_bounce_pages_lock); -+ mutex_init(&c->snapshot_table_lock); -+ -+ spin_lock_init(&c->btree_write_error_lock); -+ -+ INIT_WORK(&c->journal_seq_blacklist_gc_work, -+ bch2_blacklist_entries_gc); -+ -+ INIT_LIST_HEAD(&c->journal_iters); -+ -+ INIT_LIST_HEAD(&c->fsck_errors); -+ mutex_init(&c->fsck_error_lock); -+ -+ seqcount_init(&c->gc_pos_lock); -+ -+ seqcount_init(&c->usage_lock); -+ -+ sema_init(&c->io_in_flight, 128); -+ -+ INIT_LIST_HEAD(&c->vfs_inodes_list); -+ mutex_init(&c->vfs_inodes_lock); -+ -+ c->copy_gc_enabled = 1; -+ c->rebalance.enabled = 1; -+ c->promote_whole_extents = true; -+ -+ c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; -+ c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; -+ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; -+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; -+ -+ bch2_fs_btree_cache_init_early(&c->btree_cache); -+ -+ mutex_init(&c->sectors_available_lock); -+ -+ ret = percpu_init_rwsem(&c->mark_lock); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&c->sb_lock); -+ ret = bch2_sb_to_fs(c, sb); -+ mutex_unlock(&c->sb_lock); -+ -+ if (ret) -+ goto err; -+ -+ pr_uuid(&name, c->sb.user_uuid.b); -+ strscpy(c->name, name.buf, sizeof(c->name)); -+ printbuf_exit(&name); -+ -+ ret = name.allocation_failure ? 
-BCH_ERR_ENOMEM_fs_name_alloc : 0; -+ if (ret) -+ goto err; -+ -+ /* Compat: */ -+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && -+ !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) -+ SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); -+ -+ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && -+ !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) -+ SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); -+ -+ c->opts = bch2_opts_default; -+ ret = bch2_opts_from_sb(&c->opts, sb); -+ if (ret) -+ goto err; -+ -+ bch2_opts_apply(&c->opts, opts); -+ -+ c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; -+ if (c->opts.inodes_use_key_cache) -+ c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; -+ -+ c->block_bits = ilog2(block_sectors(c)); -+ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); -+ -+ if (bch2_fs_init_fault("fs_alloc")) { -+ bch_err(c, "fs_alloc fault injected"); -+ ret = -EFAULT; -+ goto err; -+ } -+ -+ iter_size = sizeof(struct sort_iter) + -+ (btree_blocks(c) + 1) * 2 * -+ sizeof(struct sort_iter_set); -+ -+ c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); -+ -+ if (!(c->btree_update_wq = alloc_workqueue("bcachefs", -+ WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || -+ !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || -+ !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -+ !(c->io_complete_wq = alloc_workqueue("bcachefs_io", -+ WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || -+ !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", -+ WQ_FREEZABLE, 0)) || -+#ifndef BCH_WRITE_REF_DEBUG -+ percpu_ref_init(&c->writes, bch2_writes_disabled, -+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -+#endif -+ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || -+ bioset_init(&c->btree_bio, 1, -+ max(offsetof(struct btree_read_bio, bio), -+ offsetof(struct btree_write_bio, wbio.bio)), -+ BIOSET_NEED_BVECS) || -+ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || -+ !(c->online_reserved = alloc_percpu(u64)) || -+ !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || -+ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, -+ btree_bytes(c)) || -+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || -+ !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, -+ sizeof(u64), GFP_KERNEL))) { -+ ret = -BCH_ERR_ENOMEM_fs_other_alloc; -+ goto err; -+ } -+ -+ ret = bch2_fs_counters_init(c) ?: -+ bch2_io_clock_init(&c->io_clock[READ]) ?: -+ bch2_io_clock_init(&c->io_clock[WRITE]) ?: -+ bch2_fs_journal_init(&c->journal) ?: -+ bch2_fs_replicas_init(c) ?: -+ bch2_fs_btree_cache_init(c) ?: -+ bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: -+ bch2_fs_btree_iter_init(c) ?: -+ bch2_fs_btree_interior_update_init(c) ?: -+ bch2_fs_buckets_waiting_for_journal_init(c) ?: -+ bch2_fs_btree_write_buffer_init(c) ?: -+ bch2_fs_subvolumes_init(c) ?: -+ bch2_fs_io_init(c) ?: -+ bch2_fs_nocow_locking_init(c) ?: -+ bch2_fs_encryption_init(c) ?: -+ bch2_fs_compress_init(c) ?: -+ bch2_fs_ec_init(c) ?: -+ bch2_fs_fsio_init(c) ?: -+ bch2_fs_fs_io_buffered_init(c); -+ bch2_fs_fs_io_direct_init(c); -+ if (ret) -+ goto err; -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ for (i = 0; i < c->sb.nr_devices; i++) -+ if (bch2_dev_exists(c->disk_sb.sb, mi, i) && -+ bch2_dev_alloc(c, i)) { -+ ret = -EEXIST; -+ goto err; -+ } -+ -+ bch2_journal_entry_res_resize(&c->journal, -+ &c->btree_root_journal_res, -+ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); -+ 
bch2_dev_usage_journal_reserve(c); -+ bch2_journal_entry_res_resize(&c->journal, -+ &c->clock_journal_res, -+ (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); -+ -+ mutex_lock(&bch_fs_list_lock); -+ ret = bch2_fs_online(c); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ if (ret) -+ goto err; -+out: -+ return c; -+err: -+ bch2_fs_free(c); -+ c = ERR_PTR(ret); -+ goto out; -+} -+ -+noinline_for_stack -+static void print_mount_opts(struct bch_fs *c) -+{ -+ enum bch_opt_id i; -+ struct printbuf p = PRINTBUF; -+ bool first = true; -+ -+ prt_str(&p, "mounting version "); -+ bch2_version_to_text(&p, c->sb.version); -+ -+ if (c->opts.read_only) { -+ prt_str(&p, " opts="); -+ first = false; -+ prt_printf(&p, "ro"); -+ } -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ const struct bch_option *opt = &bch2_opt_table[i]; -+ u64 v = bch2_opt_get_by_id(&c->opts, i); -+ -+ if (!(opt->flags & OPT_MOUNT)) -+ continue; -+ -+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) -+ continue; -+ -+ prt_str(&p, first ? " opts=" : ","); -+ first = false; -+ bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); -+ } -+ -+ bch_info(c, "%s", p.buf); -+ printbuf_exit(&p); -+} -+ -+int bch2_fs_start(struct bch_fs *c) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ time64_t now = ktime_get_real_seconds(); -+ unsigned i; -+ int ret; -+ -+ print_mount_opts(c); -+ -+ down_write(&c->state_lock); -+ -+ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); -+ -+ mutex_lock(&c->sb_lock); -+ -+ for_each_online_member(ca, c, i) -+ bch2_sb_from_fs(c, ca); -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ for_each_online_member(ca, c, i) -+ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ for_each_rw_member(ca, c, i) -+ bch2_dev_allocator_add(c, ca); -+ bch2_recalc_capacity(c); -+ -+ for (i = 0; i < BCH_TRANSACTIONS_NR; i++) { -+ mutex_lock(&c->btree_transaction_stats[i].lock); -+ bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times); -+ mutex_unlock(&c->btree_transaction_stats[i].lock); -+ } -+ -+ ret = BCH_SB_INITIALIZED(c->disk_sb.sb) -+ ? bch2_fs_recovery(c) -+ : bch2_fs_initialize(c); -+ if (ret) -+ goto err; -+ -+ ret = bch2_opts_check_may_set(c); -+ if (ret) -+ goto err; -+ -+ if (bch2_fs_init_fault("fs_start")) { -+ bch_err(c, "fs_start fault injected"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ set_bit(BCH_FS_STARTED, &c->flags); -+ -+ if (c->opts.read_only || c->opts.nochanges) { -+ bch2_fs_read_only(c); -+ } else { -+ ret = !test_bit(BCH_FS_RW, &c->flags) -+ ? bch2_fs_read_write(c) -+ : bch2_fs_read_write_late(c); -+ if (ret) -+ goto err; -+ } -+ -+ ret = 0; -+out: -+ up_write(&c->state_lock); -+ return ret; -+err: -+ bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); -+ goto out; -+} -+ -+static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) -+{ -+ struct bch_sb_field_members *sb_mi; -+ -+ sb_mi = bch2_sb_get_members(sb); -+ if (!sb_mi) -+ return -BCH_ERR_member_info_missing; -+ -+ if (le16_to_cpu(sb->block_size) != block_sectors(c)) -+ return -BCH_ERR_mismatched_block_size; -+ -+ if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) -+ return -BCH_ERR_bucket_size_too_small; -+ -+ return 0; -+} -+ -+static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) -+{ -+ struct bch_sb *newest = -+ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? 
fs : sb; -+ struct bch_sb_field_members *mi = bch2_sb_get_members(newest); -+ -+ if (!uuid_equal(&fs->uuid, &sb->uuid)) -+ return -BCH_ERR_device_not_a_member_of_filesystem; -+ -+ if (!bch2_dev_exists(newest, mi, sb->dev_idx)) -+ return -BCH_ERR_device_has_been_removed; -+ -+ if (fs->block_size != sb->block_size) -+ return -BCH_ERR_mismatched_block_size; -+ -+ return 0; -+} -+ -+/* Device startup/shutdown: */ -+ -+static void bch2_dev_release(struct kobject *kobj) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ -+ kfree(ca); -+} -+ -+static void bch2_dev_free(struct bch_dev *ca) -+{ -+ cancel_work_sync(&ca->io_error_work); -+ -+ if (ca->kobj.state_in_sysfs && -+ ca->disk_sb.bdev) -+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); -+ -+ if (ca->kobj.state_in_sysfs) -+ kobject_del(&ca->kobj); -+ -+ bch2_free_super(&ca->disk_sb); -+ bch2_dev_journal_exit(ca); -+ -+ free_percpu(ca->io_done); -+ bioset_exit(&ca->replica_set); -+ bch2_dev_buckets_free(ca); -+ free_page((unsigned long) ca->sb_read_scratch); -+ -+ bch2_time_stats_exit(&ca->io_latency[WRITE]); -+ bch2_time_stats_exit(&ca->io_latency[READ]); -+ -+ percpu_ref_exit(&ca->io_ref); -+ percpu_ref_exit(&ca->ref); -+ kobject_put(&ca->kobj); -+} -+ -+static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) -+{ -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ if (percpu_ref_is_zero(&ca->io_ref)) -+ return; -+ -+ __bch2_dev_read_only(c, ca); -+ -+ reinit_completion(&ca->io_ref_completion); -+ percpu_ref_kill(&ca->io_ref); -+ wait_for_completion(&ca->io_ref_completion); -+ -+ if (ca->kobj.state_in_sysfs) { -+ sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); -+ sysfs_remove_link(&ca->kobj, "block"); -+ } -+ -+ bch2_free_super(&ca->disk_sb); -+ bch2_dev_journal_exit(ca); -+} -+ -+static void bch2_dev_ref_complete(struct percpu_ref *ref) -+{ -+ struct bch_dev *ca = container_of(ref, struct bch_dev, ref); -+ -+ complete(&ca->ref_completion); -+} -+ -+static void bch2_dev_io_ref_complete(struct percpu_ref *ref) -+{ -+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); -+ -+ complete(&ca->io_ref_completion); -+} -+ -+static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) -+{ -+ int ret; -+ -+ if (!c->kobj.state_in_sysfs) -+ return 0; -+ -+ if (!ca->kobj.state_in_sysfs) { -+ ret = kobject_add(&ca->kobj, &c->kobj, -+ "dev-%u", ca->dev_idx); -+ if (ret) -+ return ret; -+ } -+ -+ if (ca->disk_sb.bdev) { -+ struct kobject *block = bdev_kobj(ca->disk_sb.bdev); -+ -+ ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); -+ if (ret) -+ return ret; -+ -+ ret = sysfs_create_link(&ca->kobj, block, "block"); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, -+ struct bch_member *member) -+{ -+ struct bch_dev *ca; -+ -+ ca = kzalloc(sizeof(*ca), GFP_KERNEL); -+ if (!ca) -+ return NULL; -+ -+ kobject_init(&ca->kobj, &bch2_dev_ktype); -+ init_completion(&ca->ref_completion); -+ init_completion(&ca->io_ref_completion); -+ -+ init_rwsem(&ca->bucket_lock); -+ -+ INIT_WORK(&ca->io_error_work, bch2_io_error_work); -+ -+ bch2_time_stats_init(&ca->io_latency[READ]); -+ bch2_time_stats_init(&ca->io_latency[WRITE]); -+ -+ ca->mi = bch2_mi_to_cpu(member); -+ ca->uuid = member->uuid; -+ -+ ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, -+ ca->mi.bucket_size / btree_sectors(c)); -+ -+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, -+ 0, GFP_KERNEL) || -+ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, 
-+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -+ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || -+ bch2_dev_buckets_alloc(c, ca) || -+ bioset_init(&ca->replica_set, 4, -+ offsetof(struct bch_write_bio, bio), 0) || -+ !(ca->io_done = alloc_percpu(*ca->io_done))) -+ goto err; -+ -+ return ca; -+err: -+ bch2_dev_free(ca); -+ return NULL; -+} -+ -+static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, -+ unsigned dev_idx) -+{ -+ ca->dev_idx = dev_idx; -+ __set_bit(ca->dev_idx, ca->self.d); -+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); -+ -+ ca->fs = c; -+ rcu_assign_pointer(c->devs[ca->dev_idx], ca); -+ -+ if (bch2_dev_sysfs_online(c, ca)) -+ pr_warn("error creating sysfs objects"); -+} -+ -+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) -+{ -+ struct bch_member *member = -+ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; -+ struct bch_dev *ca = NULL; -+ int ret = 0; -+ -+ if (bch2_fs_init_fault("dev_alloc")) -+ goto err; -+ -+ ca = __bch2_dev_alloc(c, member); -+ if (!ca) -+ goto err; -+ -+ ca->fs = c; -+ -+ bch2_dev_attach(c, ca, dev_idx); -+ return ret; -+err: -+ if (ca) -+ bch2_dev_free(ca); -+ return -BCH_ERR_ENOMEM_dev_alloc; -+} -+ -+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) -+{ -+ unsigned ret; -+ -+ if (bch2_dev_is_online(ca)) { -+ bch_err(ca, "already have device online in slot %u", -+ sb->sb->dev_idx); -+ return -BCH_ERR_device_already_online; -+ } -+ -+ if (get_capacity(sb->bdev->bd_disk) < -+ ca->mi.bucket_size * ca->mi.nbuckets) { -+ bch_err(ca, "cannot online: device too small"); -+ return -BCH_ERR_device_size_too_small; -+ } -+ -+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); -+ -+ ret = bch2_dev_journal_init(ca, sb->sb); -+ if (ret) -+ return ret; -+ -+ /* Commit: */ -+ ca->disk_sb = *sb; -+ memset(sb, 0, sizeof(*sb)); -+ -+ ca->dev = ca->disk_sb.bdev->bd_dev; -+ -+ percpu_ref_reinit(&ca->io_ref); -+ -+ return 0; -+} -+ -+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ if (le64_to_cpu(sb->sb->seq) > -+ le64_to_cpu(c->disk_sb.sb->seq)) -+ bch2_sb_to_fs(c, sb->sb); -+ -+ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || -+ !c->devs[sb->sb->dev_idx]); -+ -+ ca = bch_dev_locked(c, sb->sb->dev_idx); -+ -+ ret = __bch2_dev_attach_bdev(ca, sb); -+ if (ret) -+ return ret; -+ -+ bch2_dev_sysfs_online(c, ca); -+ -+ if (c->sb.nr_devices == 1) -+ snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev); -+ snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev); -+ -+ rebalance_wakeup(c); -+ return 0; -+} -+ -+/* Device management: */ -+ -+/* -+ * Note: this function is also used by the error paths - when a particular -+ * device sees an error, we call it to determine whether we can just set the -+ * device RO, or - if this function returns false - we'll set the whole -+ * filesystem RO: -+ * -+ * XXX: maybe we should be more explicit about whether we're changing state -+ * because we got an error or what have you? 
-+ */ -+bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ struct bch_devs_mask new_online_devs; -+ struct bch_dev *ca2; -+ int i, nr_rw = 0, required; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ switch (new_state) { -+ case BCH_MEMBER_STATE_rw: -+ return true; -+ case BCH_MEMBER_STATE_ro: -+ if (ca->mi.state != BCH_MEMBER_STATE_rw) -+ return true; -+ -+ /* do we have enough devices to write to? */ -+ for_each_member_device(ca2, c, i) -+ if (ca2 != ca) -+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; -+ -+ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) -+ ? c->opts.metadata_replicas -+ : c->opts.metadata_replicas_required, -+ !(flags & BCH_FORCE_IF_DATA_DEGRADED) -+ ? c->opts.data_replicas -+ : c->opts.data_replicas_required); -+ -+ return nr_rw >= required; -+ case BCH_MEMBER_STATE_failed: -+ case BCH_MEMBER_STATE_spare: -+ if (ca->mi.state != BCH_MEMBER_STATE_rw && -+ ca->mi.state != BCH_MEMBER_STATE_ro) -+ return true; -+ -+ /* do we have enough devices to read from? */ -+ new_online_devs = bch2_online_devs(c); -+ __clear_bit(ca->dev_idx, new_online_devs.d); -+ -+ return bch2_have_enough_devs(c, new_online_devs, flags, false); -+ default: -+ BUG(); -+ } -+} -+ -+static bool bch2_fs_may_start(struct bch_fs *c) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ unsigned i, flags = 0; -+ -+ if (c->opts.very_degraded) -+ flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; -+ -+ if (c->opts.degraded) -+ flags |= BCH_FORCE_IF_DEGRADED; -+ -+ if (!c->opts.degraded && -+ !c->opts.very_degraded) { -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ -+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { -+ if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) -+ continue; -+ -+ ca = bch_dev_locked(c, i); -+ -+ if (!bch2_dev_is_online(ca) && -+ (ca->mi.state == BCH_MEMBER_STATE_rw || -+ ca->mi.state == BCH_MEMBER_STATE_ro)) { -+ mutex_unlock(&c->sb_lock); -+ return false; -+ } -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); -+} -+ -+static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) -+{ -+ /* -+ * The allocator thread itself allocates btree nodes, so stop it first: -+ */ -+ bch2_dev_allocator_remove(c, ca); -+ bch2_dev_journal_stop(&c->journal, ca); -+} -+ -+static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) -+{ -+ lockdep_assert_held(&c->state_lock); -+ -+ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); -+ -+ bch2_dev_allocator_add(c, ca); -+ bch2_recalc_capacity(c); -+} -+ -+int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ struct bch_sb_field_members *mi; -+ int ret = 0; -+ -+ if (ca->mi.state == new_state) -+ return 0; -+ -+ if (!bch2_dev_state_allowed(c, ca, new_state, flags)) -+ return -BCH_ERR_device_state_not_allowed; -+ -+ if (new_state != BCH_MEMBER_STATE_rw) -+ __bch2_dev_read_only(c, ca); -+ -+ bch_notice(ca, "%s", bch2_member_states[new_state]); -+ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ if (new_state == BCH_MEMBER_STATE_rw) -+ __bch2_dev_read_write(c, ca); -+ -+ rebalance_wakeup(c); -+ -+ return ret; -+} -+ -+int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ int ret; -+ -+ 
down_write(&c->state_lock); -+ ret = __bch2_dev_set_state(c, ca, new_state, flags); -+ up_write(&c->state_lock); -+ -+ return ret; -+} -+ -+/* Device add/removal: */ -+ -+static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bpos start = POS(ca->dev_idx, 0); -+ struct bpos end = POS(ca->dev_idx, U64_MAX); -+ int ret; -+ -+ /* -+ * We clear the LRU and need_discard btrees first so that we don't race -+ * with bch2_do_invalidates() and bch2_do_discards() -+ */ -+ ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, -+ BTREE_TRIGGER_NORUN, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, -+ BTREE_TRIGGER_NORUN, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, -+ BTREE_TRIGGER_NORUN, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, -+ BTREE_TRIGGER_NORUN, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, -+ BTREE_TRIGGER_NORUN, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, -+ BTREE_TRIGGER_NORUN, NULL); -+ if (ret) -+ bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); -+ -+ return ret; -+} -+ -+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) -+{ -+ struct bch_sb_field_members *mi; -+ unsigned dev_idx = ca->dev_idx, data; -+ int ret; -+ -+ down_write(&c->state_lock); -+ -+ /* -+ * We consume a reference to ca->ref, regardless of whether we succeed -+ * or fail: -+ */ -+ percpu_ref_put(&ca->ref); -+ -+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { -+ bch_err(ca, "Cannot remove without losing data"); -+ ret = -BCH_ERR_device_state_not_allowed; -+ goto err; -+ } -+ -+ __bch2_dev_read_only(c, ca); -+ -+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags); -+ if (ret) { -+ bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ ret = bch2_dev_remove_alloc(c, ca); -+ if (ret) { -+ bch_err(ca, "Remove failed, error deleting alloc info"); -+ goto err; -+ } -+ -+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); -+ if (ret) { -+ bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ ret = bch2_journal_flush(&c->journal); -+ if (ret) { -+ bch_err(ca, "Remove failed, journal error"); -+ goto err; -+ } -+ -+ ret = bch2_replicas_gc2(c); -+ if (ret) { -+ bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ data = bch2_dev_has_data(c, ca); -+ if (data) { -+ struct printbuf data_has = PRINTBUF; -+ -+ prt_bitflags(&data_has, bch2_data_types, data); -+ bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); -+ printbuf_exit(&data_has); -+ ret = -EBUSY; -+ goto err; -+ } -+ -+ __bch2_dev_offline(c, ca); -+ -+ mutex_lock(&c->sb_lock); -+ rcu_assign_pointer(c->devs[ca->dev_idx], NULL); -+ mutex_unlock(&c->sb_lock); -+ -+ percpu_ref_kill(&ca->ref); -+ wait_for_completion(&ca->ref_completion); -+ -+ bch2_dev_free(ca); -+ -+ /* -+ * At this point the device object has been removed in-core, but the -+ * on-disk journal might still refer to the device index via sb device -+ * usage entries. Recovery fails if it sees usage information for an -+ * invalid device. Flush journal pins to push the back of the journal -+ * past now invalid device index references before we update the -+ * superblock, but after the device object has been removed so any -+ * further journal writes elide usage info for the device. 
-+ */ -+ bch2_journal_flush_all_pins(&c->journal); -+ -+ /* -+ * Free this device's slot in the bch_member array - all pointers to -+ * this device must be gone: -+ */ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); -+ -+ bch2_write_super(c); -+ -+ mutex_unlock(&c->sb_lock); -+ up_write(&c->state_lock); -+ -+ bch2_dev_usage_journal_reserve(c); -+ return 0; -+err: -+ if (ca->mi.state == BCH_MEMBER_STATE_rw && -+ !percpu_ref_is_zero(&ca->io_ref)) -+ __bch2_dev_read_write(c, ca); -+ up_write(&c->state_lock); -+ return ret; -+} -+ -+/* Add new device to running filesystem: */ -+int bch2_dev_add(struct bch_fs *c, const char *path) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ struct bch_sb_handle sb; -+ struct bch_dev *ca = NULL; -+ struct bch_sb_field_members *mi; -+ struct bch_member dev_mi; -+ unsigned dev_idx, nr_devices, u64s; -+ struct printbuf errbuf = PRINTBUF; -+ struct printbuf label = PRINTBUF; -+ int ret; -+ -+ ret = bch2_read_super(path, &opts, &sb); -+ if (ret) { -+ bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; -+ -+ if (BCH_MEMBER_GROUP(&dev_mi)) { -+ bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); -+ if (label.allocation_failure) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ } -+ -+ ret = bch2_dev_may_add(sb.sb, c); -+ if (ret) { -+ bch_err(c, "device add error: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ ca = __bch2_dev_alloc(c, &dev_mi); -+ if (!ca) { -+ bch2_free_super(&sb); -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ bch2_dev_usage_init(ca); -+ -+ ret = __bch2_dev_attach_bdev(ca, &sb); -+ if (ret) { -+ bch2_dev_free(ca); -+ goto err; -+ } -+ -+ ret = bch2_dev_journal_alloc(ca); -+ if (ret) { -+ bch_err(c, "device add error: journal alloc failed"); -+ goto err; -+ } -+ -+ down_write(&c->state_lock); -+ mutex_lock(&c->sb_lock); -+ -+ ret = bch2_sb_from_fs(c, ca); -+ if (ret) { -+ bch_err(c, "device add error: new device superblock too small"); -+ goto err_unlock; -+ } -+ -+ mi = bch2_sb_get_members(ca->disk_sb.sb); -+ -+ if (!bch2_sb_resize_members(&ca->disk_sb, -+ le32_to_cpu(mi->field.u64s) + -+ sizeof(dev_mi) / sizeof(u64))) { -+ bch_err(c, "device add error: new device superblock too small"); -+ ret = -BCH_ERR_ENOSPC_sb_members; -+ goto err_unlock; -+ } -+ -+ if (dynamic_fault("bcachefs:add:no_slot")) -+ goto no_slot; -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) -+ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) -+ goto have_slot; -+no_slot: -+ bch_err(c, "device add error: already have maximum number of devices"); -+ ret = -BCH_ERR_ENOSPC_sb_members; -+ goto err_unlock; -+ -+have_slot: -+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); -+ u64s = (sizeof(struct bch_sb_field_members) + -+ sizeof(struct bch_member) * nr_devices) / sizeof(u64); -+ -+ mi = bch2_sb_resize_members(&c->disk_sb, u64s); -+ if (!mi) { -+ bch_err(c, "device add error: no room in superblock for member info"); -+ ret = -BCH_ERR_ENOSPC_sb_members; -+ goto err_unlock; -+ } -+ -+ /* success: */ -+ -+ mi->members[dev_idx] = dev_mi; -+ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); -+ c->disk_sb.sb->nr_devices = nr_devices; -+ -+ ca->disk_sb.sb->dev_idx = dev_idx; -+ bch2_dev_attach(c, ca, dev_idx); -+ -+ if (BCH_MEMBER_GROUP(&dev_mi)) { -+ ret = 
__bch2_dev_group_set(c, ca, label.buf); -+ if (ret) { -+ bch_err(c, "device add error: error setting label"); -+ goto err_unlock; -+ } -+ } -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ bch2_dev_usage_journal_reserve(c); -+ -+ ret = bch2_trans_mark_dev_sb(c, ca); -+ if (ret) { -+ bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret)); -+ goto err_late; -+ } -+ -+ ret = bch2_fs_freespace_init(c); -+ if (ret) { -+ bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); -+ goto err_late; -+ } -+ -+ ca->new_fs_bucket_idx = 0; -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_rw) -+ __bch2_dev_read_write(c, ca); -+ -+ up_write(&c->state_lock); -+ return 0; -+ -+err_unlock: -+ mutex_unlock(&c->sb_lock); -+ up_write(&c->state_lock); -+err: -+ if (ca) -+ bch2_dev_free(ca); -+ bch2_free_super(&sb); -+ printbuf_exit(&label); -+ printbuf_exit(&errbuf); -+ return ret; -+err_late: -+ up_write(&c->state_lock); -+ ca = NULL; -+ goto err; -+} -+ -+/* Hot add existing device to running filesystem: */ -+int bch2_dev_online(struct bch_fs *c, const char *path) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ struct bch_sb_handle sb = { NULL }; -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ unsigned dev_idx; -+ int ret; -+ -+ down_write(&c->state_lock); -+ -+ ret = bch2_read_super(path, &opts, &sb); -+ if (ret) { -+ up_write(&c->state_lock); -+ return ret; -+ } -+ -+ dev_idx = sb.sb->dev_idx; -+ -+ ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); -+ if (ret) { -+ bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret)); -+ goto err; -+ } -+ -+ ret = bch2_dev_attach_bdev(c, &sb); -+ if (ret) -+ goto err; -+ -+ ca = bch_dev_locked(c, dev_idx); -+ -+ ret = bch2_trans_mark_dev_sb(c, ca); -+ if (ret) { -+ bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s", -+ path, bch2_err_str(ret)); -+ goto err; -+ } -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_rw) -+ __bch2_dev_read_write(c, ca); -+ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ -+ mi->members[ca->dev_idx].last_mount = -+ cpu_to_le64(ktime_get_real_seconds()); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ ret = bch2_fs_freespace_init(c); -+ if (ret) -+ bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); -+ -+ up_write(&c->state_lock); -+ return 0; -+err: -+ up_write(&c->state_lock); -+ bch2_free_super(&sb); -+ return ret; -+} -+ -+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) -+{ -+ down_write(&c->state_lock); -+ -+ if (!bch2_dev_is_online(ca)) { -+ bch_err(ca, "Already offline"); -+ up_write(&c->state_lock); -+ return 0; -+ } -+ -+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { -+ bch_err(ca, "Cannot offline required disk"); -+ up_write(&c->state_lock); -+ return -BCH_ERR_device_state_not_allowed; -+ } -+ -+ __bch2_dev_offline(c, ca); -+ -+ up_write(&c->state_lock); -+ return 0; -+} -+ -+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -+{ -+ struct bch_member *mi; -+ int ret = 0; -+ -+ down_write(&c->state_lock); -+ -+ if (nbuckets < ca->mi.nbuckets) { -+ bch_err(ca, "Cannot shrink yet"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ if (bch2_dev_is_online(ca) && -+ get_capacity(ca->disk_sb.bdev->bd_disk) < -+ ca->mi.bucket_size * nbuckets) { -+ bch_err(ca, "New size larger than device"); -+ ret = -BCH_ERR_device_size_too_small; -+ goto err; -+ } -+ -+ ret = bch2_dev_buckets_resize(c, ca, nbuckets); -+ 
if (ret) { -+ bch_err(ca, "Resize error: %s", bch2_err_str(ret)); -+ goto err; -+ } -+ -+ ret = bch2_trans_mark_dev_sb(c, ca); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ mi->nbuckets = cpu_to_le64(nbuckets); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ bch2_recalc_capacity(c); -+err: -+ up_write(&c->state_lock); -+ return ret; -+} -+ -+/* return with ref on ca->ref: */ -+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, NULL) -+ if (!strcmp(name, ca->name)) -+ goto found; -+ ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); -+found: -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+/* Filesystem open: */ -+ -+struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, -+ struct bch_opts opts) -+{ -+ struct bch_sb_handle *sb = NULL; -+ struct bch_fs *c = NULL; -+ struct bch_sb_field_members *mi; -+ unsigned i, best_sb = 0; -+ struct printbuf errbuf = PRINTBUF; -+ int ret = 0; -+ -+ if (!try_module_get(THIS_MODULE)) -+ return ERR_PTR(-ENODEV); -+ -+ if (!nr_devices) { -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); -+ if (!sb) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ for (i = 0; i < nr_devices; i++) { -+ ret = bch2_read_super(devices[i], &opts, &sb[i]); -+ if (ret) -+ goto err; -+ -+ } -+ -+ for (i = 1; i < nr_devices; i++) -+ if (le64_to_cpu(sb[i].sb->seq) > -+ le64_to_cpu(sb[best_sb].sb->seq)) -+ best_sb = i; -+ -+ mi = bch2_sb_get_members(sb[best_sb].sb); -+ -+ i = 0; -+ while (i < nr_devices) { -+ if (i != best_sb && -+ !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { -+ pr_info("%pg has been removed, skipping", sb[i].bdev); -+ bch2_free_super(&sb[i]); -+ array_remove_item(sb, nr_devices, i); -+ continue; -+ } -+ -+ ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); -+ if (ret) -+ goto err_print; -+ i++; -+ } -+ -+ c = bch2_fs_alloc(sb[best_sb].sb, opts); -+ if (IS_ERR(c)) { -+ ret = PTR_ERR(c); -+ goto err; -+ } -+ -+ down_write(&c->state_lock); -+ for (i = 0; i < nr_devices; i++) { -+ ret = bch2_dev_attach_bdev(c, &sb[i]); -+ if (ret) { -+ up_write(&c->state_lock); -+ goto err; -+ } -+ } -+ up_write(&c->state_lock); -+ -+ if (!bch2_fs_may_start(c)) { -+ ret = -BCH_ERR_insufficient_devices_to_start; -+ goto err_print; -+ } -+ -+ if (!c->opts.nostart) { -+ ret = bch2_fs_start(c); -+ if (ret) -+ goto err; -+ } -+out: -+ kfree(sb); -+ printbuf_exit(&errbuf); -+ module_put(THIS_MODULE); -+ return c; -+err_print: -+ pr_err("bch_fs_open err opening %s: %s", -+ devices[0], bch2_err_str(ret)); -+err: -+ if (!IS_ERR_OR_NULL(c)) -+ bch2_fs_stop(c); -+ if (sb) -+ for (i = 0; i < nr_devices; i++) -+ bch2_free_super(&sb[i]); -+ c = ERR_PTR(ret); -+ goto out; -+} -+ -+/* Global interfaces/init */ -+ -+static void bcachefs_exit(void) -+{ -+ bch2_debug_exit(); -+ bch2_vfs_exit(); -+ bch2_chardev_exit(); -+ bch2_btree_key_cache_exit(); -+ if (bcachefs_kset) -+ kset_unregister(bcachefs_kset); -+} -+ -+static int __init bcachefs_init(void) -+{ -+ bch2_bkey_pack_test(); -+ -+ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || -+ bch2_btree_key_cache_init() || -+ bch2_chardev_init() || -+ bch2_vfs_init() || -+ bch2_debug_init()) -+ goto err; -+ -+ return 0; -+err: -+ bcachefs_exit(); -+ return -ENOMEM; -+} -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ bool bch2_##name; \ -+ module_param_named(name, 
bch2_##name, bool, 0644); \ -+ MODULE_PARM_DESC(name, description); -+BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+__maybe_unused -+static unsigned bch2_metadata_version = bcachefs_metadata_version_current; -+module_param_named(version, bch2_metadata_version, uint, 0400); -+ -+module_exit(bcachefs_exit); -+module_init(bcachefs_init); -diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h -new file mode 100644 -index 000000000..bf762df18 ---- /dev/null -+++ b/fs/bcachefs/super.h -@@ -0,0 +1,52 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_H -+#define _BCACHEFS_SUPER_H -+ -+#include "extents.h" -+ -+#include "bcachefs_ioctl.h" -+ -+#include -+ -+struct bch_fs *bch2_dev_to_fs(dev_t); -+struct bch_fs *bch2_uuid_to_fs(__uuid_t); -+ -+bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+ -+int bch2_dev_fail(struct bch_dev *, int); -+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); -+int bch2_dev_add(struct bch_fs *, const char *); -+int bch2_dev_online(struct bch_fs *, const char *); -+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); -+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); -+struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); -+ -+bool bch2_fs_emergency_read_only(struct bch_fs *); -+void bch2_fs_read_only(struct bch_fs *); -+ -+int bch2_fs_read_write(struct bch_fs *); -+int bch2_fs_read_write_early(struct bch_fs *); -+ -+/* -+ * Only for use in the recovery/fsck path: -+ */ -+static inline void bch2_fs_lazy_rw(struct bch_fs *c) -+{ -+ if (!test_bit(BCH_FS_RW, &c->flags) && -+ !test_bit(BCH_FS_WAS_RW, &c->flags)) -+ bch2_fs_read_write_early(c); -+} -+ -+void __bch2_fs_stop(struct bch_fs *); -+void bch2_fs_free(struct bch_fs *); -+void bch2_fs_stop(struct bch_fs *); -+ -+int bch2_fs_start(struct bch_fs *); -+struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); -+ -+#endif /* _BCACHEFS_SUPER_H */ -diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h -new file mode 100644 -index 000000000..08faeedba ---- /dev/null -+++ b/fs/bcachefs/super_types.h -@@ -0,0 +1,52 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_TYPES_H -+#define _BCACHEFS_SUPER_TYPES_H -+ -+struct bch_sb_handle { -+ struct bch_sb *sb; -+ struct block_device *bdev; -+ struct bio *bio; -+ void *holder; -+ size_t buffer_size; -+ fmode_t mode; -+ unsigned have_layout:1; -+ unsigned have_bio:1; -+ unsigned fs_sb:1; -+ u64 seq; -+}; -+ -+struct bch_devs_mask { -+ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; -+}; -+ -+struct bch_devs_list { -+ u8 nr; -+ u8 devs[BCH_BKEY_PTRS_MAX]; -+}; -+ -+struct bch_member_cpu { -+ u64 nbuckets; /* device size */ -+ u16 first_bucket; /* index of first bucket used */ -+ u16 bucket_size; /* sectors */ -+ u16 group; -+ u8 state; -+ u8 discard; -+ u8 data_allowed; -+ u8 durability; -+ u8 freespace_initialized; -+ u8 valid; -+}; -+ -+struct bch_disk_group_cpu { -+ bool deleted; -+ u16 parent; -+ struct bch_devs_mask devs; -+}; -+ -+struct bch_disk_groups_cpu { -+ struct rcu_head rcu; -+ unsigned nr; -+ struct bch_disk_group_cpu entries[]; -+}; -+ -+#endif /* _BCACHEFS_SUPER_TYPES_H */ -diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c -new file mode 100644 -index 000000000..941f4bcb9 ---- /dev/null -+++ b/fs/bcachefs/sysfs.c -@@ -0,0 
+1,1059 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcache sysfs interfaces -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#ifndef NO_BCACHEFS_SYSFS -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "sysfs.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "inode.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "movinggc.h" -+#include "nocow_locking.h" -+#include "opts.h" -+#include "rebalance.h" -+#include "replicas.h" -+#include "super-io.h" -+#include "tests.h" -+ -+#include -+#include -+#include -+ -+#include "util.h" -+ -+#define SYSFS_OPS(type) \ -+const struct sysfs_ops type ## _sysfs_ops = { \ -+ .show = type ## _show, \ -+ .store = type ## _store \ -+} -+ -+#define SHOW(fn) \ -+static ssize_t fn ## _to_text(struct printbuf *, \ -+ struct kobject *, struct attribute *); \ -+ \ -+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ -+ char *buf) \ -+{ \ -+ struct printbuf out = PRINTBUF; \ -+ ssize_t ret = fn ## _to_text(&out, kobj, attr); \ -+ \ -+ if (out.pos && out.buf[out.pos - 1] != '\n') \ -+ prt_newline(&out); \ -+ \ -+ if (!ret && out.allocation_failure) \ -+ ret = -ENOMEM; \ -+ \ -+ if (!ret) { \ -+ ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ -+ memcpy(buf, out.buf, ret); \ -+ } \ -+ printbuf_exit(&out); \ -+ return bch2_err_class(ret); \ -+} \ -+ \ -+static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ -+ struct attribute *attr) -+ -+#define STORE(fn) \ -+static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\ -+ const char *, size_t); \ -+ \ -+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ -+ const char *buf, size_t size) \ -+{ \ -+ return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \ -+} \ -+ \ -+static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\ -+ const char *buf, size_t size) -+ -+#define __sysfs_attribute(_name, _mode) \ -+ static struct attribute sysfs_##_name = \ -+ { .name = #_name, .mode = _mode } -+ -+#define write_attribute(n) __sysfs_attribute(n, 0200) -+#define read_attribute(n) __sysfs_attribute(n, 0444) -+#define rw_attribute(n) __sysfs_attribute(n, 0644) -+ -+#define sysfs_printf(file, fmt, ...) 
\ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ prt_printf(out, fmt "\n", __VA_ARGS__); \ -+} while (0) -+ -+#define sysfs_print(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ snprint(out, var); \ -+} while (0) -+ -+#define sysfs_hprint(file, val) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ prt_human_readable_s64(out, val); \ -+} while (0) -+ -+#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) -+#define var_print(_var) sysfs_print(_var, var(_var)) -+#define var_hprint(_var) sysfs_hprint(_var, var(_var)) -+ -+#define sysfs_strtoul(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoul_safe(buf, var) ?: (ssize_t) size; \ -+} while (0) -+ -+#define sysfs_strtoul_clamp(file, var, min, max) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoul_safe_clamp(buf, var, min, max) \ -+ ?: (ssize_t) size; \ -+} while (0) -+ -+#define strtoul_or_return(cp) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (_r) \ -+ return _r; \ -+ _v; \ -+}) -+ -+#define strtoul_restrict_or_return(cp, min, max) \ -+({ \ -+ unsigned long __v = 0; \ -+ int _r = strtoul_safe_restrict(cp, __v, min, max); \ -+ if (_r) \ -+ return _r; \ -+ __v; \ -+}) -+ -+#define strtoi_h_or_return(cp) \ -+({ \ -+ u64 _v; \ -+ int _r = strtoi_h(cp, &_v); \ -+ if (_r) \ -+ return _r; \ -+ _v; \ -+}) -+ -+#define sysfs_hatoi(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoi_h(buf, &var) ?: (ssize_t) size; \ -+} while (0) -+ -+write_attribute(trigger_gc); -+write_attribute(trigger_discards); -+write_attribute(trigger_invalidates); -+write_attribute(prune_cache); -+write_attribute(btree_wakeup); -+rw_attribute(btree_gc_periodic); -+rw_attribute(gc_gens_pos); -+ -+read_attribute(uuid); -+read_attribute(minor); -+read_attribute(bucket_size); -+read_attribute(first_bucket); -+read_attribute(nbuckets); -+rw_attribute(durability); -+read_attribute(iodone); -+ -+read_attribute(io_latency_read); -+read_attribute(io_latency_write); -+read_attribute(io_latency_stats_read); -+read_attribute(io_latency_stats_write); -+read_attribute(congested); -+ -+read_attribute(btree_write_stats); -+ -+read_attribute(btree_cache_size); -+read_attribute(compression_stats); -+read_attribute(journal_debug); -+read_attribute(btree_updates); -+read_attribute(btree_cache); -+read_attribute(btree_key_cache); -+read_attribute(stripes_heap); -+read_attribute(open_buckets); -+read_attribute(open_buckets_partial); -+read_attribute(write_points); -+read_attribute(nocow_lock_table); -+ -+#ifdef BCH_WRITE_REF_DEBUG -+read_attribute(write_refs); -+ -+static const char * const bch2_write_refs[] = { -+#define x(n) #n, -+ BCH_WRITE_REFS() -+#undef x -+ NULL -+}; -+ -+static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ bch2_printbuf_tabstop_push(out, 24); -+ -+ for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) { -+ prt_str(out, bch2_write_refs[i]); -+ prt_tab(out); -+ prt_printf(out, "%li", atomic_long_read(&c->writes[i])); -+ prt_newline(out); -+ } -+} -+#endif -+ -+read_attribute(internal_uuid); -+read_attribute(disk_groups); -+ -+read_attribute(has_data); -+read_attribute(alloc_debug); -+ -+#define x(t, n, ...) 
read_attribute(t); -+BCH_PERSISTENT_COUNTERS() -+#undef x -+ -+rw_attribute(discard); -+rw_attribute(label); -+ -+rw_attribute(copy_gc_enabled); -+read_attribute(copy_gc_wait); -+ -+rw_attribute(rebalance_enabled); -+sysfs_pd_controller_attribute(rebalance); -+read_attribute(rebalance_work); -+rw_attribute(promote_whole_extents); -+ -+read_attribute(new_stripes); -+ -+read_attribute(io_timers_read); -+read_attribute(io_timers_write); -+ -+read_attribute(moving_ctxts); -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+write_attribute(perf_test); -+#endif /* CONFIG_BCACHEFS_TESTS */ -+ -+#define x(_name) \ -+ static struct attribute sysfs_time_stat_##_name = \ -+ { .name = #_name, .mode = 0444 }; -+ BCH_TIME_STATS() -+#undef x -+ -+static struct attribute sysfs_state_rw = { -+ .name = "state", -+ .mode = 0444, -+}; -+ -+static size_t bch2_btree_cache_size(struct bch_fs *c) -+{ -+ size_t ret = 0; -+ struct btree *b; -+ -+ mutex_lock(&c->btree_cache.lock); -+ list_for_each_entry(b, &c->btree_cache.live, list) -+ ret += btree_bytes(c); -+ -+ mutex_unlock(&c->btree_cache.lock); -+ return ret; -+} -+ -+static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ enum btree_id id; -+ u64 nr_uncompressed_extents = 0, -+ nr_compressed_extents = 0, -+ nr_incompressible_extents = 0, -+ uncompressed_sectors = 0, -+ incompressible_sectors = 0, -+ compressed_sectors_compressed = 0, -+ compressed_sectors_uncompressed = 0; -+ int ret; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EPERM; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ if (!btree_type_has_ptrs(id)) -+ continue; -+ -+ for_each_btree_key(&trans, iter, id, POS_MIN, -+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ bool compressed = false, uncompressed = false, incompressible = false; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ switch (p.crc.compression_type) { -+ case BCH_COMPRESSION_TYPE_none: -+ uncompressed = true; -+ uncompressed_sectors += k.k->size; -+ break; -+ case BCH_COMPRESSION_TYPE_incompressible: -+ incompressible = true; -+ incompressible_sectors += k.k->size; -+ break; -+ default: -+ compressed_sectors_compressed += -+ p.crc.compressed_size; -+ compressed_sectors_uncompressed += -+ p.crc.uncompressed_size; -+ compressed = true; -+ break; -+ } -+ } -+ -+ if (incompressible) -+ nr_incompressible_extents++; -+ else if (uncompressed) -+ nr_uncompressed_extents++; -+ else if (compressed) -+ nr_compressed_extents++; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ return ret; -+ -+ prt_printf(out, "uncompressed:\n"); -+ prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents); -+ prt_printf(out, " size: "); -+ prt_human_readable_u64(out, uncompressed_sectors << 9); -+ prt_printf(out, "\n"); -+ -+ prt_printf(out, "compressed:\n"); -+ prt_printf(out, " nr extents: %llu\n", nr_compressed_extents); -+ prt_printf(out, " compressed size: "); -+ prt_human_readable_u64(out, compressed_sectors_compressed << 9); -+ prt_printf(out, "\n"); -+ prt_printf(out, " uncompressed size: "); -+ prt_human_readable_u64(out, compressed_sectors_uncompressed << 9); -+ prt_printf(out, "\n"); -+ -+ prt_printf(out, "incompressible:\n"); -+ prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents); -+ prt_printf(out, " size: "); -+ 
prt_human_readable_u64(out, incompressible_sectors << 9); -+ prt_printf(out, "\n"); -+ return 0; -+} -+ -+static void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); -+ bch2_bpos_to_text(out, c->gc_gens_pos); -+ prt_printf(out, "\n"); -+} -+ -+static void bch2_btree_wakeup_all(struct bch_fs *c) -+{ -+ struct btree_trans *trans; -+ -+ seqmutex_lock(&c->btree_trans_lock); -+ list_for_each_entry(trans, &c->btree_trans_list, list) { -+ struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); -+ -+ if (b) -+ six_lock_wakeup_all(&b->lock); -+ -+ } -+ seqmutex_unlock(&c->btree_trans_lock); -+} -+ -+SHOW(bch2_fs) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ -+ sysfs_print(minor, c->minor); -+ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); -+ -+ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); -+ -+ if (attr == &sysfs_btree_write_stats) -+ bch2_btree_write_stats_to_text(out, c); -+ -+ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); -+ -+ if (attr == &sysfs_gc_gens_pos) -+ bch2_gc_gens_pos_to_text(out, c); -+ -+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -+ -+ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); -+ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ -+ -+ if (attr == &sysfs_copy_gc_wait) -+ bch2_copygc_wait_to_text(out, c); -+ -+ if (attr == &sysfs_rebalance_work) -+ bch2_rebalance_work_to_text(out, c); -+ -+ sysfs_print(promote_whole_extents, c->promote_whole_extents); -+ -+ /* Debugging: */ -+ -+ if (attr == &sysfs_journal_debug) -+ bch2_journal_debug_to_text(out, &c->journal); -+ -+ if (attr == &sysfs_btree_updates) -+ bch2_btree_updates_to_text(out, c); -+ -+ if (attr == &sysfs_btree_cache) -+ bch2_btree_cache_to_text(out, &c->btree_cache); -+ -+ if (attr == &sysfs_btree_key_cache) -+ bch2_btree_key_cache_to_text(out, &c->btree_key_cache); -+ -+ if (attr == &sysfs_stripes_heap) -+ bch2_stripes_heap_to_text(out, c); -+ -+ if (attr == &sysfs_open_buckets) -+ bch2_open_buckets_to_text(out, c); -+ -+ if (attr == &sysfs_open_buckets_partial) -+ bch2_open_buckets_partial_to_text(out, c); -+ -+ if (attr == &sysfs_write_points) -+ bch2_write_points_to_text(out, c); -+ -+ if (attr == &sysfs_compression_stats) -+ bch2_compression_stats_to_text(out, c); -+ -+ if (attr == &sysfs_new_stripes) -+ bch2_new_stripes_to_text(out, c); -+ -+ if (attr == &sysfs_io_timers_read) -+ bch2_io_timers_to_text(out, &c->io_clock[READ]); -+ -+ if (attr == &sysfs_io_timers_write) -+ bch2_io_timers_to_text(out, &c->io_clock[WRITE]); -+ -+ if (attr == &sysfs_moving_ctxts) -+ bch2_fs_moving_ctxts_to_text(out, c); -+ -+#ifdef BCH_WRITE_REF_DEBUG -+ if (attr == &sysfs_write_refs) -+ bch2_write_refs_to_text(out, c); -+#endif -+ -+ if (attr == &sysfs_nocow_lock_table) -+ bch2_nocow_locks_to_text(out, &c->nocow_locks); -+ -+ if (attr == &sysfs_disk_groups) -+ bch2_disk_groups_to_text(out, c); -+ -+ return 0; -+} -+ -+STORE(bch2_fs) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ -+ if (attr == &sysfs_btree_gc_periodic) { -+ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) -+ ?: (ssize_t) size; -+ -+ wake_up_process(c->gc_thread); -+ return ret; -+ } -+ -+ if (attr == &sysfs_copy_gc_enabled) { -+ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) -+ ?: (ssize_t) size; -+ -+ if (c->copygc_thread) -+ wake_up_process(c->copygc_thread); -+ return ret; -+ } -+ -+ if (attr == &sysfs_rebalance_enabled) { -+ ssize_t ret = 
strtoul_safe(buf, c->rebalance.enabled) -+ ?: (ssize_t) size; -+ -+ rebalance_wakeup(c); -+ return ret; -+ } -+ -+ sysfs_pd_controller_store(rebalance, &c->rebalance.pd); -+ -+ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); -+ -+ /* Debugging: */ -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EPERM; -+ -+ /* Debugging: */ -+ -+ if (!test_bit(BCH_FS_RW, &c->flags)) -+ return -EROFS; -+ -+ if (attr == &sysfs_prune_cache) { -+ struct shrink_control sc; -+ -+ sc.gfp_mask = GFP_KERNEL; -+ sc.nr_to_scan = strtoul_or_return(buf); -+ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); -+ } -+ -+ if (attr == &sysfs_btree_wakeup) -+ bch2_btree_wakeup_all(c); -+ -+ if (attr == &sysfs_trigger_gc) { -+ /* -+ * Full gc is currently incompatible with btree key cache: -+ */ -+#if 0 -+ down_read(&c->state_lock); -+ bch2_gc(c, false, false); -+ up_read(&c->state_lock); -+#else -+ bch2_gc_gens(c); -+#endif -+ } -+ -+ if (attr == &sysfs_trigger_discards) -+ bch2_do_discards(c); -+ -+ if (attr == &sysfs_trigger_invalidates) -+ bch2_do_invalidates(c); -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+ if (attr == &sysfs_perf_test) { -+ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; -+ char *test = strsep(&p, " \t\n"); -+ char *nr_str = strsep(&p, " \t\n"); -+ char *threads_str = strsep(&p, " \t\n"); -+ unsigned threads; -+ u64 nr; -+ int ret = -EINVAL; -+ -+ if (threads_str && -+ !(ret = kstrtouint(threads_str, 10, &threads)) && -+ !(ret = bch2_strtoull_h(nr_str, &nr))) -+ ret = bch2_btree_perf_test(c, test, nr, threads); -+ kfree(tmp); -+ -+ if (ret) -+ size = ret; -+ } -+#endif -+ return size; -+} -+SYSFS_OPS(bch2_fs); -+ -+struct attribute *bch2_fs_files[] = { -+ &sysfs_minor, -+ &sysfs_btree_cache_size, -+ &sysfs_btree_write_stats, -+ -+ &sysfs_promote_whole_extents, -+ -+ &sysfs_compression_stats, -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+ &sysfs_perf_test, -+#endif -+ NULL -+}; -+ -+/* counters dir */ -+ -+SHOW(bch2_fs_counters) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj); -+ u64 counter = 0; -+ u64 counter_since_mount = 0; -+ -+ printbuf_tabstop_push(out, 32); -+ -+ #define x(t, ...) \ -+ if (attr == &sysfs_##t) { \ -+ counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ -+ counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ -+ prt_printf(out, "since mount:"); \ -+ prt_tab(out); \ -+ prt_human_readable_u64(out, counter_since_mount); \ -+ prt_newline(out); \ -+ \ -+ prt_printf(out, "since filesystem creation:"); \ -+ prt_tab(out); \ -+ prt_human_readable_u64(out, counter); \ -+ prt_newline(out); \ -+ } -+ BCH_PERSISTENT_COUNTERS() -+ #undef x -+ return 0; -+} -+ -+STORE(bch2_fs_counters) { -+ return 0; -+} -+ -+SYSFS_OPS(bch2_fs_counters); -+ -+struct attribute *bch2_fs_counters_files[] = { -+#define x(t, ...) 
\ -+ &sysfs_##t, -+ BCH_PERSISTENT_COUNTERS() -+#undef x -+ NULL -+}; -+/* internal dir - just a wrapper */ -+ -+SHOW(bch2_fs_internal) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); -+ -+ return bch2_fs_to_text(out, &c->kobj, attr); -+} -+ -+STORE(bch2_fs_internal) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); -+ -+ return bch2_fs_store(&c->kobj, attr, buf, size); -+} -+SYSFS_OPS(bch2_fs_internal); -+ -+struct attribute *bch2_fs_internal_files[] = { -+ &sysfs_journal_debug, -+ &sysfs_btree_updates, -+ &sysfs_btree_cache, -+ &sysfs_btree_key_cache, -+ &sysfs_new_stripes, -+ &sysfs_stripes_heap, -+ &sysfs_open_buckets, -+ &sysfs_open_buckets_partial, -+ &sysfs_write_points, -+#ifdef BCH_WRITE_REF_DEBUG -+ &sysfs_write_refs, -+#endif -+ &sysfs_nocow_lock_table, -+ &sysfs_io_timers_read, -+ &sysfs_io_timers_write, -+ -+ &sysfs_trigger_gc, -+ &sysfs_trigger_discards, -+ &sysfs_trigger_invalidates, -+ &sysfs_prune_cache, -+ &sysfs_btree_wakeup, -+ -+ &sysfs_gc_gens_pos, -+ -+ &sysfs_copy_gc_enabled, -+ &sysfs_copy_gc_wait, -+ -+ &sysfs_rebalance_enabled, -+ &sysfs_rebalance_work, -+ sysfs_pd_controller_files(rebalance), -+ -+ &sysfs_moving_ctxts, -+ -+ &sysfs_internal_uuid, -+ -+ &sysfs_disk_groups, -+ NULL -+}; -+ -+/* options */ -+ -+SHOW(bch2_fs_opts_dir) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -+ const struct bch_option *opt = container_of(attr, struct bch_option, attr); -+ int id = opt - bch2_opt_table; -+ u64 v = bch2_opt_get_by_id(&c->opts, id); -+ -+ bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); -+ prt_char(out, '\n'); -+ -+ return 0; -+} -+ -+STORE(bch2_fs_opts_dir) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -+ const struct bch_option *opt = container_of(attr, struct bch_option, attr); -+ int ret, id = opt - bch2_opt_table; -+ char *tmp; -+ u64 v; -+ -+ /* -+ * We don't need to take c->writes for correctness, but it eliminates an -+ * unsightly error message in the dmesg log when we're RO: -+ */ -+ if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) -+ return -EROFS; -+ -+ tmp = kstrdup(buf, GFP_KERNEL); -+ if (!tmp) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); -+ kfree(tmp); -+ -+ if (ret < 0) -+ goto err; -+ -+ ret = bch2_opt_check_may_set(c, id, v); -+ if (ret < 0) -+ goto err; -+ -+ bch2_opt_set_sb(c, opt, v); -+ bch2_opt_set_by_id(&c->opts, id, v); -+ -+ if ((id == Opt_background_target || -+ id == Opt_background_compression) && v) { -+ bch2_rebalance_add_work(c, S64_MAX); -+ rebalance_wakeup(c); -+ } -+ -+ ret = size; -+err: -+ bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); -+ return ret; -+} -+SYSFS_OPS(bch2_fs_opts_dir); -+ -+struct attribute *bch2_fs_opts_dir_files[] = { NULL }; -+ -+int bch2_opts_create_sysfs_files(struct kobject *kobj) -+{ -+ const struct bch_option *i; -+ int ret; -+ -+ for (i = bch2_opt_table; -+ i < bch2_opt_table + bch2_opts_nr; -+ i++) { -+ if (!(i->flags & OPT_FS)) -+ continue; -+ -+ ret = sysfs_create_file(kobj, &i->attr); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* time stats */ -+ -+SHOW(bch2_fs_time_stats) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); -+ -+#define x(name) \ -+ if (attr == &sysfs_time_stat_##name) \ -+ bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); -+ BCH_TIME_STATS() -+#undef x -+ -+ return 0; -+} -+ -+STORE(bch2_fs_time_stats) -+{ -+ return size; -+} -+SYSFS_OPS(bch2_fs_time_stats); -+ 
-+struct attribute *bch2_fs_time_stats_files[] = { -+#define x(name) \ -+ &sysfs_time_stat_##name, -+ BCH_TIME_STATS() -+#undef x -+ NULL -+}; -+ -+static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) -+{ -+ struct bch_fs *c = ca->fs; -+ struct bch_dev_usage stats = bch2_dev_usage_read(ca); -+ unsigned i, nr[BCH_DATA_NR]; -+ -+ memset(nr, 0, sizeof(nr)); -+ -+ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) -+ nr[c->open_buckets[i].data_type]++; -+ -+ printbuf_tabstop_push(out, 8); -+ printbuf_tabstop_push(out, 16); -+ printbuf_tabstop_push(out, 16); -+ printbuf_tabstop_push(out, 16); -+ printbuf_tabstop_push(out, 16); -+ -+ prt_tab(out); -+ prt_str(out, "buckets"); -+ prt_tab_rjust(out); -+ prt_str(out, "sectors"); -+ prt_tab_rjust(out); -+ prt_str(out, "fragmented"); -+ prt_tab_rjust(out); -+ prt_newline(out); -+ -+ for (i = 0; i < BCH_DATA_NR; i++) { -+ prt_str(out, bch2_data_types[i]); -+ prt_tab(out); -+ prt_u64(out, stats.d[i].buckets); -+ prt_tab_rjust(out); -+ prt_u64(out, stats.d[i].sectors); -+ prt_tab_rjust(out); -+ prt_u64(out, stats.d[i].fragmented); -+ prt_tab_rjust(out); -+ prt_newline(out); -+ } -+ -+ prt_str(out, "ec"); -+ prt_tab(out); -+ prt_u64(out, stats.buckets_ec); -+ prt_tab_rjust(out); -+ prt_newline(out); -+ -+ prt_newline(out); -+ -+ prt_printf(out, "reserves:"); -+ prt_newline(out); -+ for (i = 0; i < BCH_WATERMARK_NR; i++) { -+ prt_str(out, bch2_watermarks[i]); -+ prt_tab(out); -+ prt_u64(out, bch2_dev_buckets_reserved(ca, i)); -+ prt_tab_rjust(out); -+ prt_newline(out); -+ } -+ -+ prt_newline(out); -+ -+ printbuf_tabstops_reset(out); -+ printbuf_tabstop_push(out, 24); -+ -+ prt_str(out, "freelist_wait"); -+ prt_tab(out); -+ prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty"); -+ prt_newline(out); -+ -+ prt_str(out, "open buckets allocated"); -+ prt_tab(out); -+ prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); -+ prt_newline(out); -+ -+ prt_str(out, "open buckets this dev"); -+ prt_tab(out); -+ prt_u64(out, ca->nr_open_buckets); -+ prt_newline(out); -+ -+ prt_str(out, "open buckets total"); -+ prt_tab(out); -+ prt_u64(out, OPEN_BUCKETS_COUNT); -+ prt_newline(out); -+ -+ prt_str(out, "open_buckets_wait"); -+ prt_tab(out); -+ prt_str(out, c->open_buckets_wait.list.first ? 
"waiting" : "empty"); -+ prt_newline(out); -+ -+ prt_str(out, "open_buckets_btree"); -+ prt_tab(out); -+ prt_u64(out, nr[BCH_DATA_btree]); -+ prt_newline(out); -+ -+ prt_str(out, "open_buckets_user"); -+ prt_tab(out); -+ prt_u64(out, nr[BCH_DATA_user]); -+ prt_newline(out); -+ -+ prt_str(out, "buckets_to_invalidate"); -+ prt_tab(out); -+ prt_u64(out, should_invalidate_buckets(ca, stats)); -+ prt_newline(out); -+ -+ prt_str(out, "btree reserve cache"); -+ prt_tab(out); -+ prt_u64(out, c->btree_reserve_cache_nr); -+ prt_newline(out); -+} -+ -+static const char * const bch2_rw[] = { -+ "read", -+ "write", -+ NULL -+}; -+ -+static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) -+{ -+ int rw, i; -+ -+ for (rw = 0; rw < 2; rw++) { -+ prt_printf(out, "%s:\n", bch2_rw[rw]); -+ -+ for (i = 1; i < BCH_DATA_NR; i++) -+ prt_printf(out, "%-12s:%12llu\n", -+ bch2_data_types[i], -+ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); -+ } -+} -+ -+SHOW(bch2_dev) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ struct bch_fs *c = ca->fs; -+ -+ sysfs_printf(uuid, "%pU\n", ca->uuid.b); -+ -+ sysfs_print(bucket_size, bucket_bytes(ca)); -+ sysfs_print(first_bucket, ca->mi.first_bucket); -+ sysfs_print(nbuckets, ca->mi.nbuckets); -+ sysfs_print(durability, ca->mi.durability); -+ sysfs_print(discard, ca->mi.discard); -+ -+ if (attr == &sysfs_label) { -+ if (ca->mi.group) { -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(out, c->disk_sb.sb, -+ ca->mi.group - 1); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ prt_char(out, '\n'); -+ } -+ -+ if (attr == &sysfs_has_data) { -+ prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); -+ prt_char(out, '\n'); -+ } -+ -+ if (attr == &sysfs_state_rw) { -+ prt_string_option(out, bch2_member_states, ca->mi.state); -+ prt_char(out, '\n'); -+ } -+ -+ if (attr == &sysfs_iodone) -+ dev_iodone_to_text(out, ca); -+ -+ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); -+ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); -+ -+ if (attr == &sysfs_io_latency_stats_read) -+ bch2_time_stats_to_text(out, &ca->io_latency[READ]); -+ -+ if (attr == &sysfs_io_latency_stats_write) -+ bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); -+ -+ sysfs_printf(congested, "%u%%", -+ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) -+ * 100 / CONGESTED_MAX); -+ -+ if (attr == &sysfs_alloc_debug) -+ dev_alloc_debug_to_text(out, ca); -+ -+ return 0; -+} -+ -+STORE(bch2_dev) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ struct bch_fs *c = ca->fs; -+ struct bch_member *mi; -+ -+ if (attr == &sysfs_discard) { -+ bool v = strtoul_or_return(buf); -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ -+ if (v != BCH_MEMBER_DISCARD(mi)) { -+ SET_BCH_MEMBER_DISCARD(mi, v); -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ if (attr == &sysfs_durability) { -+ u64 v = strtoul_or_return(buf); -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ -+ if (v != BCH_MEMBER_DURABILITY(mi)) { -+ SET_BCH_MEMBER_DURABILITY(mi, v + 1); -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ if (attr == &sysfs_label) { -+ char *tmp; -+ int ret; -+ -+ tmp = kstrdup(buf, GFP_KERNEL); -+ if (!tmp) -+ return -ENOMEM; -+ -+ ret = bch2_dev_group_set(c, ca, strim(tmp)); -+ kfree(tmp); -+ if (ret) -+ return ret; -+ } -+ -+ return size; -+} -+SYSFS_OPS(bch2_dev); -+ -+struct attribute 
*bch2_dev_files[] = { -+ &sysfs_uuid, -+ &sysfs_bucket_size, -+ &sysfs_first_bucket, -+ &sysfs_nbuckets, -+ &sysfs_durability, -+ -+ /* settings: */ -+ &sysfs_discard, -+ &sysfs_state_rw, -+ &sysfs_label, -+ -+ &sysfs_has_data, -+ &sysfs_iodone, -+ -+ &sysfs_io_latency_read, -+ &sysfs_io_latency_write, -+ &sysfs_io_latency_stats_read, -+ &sysfs_io_latency_stats_write, -+ &sysfs_congested, -+ -+ /* debug: */ -+ &sysfs_alloc_debug, -+ NULL -+}; -+ -+#endif /* _BCACHEFS_SYSFS_H_ */ -diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h -new file mode 100644 -index 000000000..222cd5062 ---- /dev/null -+++ b/fs/bcachefs/sysfs.h -@@ -0,0 +1,48 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SYSFS_H_ -+#define _BCACHEFS_SYSFS_H_ -+ -+#include -+ -+#ifndef NO_BCACHEFS_SYSFS -+ -+struct attribute; -+struct sysfs_ops; -+ -+extern struct attribute *bch2_fs_files[]; -+extern struct attribute *bch2_fs_counters_files[]; -+extern struct attribute *bch2_fs_internal_files[]; -+extern struct attribute *bch2_fs_opts_dir_files[]; -+extern struct attribute *bch2_fs_time_stats_files[]; -+extern struct attribute *bch2_dev_files[]; -+ -+extern const struct sysfs_ops bch2_fs_sysfs_ops; -+extern const struct sysfs_ops bch2_fs_counters_sysfs_ops; -+extern const struct sysfs_ops bch2_fs_internal_sysfs_ops; -+extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -+extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -+extern const struct sysfs_ops bch2_dev_sysfs_ops; -+ -+int bch2_opts_create_sysfs_files(struct kobject *); -+ -+#else -+ -+static struct attribute *bch2_fs_files[] = {}; -+static struct attribute *bch2_fs_counters_files[] = {}; -+static struct attribute *bch2_fs_internal_files[] = {}; -+static struct attribute *bch2_fs_opts_dir_files[] = {}; -+static struct attribute *bch2_fs_time_stats_files[] = {}; -+static struct attribute *bch2_dev_files[] = {}; -+ -+static const struct sysfs_ops bch2_fs_sysfs_ops; -+static const struct sysfs_ops bch2_fs_counters_sysfs_ops; -+static const struct sysfs_ops bch2_fs_internal_sysfs_ops; -+static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -+static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -+static const struct sysfs_ops bch2_dev_sysfs_ops; -+ -+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } -+ -+#endif /* NO_BCACHEFS_SYSFS */ -+ -+#endif /* _BCACHEFS_SYSFS_H_ */ -diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c -new file mode 100644 -index 000000000..72389c737 ---- /dev/null -+++ b/fs/bcachefs/tests.c -@@ -0,0 +1,970 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifdef CONFIG_BCACHEFS_TESTS -+ -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "journal_reclaim.h" -+#include "snapshot.h" -+#include "tests.h" -+ -+#include "linux/kthread.h" -+#include "linux/random.h" -+ -+static void delete_test_keys(struct bch_fs *c) -+{ -+ int ret; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_extents, -+ SPOS(0, 0, U32_MAX), -+ POS(0, U64_MAX), -+ 0, NULL); -+ BUG_ON(ret); -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), -+ POS(0, U64_MAX), -+ 0, NULL); -+ BUG_ON(ret); -+} -+ -+/* unit tests */ -+ -+static int test_delete(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_i_cookie k; -+ int ret; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.snapshot = U32_MAX; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, -+ BTREE_ITER_INTENT); -+ -+ ret = commit_do(&trans, 
NULL, NULL, 0, -+ bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(&trans, &iter, &k.k_i, 0)); -+ if (ret) { -+ bch_err_msg(c, ret, "update error"); -+ goto err; -+ } -+ -+ pr_info("deleting once"); -+ ret = commit_do(&trans, NULL, NULL, 0, -+ bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_delete_at(&trans, &iter, 0)); -+ if (ret) { -+ bch_err_msg(c, ret, "delete error (first)"); -+ goto err; -+ } -+ -+ pr_info("deleting twice"); -+ ret = commit_do(&trans, NULL, NULL, 0, -+ bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_delete_at(&trans, &iter, 0)); -+ if (ret) { -+ bch_err_msg(c, ret, "delete error (second)"); -+ goto err; -+ } -+err: -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int test_delete_written(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_i_cookie k; -+ int ret; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.snapshot = U32_MAX; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, -+ BTREE_ITER_INTENT); -+ -+ ret = commit_do(&trans, NULL, NULL, 0, -+ bch2_btree_iter_traverse(&iter) ?: -+ bch2_trans_update(&trans, &iter, &k.k_i, 0)); -+ if (ret) { -+ bch_err_msg(c, ret, "update error"); -+ goto err; -+ } -+ -+ bch2_trans_unlock(&trans); -+ bch2_journal_flush_all_pins(&c->journal); -+ -+ ret = commit_do(&trans, NULL, NULL, 0, -+ bch2_btree_iter_traverse(&iter) ?: -+ bch2_btree_delete_at(&trans, &iter, 0)); -+ if (ret) { -+ bch_err_msg(c, ret, "delete error"); -+ goto err; -+ } -+err: -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int test_iterate(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter = { NULL }; -+ struct bkey_s_c k; -+ u64 i; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i++) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i; -+ k.k.p.snapshot = U32_MAX; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, -+ NULL, NULL, 0); -+ if (ret) { -+ bch_err_msg(c, ret, "insert error"); -+ goto err; -+ } -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), -+ 0, k, ({ -+ BUG_ON(k.k->p.offset != i++); -+ 0; -+ })); -+ if (ret) { -+ bch_err_msg(c, ret, "error iterating forwards"); -+ goto err; -+ } -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating backwards"); -+ -+ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, U64_MAX, U32_MAX), 0, k, -+ ({ -+ BUG_ON(k.k->p.offset != --i); -+ 0; -+ })); -+ if (ret) { -+ bch_err_msg(c, ret, "error iterating backwards"); -+ goto err; -+ } -+ -+ BUG_ON(i); -+err: -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int test_iterate_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter = { NULL }; -+ struct bkey_s_c k; -+ u64 i; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test extents"); -+ -+ for (i = 0; i < nr; i += 8) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i + 8; -+ k.k.p.snapshot = U32_MAX; -+ k.k.size = 8; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, -+ NULL, NULL, 0); -+ if (ret) { -+ bch_err_msg(c, ret, "insert error"); -+ 
goto err; -+ } -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, -+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), -+ 0, k, ({ -+ BUG_ON(bkey_start_offset(k.k) != i); -+ i = k.k->p.offset; -+ 0; -+ })); -+ if (ret) { -+ bch_err_msg(c, ret, "error iterating forwards"); -+ goto err; -+ } -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating backwards"); -+ -+ ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_extents, -+ SPOS(0, U64_MAX, U32_MAX), 0, k, -+ ({ -+ BUG_ON(k.k->p.offset != i); -+ i = bkey_start_offset(k.k); -+ 0; -+ })); -+ if (ret) { -+ bch_err_msg(c, ret, "error iterating backwards"); -+ goto err; -+ } -+ -+ BUG_ON(i); -+err: -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int test_iterate_slots(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter = { NULL }; -+ struct bkey_s_c k; -+ u64 i; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i++) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i * 2; -+ k.k.p.snapshot = U32_MAX; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, -+ NULL, NULL, 0); -+ if (ret) { -+ bch_err_msg(c, ret, "insert error"); -+ goto err; -+ } -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), -+ 0, k, ({ -+ BUG_ON(k.k->p.offset != i); -+ i += 2; -+ 0; -+ })); -+ if (ret) { -+ bch_err_msg(c, ret, "error iterating forwards"); -+ goto err; -+ } -+ -+ BUG_ON(i != nr * 2); -+ -+ pr_info("iterating forwards by slots"); -+ -+ i = 0; -+ -+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), -+ BTREE_ITER_SLOTS, k, ({ -+ if (i >= nr * 2) -+ break; -+ -+ BUG_ON(k.k->p.offset != i); -+ BUG_ON(bkey_deleted(k.k) != (i & 1)); -+ -+ i++; -+ 0; -+ })); -+ if (ret < 0) { -+ bch_err_msg(c, ret, "error iterating forwards by slots"); -+ goto err; -+ } -+ ret = 0; -+err: -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter = { NULL }; -+ struct bkey_s_c k; -+ u64 i; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i += 16) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i + 16; -+ k.k.p.snapshot = U32_MAX; -+ k.k.size = 8; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, -+ NULL, NULL, 0); -+ if (ret) { -+ bch_err_msg(c, ret, "insert error"); -+ goto err; -+ } -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, -+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), -+ 0, k, ({ -+ BUG_ON(bkey_start_offset(k.k) != i + 8); -+ BUG_ON(k.k->size != 8); -+ i += 16; -+ 0; -+ })); -+ if (ret) { -+ bch_err_msg(c, ret, "error iterating forwards"); -+ goto err; -+ } -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating forwards by slots"); -+ -+ i = 0; -+ -+ ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, -+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), -+ BTREE_ITER_SLOTS, k, ({ -+ if (i == nr) -+ break; -+ BUG_ON(bkey_deleted(k.k) != !(i % 16)); -+ -+ BUG_ON(bkey_start_offset(k.k) != i); -+ BUG_ON(k.k->size != 8); -+ i = 
k.k->p.offset; -+ 0; -+ })); -+ if (ret) { -+ bch_err_msg(c, ret, "error iterating forwards by slots"); -+ goto err; -+ } -+ ret = 0; -+err: -+ bch2_trans_exit(&trans); -+ return 0; -+} -+ -+/* -+ * XXX: we really want to make sure we've got a btree with depth > 0 for these -+ * tests -+ */ -+static int test_peek_end(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), 0); -+ -+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); -+ BUG_ON(k.k); -+ -+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); -+ BUG_ON(k.k); -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return 0; -+} -+ -+static int test_peek_end_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, -+ SPOS(0, 0, U32_MAX), 0); -+ -+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); -+ BUG_ON(k.k); -+ -+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); -+ BUG_ON(k.k); -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return 0; -+} -+ -+/* extent unit tests */ -+ -+static u64 test_version; -+ -+static int insert_test_extent(struct bch_fs *c, -+ u64 start, u64 end) -+{ -+ struct bkey_i_cookie k; -+ int ret; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k_i.k.p.offset = end; -+ k.k_i.k.p.snapshot = U32_MAX; -+ k.k_i.k.size = end - start; -+ k.k_i.k.version.lo = test_version++; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, -+ NULL, NULL, 0); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int __test_extent_overwrite(struct bch_fs *c, -+ u64 e1_start, u64 e1_end, -+ u64 e2_start, u64 e2_end) -+{ -+ int ret; -+ -+ ret = insert_test_extent(c, e1_start, e1_end) ?: -+ insert_test_extent(c, e2_start, e2_end); -+ -+ delete_test_keys(c); -+ return ret; -+} -+ -+static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) -+{ -+ return __test_extent_overwrite(c, 0, 64, 0, 32) ?: -+ __test_extent_overwrite(c, 8, 64, 0, 32); -+} -+ -+static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) -+{ -+ return __test_extent_overwrite(c, 0, 64, 32, 64) ?: -+ __test_extent_overwrite(c, 0, 64, 32, 72); -+} -+ -+static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) -+{ -+ return __test_extent_overwrite(c, 0, 64, 32, 40); -+} -+ -+static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) -+{ -+ return __test_extent_overwrite(c, 32, 64, 0, 64) ?: -+ __test_extent_overwrite(c, 32, 64, 0, 128) ?: -+ __test_extent_overwrite(c, 32, 64, 32, 64) ?: -+ __test_extent_overwrite(c, 32, 64, 32, 128); -+} -+ -+static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start, u32 len, u32 snapid) -+{ -+ struct bkey_i_cookie k; -+ int ret; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k_i.k.p.inode = inum; -+ k.k_i.k.p.offset = start + len; -+ k.k_i.k.p.snapshot = snapid; -+ k.k_i.k.size = len; -+ -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ bch2_btree_insert_nonextent(&trans, BTREE_ID_extents, &k.k_i, -+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)); -+ if (ret) -+ bch_err_fn(c, ret); -+ return ret; -+} -+ -+static int test_extent_create_overlapping(struct bch_fs *c, u64 inum) -+{ -+ return 
insert_test_overlapping_extent(c, inum, 0, 16, U32_MAX - 2) ?: /* overwrite entire */ -+ insert_test_overlapping_extent(c, inum, 2, 8, U32_MAX - 2) ?: -+ insert_test_overlapping_extent(c, inum, 4, 4, U32_MAX) ?: -+ insert_test_overlapping_extent(c, inum, 32, 8, U32_MAX - 2) ?: /* overwrite front/back */ -+ insert_test_overlapping_extent(c, inum, 36, 8, U32_MAX) ?: -+ insert_test_overlapping_extent(c, inum, 60, 8, U32_MAX - 2) ?: -+ insert_test_overlapping_extent(c, inum, 64, 8, U32_MAX); -+} -+ -+/* snapshot unit tests */ -+ -+/* Test skipping over keys in unrelated snapshots: */ -+static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_i_cookie cookie; -+ int ret; -+ -+ bkey_cookie_init(&cookie.k_i); -+ cookie.k.p.snapshot = snapid_hi; -+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, -+ NULL, NULL, 0); -+ if (ret) -+ return ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, -+ SPOS(0, 0, snapid_lo), 0); -+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); -+ -+ BUG_ON(k.k->p.snapshot != U32_MAX); -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int test_snapshots(struct bch_fs *c, u64 nr) -+{ -+ struct bkey_i_cookie cookie; -+ u32 snapids[2]; -+ u32 snapid_subvols[2] = { 1, 1 }; -+ int ret; -+ -+ bkey_cookie_init(&cookie.k_i); -+ cookie.k.p.snapshot = U32_MAX; -+ ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, -+ NULL, NULL, 0); -+ if (ret) -+ return ret; -+ -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ bch2_snapshot_node_create(&trans, U32_MAX, -+ snapids, -+ snapid_subvols, -+ 2)); -+ if (ret) -+ return ret; -+ -+ if (snapids[0] > snapids[1]) -+ swap(snapids[0], snapids[1]); -+ -+ ret = test_snapshot_filter(c, snapids[0], snapids[1]); -+ if (ret) { -+ bch_err_msg(c, ret, "from test_snapshot_filter"); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* perf tests */ -+ -+static u64 test_rand(void) -+{ -+ u64 v; -+ -+ get_random_bytes(&v, sizeof(v)); -+ return v; -+} -+ -+static int rand_insert(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct bkey_i_cookie k; -+ int ret = 0; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = test_rand(); -+ k.k.p.snapshot = U32_MAX; -+ -+ ret = commit_do(&trans, NULL, NULL, 0, -+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0)); -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int rand_insert_multi(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct bkey_i_cookie k[8]; -+ int ret = 0; -+ unsigned j; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i += ARRAY_SIZE(k)) { -+ for (j = 0; j < ARRAY_SIZE(k); j++) { -+ bkey_cookie_init(&k[j].k_i); -+ k[j].k.p.offset = test_rand(); -+ k[j].k.p.snapshot = U32_MAX; -+ } -+ -+ ret = commit_do(&trans, NULL, NULL, 0, -+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: -+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: -+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: -+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: -+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: -+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: -+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 
0) ?: -+ __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0)); -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int rand_lookup(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), 0); -+ -+ for (i = 0; i < nr; i++) { -+ bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); -+ -+ lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int rand_mixed_trans(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i_cookie *cookie, -+ u64 i, u64 pos) -+{ -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); -+ -+ k = bch2_btree_iter_peek(iter); -+ ret = bkey_err(k); -+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ bch_err_msg(trans->c, ret, "lookup error"); -+ if (ret) -+ return ret; -+ -+ if (!(i & 3) && k.k) { -+ bkey_cookie_init(&cookie->k_i); -+ cookie->k.p = iter->pos; -+ ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); -+ } -+ -+ return ret; -+} -+ -+static int rand_mixed(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_i_cookie cookie; -+ int ret = 0; -+ u64 i, rand; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), 0); -+ -+ for (i = 0; i < nr; i++) { -+ rand = test_rand(); -+ ret = commit_do(&trans, NULL, NULL, 0, -+ rand_mixed_trans(&trans, &iter, &cookie, i, rand)); -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_iter_exit(&trans, &iter); -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int __do_delete(struct btree_trans *trans, struct bpos pos) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, -+ BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (!k.k) -+ goto err; -+ -+ ret = bch2_btree_delete_at(trans, &iter, 0); -+err: -+ bch2_trans_iter_exit(trans, &iter); -+ return ret; -+} -+ -+static int rand_delete(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ int ret = 0; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ struct bpos pos = SPOS(0, test_rand(), U32_MAX); -+ -+ ret = commit_do(&trans, NULL, NULL, 0, -+ __do_delete(&trans, pos)); -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+static int seq_insert(struct bch_fs *c, u64 nr) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bkey_i_cookie insert; -+ -+ bkey_cookie_init(&insert.k_i); -+ -+ return bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, -+ NULL, NULL, 0, ({ -+ if (iter.pos.offset >= nr) -+ break; -+ insert.k.p = iter.pos; -+ bch2_trans_update(&trans, &iter, &insert.k_i, 0); -+ }))); -+} -+ -+static int seq_lookup(struct bch_fs *c, u64 nr) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ -+ return bch2_trans_run(c, -+ for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX), -+ 0, k, -+ 0)); -+} -+ -+static int 
seq_overwrite(struct bch_fs *c, u64 nr) -+{ -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ -+ return bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), -+ BTREE_ITER_INTENT, k, -+ NULL, NULL, 0, ({ -+ struct bkey_i_cookie u; -+ -+ bkey_reassemble(&u.k_i, k); -+ bch2_trans_update(&trans, &iter, &u.k_i, 0); -+ }))); -+} -+ -+static int seq_delete(struct bch_fs *c, u64 nr) -+{ -+ return bch2_btree_delete_range(c, BTREE_ID_xattrs, -+ SPOS(0, 0, U32_MAX), -+ POS(0, U64_MAX), -+ 0, NULL); -+} -+ -+typedef int (*perf_test_fn)(struct bch_fs *, u64); -+ -+struct test_job { -+ struct bch_fs *c; -+ u64 nr; -+ unsigned nr_threads; -+ perf_test_fn fn; -+ -+ atomic_t ready; -+ wait_queue_head_t ready_wait; -+ -+ atomic_t done; -+ struct completion done_completion; -+ -+ u64 start; -+ u64 finish; -+ int ret; -+}; -+ -+static int btree_perf_test_thread(void *data) -+{ -+ struct test_job *j = data; -+ int ret; -+ -+ if (atomic_dec_and_test(&j->ready)) { -+ wake_up(&j->ready_wait); -+ j->start = sched_clock(); -+ } else { -+ wait_event(j->ready_wait, !atomic_read(&j->ready)); -+ } -+ -+ ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); -+ if (ret) { -+ bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); -+ j->ret = ret; -+ } -+ -+ if (atomic_dec_and_test(&j->done)) { -+ j->finish = sched_clock(); -+ complete(&j->done_completion); -+ } -+ -+ return 0; -+} -+ -+int bch2_btree_perf_test(struct bch_fs *c, const char *testname, -+ u64 nr, unsigned nr_threads) -+{ -+ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; -+ char name_buf[20]; -+ struct printbuf nr_buf = PRINTBUF; -+ struct printbuf per_sec_buf = PRINTBUF; -+ unsigned i; -+ u64 time; -+ -+ atomic_set(&j.ready, nr_threads); -+ init_waitqueue_head(&j.ready_wait); -+ -+ atomic_set(&j.done, nr_threads); -+ init_completion(&j.done_completion); -+ -+#define perf_test(_test) \ -+ if (!strcmp(testname, #_test)) j.fn = _test -+ -+ perf_test(rand_insert); -+ perf_test(rand_insert_multi); -+ perf_test(rand_lookup); -+ perf_test(rand_mixed); -+ perf_test(rand_delete); -+ -+ perf_test(seq_insert); -+ perf_test(seq_lookup); -+ perf_test(seq_overwrite); -+ perf_test(seq_delete); -+ -+ /* a unit test, not a perf test: */ -+ perf_test(test_delete); -+ perf_test(test_delete_written); -+ perf_test(test_iterate); -+ perf_test(test_iterate_extents); -+ perf_test(test_iterate_slots); -+ perf_test(test_iterate_slots_extents); -+ perf_test(test_peek_end); -+ perf_test(test_peek_end_extents); -+ -+ perf_test(test_extent_overwrite_front); -+ perf_test(test_extent_overwrite_back); -+ perf_test(test_extent_overwrite_middle); -+ perf_test(test_extent_overwrite_all); -+ perf_test(test_extent_create_overlapping); -+ -+ perf_test(test_snapshots); -+ -+ if (!j.fn) { -+ pr_err("unknown test %s", testname); -+ return -EINVAL; -+ } -+ -+ //pr_info("running test %s:", testname); -+ -+ if (nr_threads == 1) -+ btree_perf_test_thread(&j); -+ else -+ for (i = 0; i < nr_threads; i++) -+ kthread_run(btree_perf_test_thread, &j, -+ "bcachefs perf test[%u]", i); -+ -+ while (wait_for_completion_interruptible(&j.done_completion)) -+ ; -+ -+ time = j.finish - j.start; -+ -+ scnprintf(name_buf, sizeof(name_buf), "%s:", testname); -+ prt_human_readable_u64(&nr_buf, nr); -+ prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); -+ printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", -+ name_buf, nr_buf.buf, nr_threads, -+ div_u64(time, NSEC_PER_SEC), -+ 
div_u64(time * nr_threads, nr),
-+		per_sec_buf.buf);
-+	printbuf_exit(&per_sec_buf);
-+	printbuf_exit(&nr_buf);
-+	return j.ret;
-+}
-+
-+#endif /* CONFIG_BCACHEFS_TESTS */
-diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h
-new file mode 100644
-index 000000000..c73b18aea
---- /dev/null
-+++ b/fs/bcachefs/tests.h
-@@ -0,0 +1,15 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+#ifndef _BCACHEFS_TEST_H
-+#define _BCACHEFS_TEST_H
-+
-+struct bch_fs;
-+
-+#ifdef CONFIG_BCACHEFS_TESTS
-+
-+int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned);
-+
-+#else
-+
-+#endif /* CONFIG_BCACHEFS_TESTS */
-+
-+#endif /* _BCACHEFS_TEST_H */
-diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c
-new file mode 100644
-index 000000000..33efa6005
---- /dev/null
-+++ b/fs/bcachefs/trace.c
-@@ -0,0 +1,16 @@
-+// SPDX-License-Identifier: GPL-2.0
-+#include "bcachefs.h"
-+#include "alloc_types.h"
-+#include "buckets.h"
-+#include "btree_cache.h"
-+#include "btree_iter.h"
-+#include "btree_locking.h"
-+#include "btree_update_interior.h"
-+#include "keylist.h"
-+#include "opts.h"
-+#include "six.h"
-+
-+#include 
-+
-+#define CREATE_TRACE_POINTS
-+#include "trace.h"
-diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
-new file mode 100644
-index 000000000..97fe77423
---- /dev/null
-+++ b/fs/bcachefs/trace.h
-@@ -0,0 +1,1265 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+#undef TRACE_SYSTEM
-+#define TRACE_SYSTEM bcachefs
-+
-+#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ)
-+#define _TRACE_BCACHEFS_H
-+
-+#include 
-+
-+#define TRACE_BPOS_entries(name)	\
-+	__field(u64, name##_inode )	\
-+	__field(u64, name##_offset )	\
-+	__field(u32, name##_snapshot )
-+
-+#define TRACE_BPOS_assign(dst, src)	\
-+	__entry->dst##_inode = (src).inode;	\
-+	__entry->dst##_offset = (src).offset;	\
-+	__entry->dst##_snapshot = (src).snapshot
-+
-+DECLARE_EVENT_CLASS(bpos,
-+	TP_PROTO(const struct bpos *p),
-+	TP_ARGS(p),
-+
-+	TP_STRUCT__entry(
-+		TRACE_BPOS_entries(p)
-+	),
-+
-+	TP_fast_assign(
-+		TRACE_BPOS_assign(p, *p);
-+	),
-+
-+	TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
-+);
-+
-+DECLARE_EVENT_CLASS(bkey,
-+	TP_PROTO(struct bch_fs *c, const char *k),
-+	TP_ARGS(c, k),
-+
-+	TP_STRUCT__entry(
-+		__string(k, k )
-+	),
-+
-+	TP_fast_assign(
-+		__assign_str(k, k);
-+	),
-+
-+	TP_printk("%s", __get_str(k))
-+);
-+
-+DECLARE_EVENT_CLASS(btree_node,
-+	TP_PROTO(struct bch_fs *c, struct btree *b),
-+	TP_ARGS(c, b),
-+
-+	TP_STRUCT__entry(
-+		__field(dev_t, dev )
-+		__field(u8, level )
-+		__field(u8, btree_id )
-+		TRACE_BPOS_entries(pos)
-+	),
-+
-+	TP_fast_assign(
-+		__entry->dev = c->dev;
-+		__entry->level = b->c.level;
-+		__entry->btree_id = b->c.btree_id;
-+		TRACE_BPOS_assign(pos, b->key.k.p);
-+	),
-+
-+	TP_printk("%d,%d %u %s %llu:%llu:%u",
-+		  MAJOR(__entry->dev), MINOR(__entry->dev),
-+		  __entry->level,
-+		  bch2_btree_ids[__entry->btree_id],
-+		  __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
-+);
-+
-+DECLARE_EVENT_CLASS(bch_fs,
-+	TP_PROTO(struct bch_fs *c),
-+	TP_ARGS(c),
-+
-+	TP_STRUCT__entry(
-+		__field(dev_t, dev )
-+	),
-+
-+	TP_fast_assign(
-+		__entry->dev = c->dev;
-+	),
-+
-+	TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
-+);
-+
-+DECLARE_EVENT_CLASS(bio,
-+	TP_PROTO(struct bio *bio),
-+	TP_ARGS(bio),
-+
-+	TP_STRUCT__entry(
-+		__field(dev_t, dev )
-+		__field(sector_t, sector )
-+		__field(unsigned int, nr_sector )
-+		__array(char, rwbs, 6 )
-+	),
-+
-+	TP_fast_assign(
-+		__entry->dev = bio->bi_bdev ?
bio_dev(bio) : 0; -+ __entry->sector = bio->bi_iter.bi_sector; -+ __entry->nr_sector = bio->bi_iter.bi_size >> 9; -+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf); -+ ), -+ -+ TP_printk("%d,%d %s %llu + %u", -+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, -+ (unsigned long long)__entry->sector, __entry->nr_sector) -+); -+ -+/* super-io.c: */ -+TRACE_EVENT(write_super, -+ TP_PROTO(struct bch_fs *c, unsigned long ip), -+ TP_ARGS(c, ip), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(unsigned long, ip ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->ip = ip; -+ ), -+ -+ TP_printk("%d,%d for %pS", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ (void *) __entry->ip) -+); -+ -+/* io.c: */ -+ -+DEFINE_EVENT(bio, read_promote, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, read_bounce, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, read_split, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, read_retry, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, read_reuse_race, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+/* Journal */ -+ -+DEFINE_EVENT(bch_fs, journal_full, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, journal_entry_full, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bio, journal_write, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+TRACE_EVENT(journal_reclaim_start, -+ TP_PROTO(struct bch_fs *c, bool direct, bool kicked, -+ u64 min_nr, u64 min_key_cache, -+ u64 prereserved, u64 prereserved_total, -+ u64 btree_cache_dirty, u64 btree_cache_total, -+ u64 btree_key_cache_dirty, u64 btree_key_cache_total), -+ TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, -+ btree_cache_dirty, btree_cache_total, -+ btree_key_cache_dirty, btree_key_cache_total), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(bool, direct ) -+ __field(bool, kicked ) -+ __field(u64, min_nr ) -+ __field(u64, min_key_cache ) -+ __field(u64, prereserved ) -+ __field(u64, prereserved_total ) -+ __field(u64, btree_cache_dirty ) -+ __field(u64, btree_cache_total ) -+ __field(u64, btree_key_cache_dirty ) -+ __field(u64, btree_key_cache_total ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->direct = direct; -+ __entry->kicked = kicked; -+ __entry->min_nr = min_nr; -+ __entry->min_key_cache = min_key_cache; -+ __entry->prereserved = prereserved; -+ __entry->prereserved_total = prereserved_total; -+ __entry->btree_cache_dirty = btree_cache_dirty; -+ __entry->btree_cache_total = btree_cache_total; -+ __entry->btree_key_cache_dirty = btree_key_cache_dirty; -+ __entry->btree_key_cache_total = btree_key_cache_total; -+ ), -+ -+ TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->direct, -+ __entry->kicked, -+ __entry->min_nr, -+ __entry->min_key_cache, -+ __entry->prereserved, -+ __entry->prereserved_total, -+ __entry->btree_cache_dirty, -+ __entry->btree_cache_total, -+ __entry->btree_key_cache_dirty, -+ __entry->btree_key_cache_total) -+); -+ -+TRACE_EVENT(journal_reclaim_finish, -+ TP_PROTO(struct bch_fs *c, u64 nr_flushed), -+ TP_ARGS(c, nr_flushed), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(u64, nr_flushed ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->nr_flushed = nr_flushed; -+ ), -+ -+ 
TP_printk("%d,%d flushed %llu", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->nr_flushed) -+); -+ -+/* bset.c: */ -+ -+DEFINE_EVENT(bpos, bkey_pack_pos_fail, -+ TP_PROTO(const struct bpos *p), -+ TP_ARGS(p) -+); -+ -+/* Btree cache: */ -+ -+TRACE_EVENT(btree_cache_scan, -+ TP_PROTO(long nr_to_scan, long can_free, long ret), -+ TP_ARGS(nr_to_scan, can_free, ret), -+ -+ TP_STRUCT__entry( -+ __field(long, nr_to_scan ) -+ __field(long, can_free ) -+ __field(long, ret ) -+ ), -+ -+ TP_fast_assign( -+ __entry->nr_to_scan = nr_to_scan; -+ __entry->can_free = can_free; -+ __entry->ret = ret; -+ ), -+ -+ TP_printk("scanned for %li nodes, can free %li, ret %li", -+ __entry->nr_to_scan, __entry->can_free, __entry->ret) -+); -+ -+DEFINE_EVENT(btree_node, btree_cache_reap, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, btree_cache_cannibalize, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+/* Btree */ -+ -+DEFINE_EVENT(btree_node, btree_node_read, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+TRACE_EVENT(btree_node_write, -+ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), -+ TP_ARGS(b, bytes, sectors), -+ -+ TP_STRUCT__entry( -+ __field(enum btree_node_type, type) -+ __field(unsigned, bytes ) -+ __field(unsigned, sectors ) -+ ), -+ -+ TP_fast_assign( -+ __entry->type = btree_node_type(b); -+ __entry->bytes = bytes; -+ __entry->sectors = sectors; -+ ), -+ -+ TP_printk("bkey type %u bytes %u sectors %u", -+ __entry->type , __entry->bytes, __entry->sectors) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_alloc, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_free, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+TRACE_EVENT(btree_reserve_get_fail, -+ TP_PROTO(const char *trans_fn, -+ unsigned long caller_ip, -+ size_t required, -+ int ret), -+ TP_ARGS(trans_fn, caller_ip, required, ret), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __field(unsigned long, caller_ip ) -+ __field(size_t, required ) -+ __array(char, ret, 32 ) -+ ), -+ -+ TP_fast_assign( -+ strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); -+ __entry->caller_ip = caller_ip; -+ __entry->required = required; -+ strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); -+ ), -+ -+ TP_printk("%s %pS required %zu ret %s", -+ __entry->trans_fn, -+ (void *) __entry->caller_ip, -+ __entry->required, -+ __entry->ret) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_compact, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_merge, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_split, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_rewrite, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_set_root, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+TRACE_EVENT(btree_path_relock_fail, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long 
caller_ip, -+ struct btree_path *path, -+ unsigned level), -+ TP_ARGS(trans, caller_ip, path, level), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __field(unsigned long, caller_ip ) -+ __field(u8, btree_id ) -+ __field(u8, level ) -+ TRACE_BPOS_entries(pos) -+ __array(char, node, 24 ) -+ __field(u8, self_read_count ) -+ __field(u8, self_intent_count) -+ __field(u8, read_count ) -+ __field(u8, intent_count ) -+ __field(u32, iter_lock_seq ) -+ __field(u32, node_lock_seq ) -+ ), -+ -+ TP_fast_assign( -+ struct btree *b = btree_path_node(path, level); -+ struct six_lock_count c; -+ -+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ __entry->caller_ip = caller_ip; -+ __entry->btree_id = path->btree_id; -+ __entry->level = path->level; -+ TRACE_BPOS_assign(pos, path->pos); -+ -+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), -+ __entry->self_read_count = c.n[SIX_LOCK_read]; -+ __entry->self_intent_count = c.n[SIX_LOCK_intent]; -+ -+ if (IS_ERR(b)) { -+ strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); -+ } else { -+ c = six_lock_counts(&path->l[level].b->c.lock); -+ __entry->read_count = c.n[SIX_LOCK_read]; -+ __entry->intent_count = c.n[SIX_LOCK_intent]; -+ scnprintf(__entry->node, sizeof(__entry->node), "%px", b); -+ } -+ __entry->iter_lock_seq = path->l[level].lock_seq; -+ __entry->node_lock_seq = is_btree_node(path, level) -+ ? six_lock_seq(&path->l[level].b->c.lock) -+ : 0; -+ ), -+ -+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s held %u:%u lock count %u:%u iter seq %u lock seq %u", -+ __entry->trans_fn, -+ (void *) __entry->caller_ip, -+ bch2_btree_ids[__entry->btree_id], -+ __entry->pos_inode, -+ __entry->pos_offset, -+ __entry->pos_snapshot, -+ __entry->level, -+ __entry->node, -+ __entry->self_read_count, -+ __entry->self_intent_count, -+ __entry->read_count, -+ __entry->intent_count, -+ __entry->iter_lock_seq, -+ __entry->node_lock_seq) -+); -+ -+TRACE_EVENT(btree_path_upgrade_fail, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path, -+ unsigned level), -+ TP_ARGS(trans, caller_ip, path, level), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __field(unsigned long, caller_ip ) -+ __field(u8, btree_id ) -+ __field(u8, level ) -+ TRACE_BPOS_entries(pos) -+ __field(u8, locked ) -+ __field(u8, self_read_count ) -+ __field(u8, self_intent_count) -+ __field(u8, read_count ) -+ __field(u8, intent_count ) -+ __field(u32, iter_lock_seq ) -+ __field(u32, node_lock_seq ) -+ ), -+ -+ TP_fast_assign( -+ struct six_lock_count c; -+ -+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ __entry->caller_ip = caller_ip; -+ __entry->btree_id = path->btree_id; -+ __entry->level = level; -+ TRACE_BPOS_assign(pos, path->pos); -+ __entry->locked = btree_node_locked(path, level); -+ -+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), -+ __entry->self_read_count = c.n[SIX_LOCK_read]; -+ __entry->self_intent_count = c.n[SIX_LOCK_intent]; -+ c = six_lock_counts(&path->l[level].b->c.lock); -+ __entry->read_count = c.n[SIX_LOCK_read]; -+ __entry->intent_count = c.n[SIX_LOCK_intent]; -+ __entry->iter_lock_seq = path->l[level].lock_seq; -+ __entry->node_lock_seq = is_btree_node(path, level) -+ ? 
six_lock_seq(&path->l[level].b->c.lock) -+ : 0; -+ ), -+ -+ TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", -+ __entry->trans_fn, -+ (void *) __entry->caller_ip, -+ bch2_btree_ids[__entry->btree_id], -+ __entry->pos_inode, -+ __entry->pos_offset, -+ __entry->pos_snapshot, -+ __entry->level, -+ __entry->locked, -+ __entry->self_read_count, -+ __entry->self_intent_count, -+ __entry->read_count, -+ __entry->intent_count, -+ __entry->iter_lock_seq, -+ __entry->node_lock_seq) -+); -+ -+/* Garbage collection */ -+ -+DEFINE_EVENT(bch_fs, gc_gens_start, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_gens_end, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+/* Allocator */ -+ -+DECLARE_EVENT_CLASS(bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, -+ u64 bucket, -+ u64 free, -+ u64 avail, -+ u64 copygc_wait_amount, -+ s64 copygc_waiting_for, -+ struct bucket_alloc_state *s, -+ bool nonblocking, -+ const char *err), -+ TP_ARGS(ca, alloc_reserve, bucket, free, avail, -+ copygc_wait_amount, copygc_waiting_for, -+ s, nonblocking, err), -+ -+ TP_STRUCT__entry( -+ __field(u8, dev ) -+ __array(char, reserve, 16 ) -+ __field(u64, bucket ) -+ __field(u64, free ) -+ __field(u64, avail ) -+ __field(u64, copygc_wait_amount ) -+ __field(s64, copygc_waiting_for ) -+ __field(u64, seen ) -+ __field(u64, open ) -+ __field(u64, need_journal_commit ) -+ __field(u64, nouse ) -+ __field(bool, nonblocking ) -+ __field(u64, nocow ) -+ __array(char, err, 32 ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = ca->dev_idx; -+ strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); -+ __entry->bucket = bucket; -+ __entry->free = free; -+ __entry->avail = avail; -+ __entry->copygc_wait_amount = copygc_wait_amount; -+ __entry->copygc_waiting_for = copygc_waiting_for; -+ __entry->seen = s->buckets_seen; -+ __entry->open = s->skipped_open; -+ __entry->need_journal_commit = s->skipped_need_journal_commit; -+ __entry->nouse = s->skipped_nouse; -+ __entry->nonblocking = nonblocking; -+ __entry->nocow = s->skipped_nocow; -+ strscpy(__entry->err, err, sizeof(__entry->err)); -+ ), -+ -+ TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", -+ __entry->reserve, -+ __entry->dev, -+ __entry->bucket, -+ __entry->free, -+ __entry->avail, -+ __entry->copygc_wait_amount, -+ __entry->copygc_waiting_for, -+ __entry->seen, -+ __entry->open, -+ __entry->need_journal_commit, -+ __entry->nouse, -+ __entry->nocow, -+ __entry->nonblocking, -+ __entry->err) -+); -+ -+DEFINE_EVENT(bucket_alloc, bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, -+ u64 bucket, -+ u64 free, -+ u64 avail, -+ u64 copygc_wait_amount, -+ s64 copygc_waiting_for, -+ struct bucket_alloc_state *s, -+ bool nonblocking, -+ const char *err), -+ TP_ARGS(ca, alloc_reserve, bucket, free, avail, -+ copygc_wait_amount, copygc_waiting_for, -+ s, nonblocking, err) -+); -+ -+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, -+ TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, -+ u64 bucket, -+ u64 free, -+ u64 avail, -+ u64 copygc_wait_amount, -+ s64 copygc_waiting_for, -+ struct bucket_alloc_state *s, -+ bool nonblocking, -+ const char *err), -+ TP_ARGS(ca, alloc_reserve, bucket, free, avail, -+ copygc_wait_amount, copygc_waiting_for, -+ s, nonblocking, err) -+); -+ -+TRACE_EVENT(discard_buckets, -+ 
TP_PROTO(struct bch_fs *c, u64 seen, u64 open, -+ u64 need_journal_commit, u64 discarded, const char *err), -+ TP_ARGS(c, seen, open, need_journal_commit, discarded, err), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(u64, seen ) -+ __field(u64, open ) -+ __field(u64, need_journal_commit ) -+ __field(u64, discarded ) -+ __array(char, err, 16 ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->seen = seen; -+ __entry->open = open; -+ __entry->need_journal_commit = need_journal_commit; -+ __entry->discarded = discarded; -+ strscpy(__entry->err, err, sizeof(__entry->err)); -+ ), -+ -+ TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->seen, -+ __entry->open, -+ __entry->need_journal_commit, -+ __entry->discarded, -+ __entry->err) -+); -+ -+TRACE_EVENT(bucket_invalidate, -+ TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), -+ TP_ARGS(c, dev, bucket, sectors), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(u32, dev_idx ) -+ __field(u32, sectors ) -+ __field(u64, bucket ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->dev_idx = dev; -+ __entry->sectors = sectors; -+ __entry->bucket = bucket; -+ ), -+ -+ TP_printk("%d:%d invalidated %u:%llu cached sectors %u", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->dev_idx, __entry->bucket, -+ __entry->sectors) -+); -+ -+/* Moving IO */ -+ -+TRACE_EVENT(bucket_evacuate, -+ TP_PROTO(struct bch_fs *c, struct bpos *bucket), -+ TP_ARGS(c, bucket), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(u32, dev_idx ) -+ __field(u64, bucket ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->dev_idx = bucket->inode; -+ __entry->bucket = bucket->offset; -+ ), -+ -+ TP_printk("%d:%d %u:%llu", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->dev_idx, __entry->bucket) -+); -+ -+DEFINE_EVENT(bkey, move_extent, -+ TP_PROTO(struct bch_fs *c, const char *k), -+ TP_ARGS(c, k) -+); -+ -+DEFINE_EVENT(bkey, move_extent_read, -+ TP_PROTO(struct bch_fs *c, const char *k), -+ TP_ARGS(c, k) -+); -+ -+DEFINE_EVENT(bkey, move_extent_write, -+ TP_PROTO(struct bch_fs *c, const char *k), -+ TP_ARGS(c, k) -+); -+ -+DEFINE_EVENT(bkey, move_extent_finish, -+ TP_PROTO(struct bch_fs *c, const char *k), -+ TP_ARGS(c, k) -+); -+ -+TRACE_EVENT(move_extent_fail, -+ TP_PROTO(struct bch_fs *c, const char *msg), -+ TP_ARGS(c, msg), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __string(msg, msg ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __assign_str(msg, msg); -+ ), -+ -+ TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) -+); -+ -+DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, -+ TP_PROTO(struct bch_fs *c, const char *k), -+ TP_ARGS(c, k) -+); -+ -+TRACE_EVENT(move_data, -+ TP_PROTO(struct bch_fs *c, u64 sectors_moved, -+ u64 keys_moved), -+ TP_ARGS(c, sectors_moved, keys_moved), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(u64, sectors_moved ) -+ __field(u64, keys_moved ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->sectors_moved = sectors_moved; -+ __entry->keys_moved = keys_moved; -+ ), -+ -+ TP_printk("%d,%d sectors_moved %llu keys_moved %llu", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->sectors_moved, __entry->keys_moved) -+); -+ -+TRACE_EVENT(evacuate_bucket, -+ TP_PROTO(struct bch_fs *c, struct bpos *bucket, -+ unsigned sectors, unsigned bucket_size, -+ u64 fragmentation, int ret), -+ 
TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(u64, member ) -+ __field(u64, bucket ) -+ __field(u32, sectors ) -+ __field(u32, bucket_size ) -+ __field(u64, fragmentation ) -+ __field(int, ret ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->member = bucket->inode; -+ __entry->bucket = bucket->offset; -+ __entry->sectors = sectors; -+ __entry->bucket_size = bucket_size; -+ __entry->fragmentation = fragmentation; -+ __entry->ret = ret; -+ ), -+ -+ TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->member, __entry->bucket, -+ __entry->sectors, __entry->bucket_size, -+ __entry->fragmentation, __entry->ret) -+); -+ -+TRACE_EVENT(copygc, -+ TP_PROTO(struct bch_fs *c, -+ u64 sectors_moved, u64 sectors_not_moved, -+ u64 buckets_moved, u64 buckets_not_moved), -+ TP_ARGS(c, -+ sectors_moved, sectors_not_moved, -+ buckets_moved, buckets_not_moved), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(u64, sectors_moved ) -+ __field(u64, sectors_not_moved ) -+ __field(u64, buckets_moved ) -+ __field(u64, buckets_not_moved ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->sectors_moved = sectors_moved; -+ __entry->sectors_not_moved = sectors_not_moved; -+ __entry->buckets_moved = buckets_moved; -+ __entry->buckets_not_moved = buckets_moved; -+ ), -+ -+ TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->sectors_moved, __entry->sectors_not_moved, -+ __entry->buckets_moved, __entry->buckets_not_moved) -+); -+ -+TRACE_EVENT(copygc_wait, -+ TP_PROTO(struct bch_fs *c, -+ u64 wait_amount, u64 until), -+ TP_ARGS(c, wait_amount, until), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(u64, wait_amount ) -+ __field(u64, until ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = c->dev; -+ __entry->wait_amount = wait_amount; -+ __entry->until = until; -+ ), -+ -+ TP_printk("%d,%u waiting for %llu sectors until %llu", -+ MAJOR(__entry->dev), MINOR(__entry->dev), -+ __entry->wait_amount, __entry->until) -+); -+ -+/* btree transactions: */ -+ -+DECLARE_EVENT_CLASS(transaction_event, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __field(unsigned long, caller_ip ) -+ ), -+ -+ TP_fast_assign( -+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ __entry->caller_ip = caller_ip; -+ ), -+ -+ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) -+); -+ -+DEFINE_EVENT(transaction_event, transaction_commit, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ -+DEFINE_EVENT(transaction_event, trans_restart_injected, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ -+TRACE_EVENT(trans_restart_split_race, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree *b), -+ TP_ARGS(trans, caller_ip, b), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __field(unsigned long, caller_ip ) -+ __field(u8, level ) -+ __field(u16, written ) -+ __field(u16, blocks ) -+ __field(u16, u64s_remaining ) -+ ), -+ -+ TP_fast_assign( -+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ __entry->caller_ip = caller_ip; -+ __entry->level = b->c.level; -+ 
__entry->written = b->written; -+ __entry->blocks = btree_blocks(trans->c); -+ __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b); -+ ), -+ -+ TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", -+ __entry->trans_fn, (void *) __entry->caller_ip, -+ __entry->level, -+ __entry->written, __entry->blocks, -+ __entry->u64s_remaining) -+); -+ -+DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ -+TRACE_EVENT(trans_restart_journal_preres_get, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ unsigned flags), -+ TP_ARGS(trans, caller_ip, flags), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __field(unsigned long, caller_ip ) -+ __field(unsigned, flags ) -+ ), -+ -+ TP_fast_assign( -+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ __entry->caller_ip = caller_ip; -+ __entry->flags = flags; -+ ), -+ -+ TP_printk("%s %pS %x", __entry->trans_fn, -+ (void *) __entry->caller_ip, -+ __entry->flags) -+); -+ -+DEFINE_EVENT(transaction_event, trans_restart_fault_inject, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ -+DEFINE_EVENT(transaction_event, trans_traverse_all, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ -+DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ -+DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ -+DECLARE_EVENT_CLASS(transaction_restart_iter, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __field(unsigned long, caller_ip ) -+ __field(u8, btree_id ) -+ TRACE_BPOS_entries(pos) -+ ), -+ -+ TP_fast_assign( -+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ __entry->caller_ip = caller_ip; -+ __entry->btree_id = path->btree_id; -+ TRACE_BPOS_assign(pos, path->pos) -+ ), -+ -+ TP_printk("%s %pS btree %s pos %llu:%llu:%u", -+ __entry->trans_fn, -+ (void *) __entry->caller_ip, -+ bch2_btree_ids[__entry->btree_id], -+ __entry->pos_inode, -+ __entry->pos_offset, -+ __entry->pos_snapshot) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path) -+); -+ -+TRACE_EVENT(trans_restart_upgrade, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path, -+ unsigned old_locks_want, -+ unsigned new_locks_want), -+ TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __field(unsigned long, caller_ip ) -+ __field(u8, btree_id ) -+ __field(u8, old_locks_want ) -+ __field(u8, new_locks_want ) -+ TRACE_BPOS_entries(pos) -+ ), -+ -+ TP_fast_assign( -+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ __entry->caller_ip = caller_ip; 
-+ __entry->btree_id = path->btree_id; -+ __entry->old_locks_want = old_locks_want; -+ __entry->new_locks_want = new_locks_want; -+ TRACE_BPOS_assign(pos, path->pos) -+ ), -+ -+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u", -+ __entry->trans_fn, -+ (void *) __entry->caller_ip, -+ bch2_btree_ids[__entry->btree_id], -+ __entry->pos_inode, -+ __entry->pos_offset, -+ __entry->pos_snapshot, -+ __entry->old_locks_want, -+ __entry->new_locks_want) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path) -+); -+ -+DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path) -+); -+ -+DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path), -+ TP_ARGS(trans, caller_ip, path) -+); -+ -+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ -+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ -+TRACE_EVENT(trans_restart_would_deadlock_write, -+ TP_PROTO(struct btree_trans *trans), -+ TP_ARGS(trans), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ ), -+ -+ TP_fast_assign( -+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ ), -+ -+ TP_printk("%s", __entry->trans_fn) -+); -+ -+TRACE_EVENT(trans_restart_mem_realloced, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ unsigned long bytes), -+ TP_ARGS(trans, caller_ip, bytes), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __field(unsigned long, caller_ip ) -+ __field(unsigned long, bytes ) -+ ), -+ -+ TP_fast_assign( -+ 
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ __entry->caller_ip = caller_ip; -+ __entry->bytes = bytes; -+ ), -+ -+ TP_printk("%s %pS bytes %lu", -+ __entry->trans_fn, -+ (void *) __entry->caller_ip, -+ __entry->bytes) -+); -+ -+TRACE_EVENT(trans_restart_key_cache_key_realloced, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip, -+ struct btree_path *path, -+ unsigned old_u64s, -+ unsigned new_u64s), -+ TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), -+ -+ TP_STRUCT__entry( -+ __array(char, trans_fn, 32 ) -+ __field(unsigned long, caller_ip ) -+ __field(enum btree_id, btree_id ) -+ TRACE_BPOS_entries(pos) -+ __field(u32, old_u64s ) -+ __field(u32, new_u64s ) -+ ), -+ -+ TP_fast_assign( -+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); -+ __entry->caller_ip = caller_ip; -+ -+ __entry->btree_id = path->btree_id; -+ TRACE_BPOS_assign(pos, path->pos); -+ __entry->old_u64s = old_u64s; -+ __entry->new_u64s = new_u64s; -+ ), -+ -+ TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", -+ __entry->trans_fn, -+ (void *) __entry->caller_ip, -+ bch2_btree_ids[__entry->btree_id], -+ __entry->pos_inode, -+ __entry->pos_offset, -+ __entry->pos_snapshot, -+ __entry->old_u64s, -+ __entry->new_u64s) -+); -+ -+DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, -+ TP_PROTO(struct btree_trans *trans, -+ unsigned long caller_ip), -+ TP_ARGS(trans, caller_ip) -+); -+ -+TRACE_EVENT(write_buffer_flush, -+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size), -+ TP_ARGS(trans, nr, skipped, fast, size), -+ -+ TP_STRUCT__entry( -+ __field(size_t, nr ) -+ __field(size_t, skipped ) -+ __field(size_t, fast ) -+ __field(size_t, size ) -+ ), -+ -+ TP_fast_assign( -+ __entry->nr = nr; -+ __entry->skipped = skipped; -+ __entry->fast = fast; -+ __entry->size = size; -+ ), -+ -+ TP_printk("%zu/%zu skipped %zu fast %zu", -+ __entry->nr, __entry->size, __entry->skipped, __entry->fast) -+); -+ -+TRACE_EVENT(write_buffer_flush_slowpath, -+ TP_PROTO(struct btree_trans *trans, size_t nr, size_t size), -+ TP_ARGS(trans, nr, size), -+ -+ TP_STRUCT__entry( -+ __field(size_t, nr ) -+ __field(size_t, size ) -+ ), -+ -+ TP_fast_assign( -+ __entry->nr = nr; -+ __entry->size = size; -+ ), -+ -+ TP_printk("%zu/%zu", __entry->nr, __entry->size) -+); -+ -+#endif /* _TRACE_BCACHEFS_H */ -+ -+/* This part must be outside protection */ -+#undef TRACE_INCLUDE_PATH -+#define TRACE_INCLUDE_PATH ../../fs/bcachefs -+ -+#undef TRACE_INCLUDE_FILE -+#define TRACE_INCLUDE_FILE trace -+ -+#include -diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c -new file mode 100644 -index 000000000..9764c2e6a ---- /dev/null -+++ b/fs/bcachefs/two_state_shared_lock.c -@@ -0,0 +1,8 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "two_state_shared_lock.h" -+ -+void __bch2_two_state_lock(two_state_lock_t *lock, int s) -+{ -+ __wait_event(lock->wait, bch2_two_state_trylock(lock, s)); -+} -diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h -new file mode 100644 -index 000000000..905801772 ---- /dev/null -+++ b/fs/bcachefs/two_state_shared_lock.h -@@ -0,0 +1,59 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_TWO_STATE_LOCK_H -+#define _BCACHEFS_TWO_STATE_LOCK_H -+ -+#include -+#include -+#include -+ -+#include "util.h" -+ -+/* -+ * Two-state lock - can be taken for add or block - both states are shared, -+ * like read side of rwsem, but conflict with 
other state: -+ */ -+typedef struct { -+ atomic_long_t v; -+ wait_queue_head_t wait; -+} two_state_lock_t; -+ -+static inline void two_state_lock_init(two_state_lock_t *lock) -+{ -+ atomic_long_set(&lock->v, 0); -+ init_waitqueue_head(&lock->wait); -+} -+ -+static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s) -+{ -+ long i = s ? 1 : -1; -+ -+ EBUG_ON(atomic_long_read(&lock->v) == 0); -+ -+ if (atomic_long_sub_return_release(i, &lock->v) == 0) -+ wake_up_all(&lock->wait); -+} -+ -+static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s) -+{ -+ long i = s ? 1 : -1; -+ long v = atomic_long_read(&lock->v), old; -+ -+ do { -+ old = v; -+ -+ if (i > 0 ? v < 0 : v > 0) -+ return false; -+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v, -+ old, old + i)) != old); -+ return true; -+} -+ -+void __bch2_two_state_lock(two_state_lock_t *, int); -+ -+static inline void bch2_two_state_lock(two_state_lock_t *lock, int s) -+{ -+ if (!bch2_two_state_trylock(lock, s)) -+ __bch2_two_state_lock(lock, s); -+} -+ -+#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ -diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c -new file mode 100644 -index 000000000..636f1fa42 ---- /dev/null -+++ b/fs/bcachefs/util.c -@@ -0,0 +1,1144 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * random utiility code, for bcache but in theory not specific to bcache -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "eytzinger.h" -+#include "util.h" -+ -+static const char si_units[] = "?kMGTPEZY"; -+ -+/* string_get_size units: */ -+static const char *const units_2[] = { -+ "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" -+}; -+static const char *const units_10[] = { -+ "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" -+}; -+ -+static int parse_u64(const char *cp, u64 *res) -+{ -+ const char *start = cp; -+ u64 v = 0; -+ -+ if (!isdigit(*cp)) -+ return -EINVAL; -+ -+ do { -+ if (v > U64_MAX / 10) -+ return -ERANGE; -+ v *= 10; -+ if (v > U64_MAX - (*cp - '0')) -+ return -ERANGE; -+ v += *cp - '0'; -+ cp++; -+ } while (isdigit(*cp)); -+ -+ *res = v; -+ return cp - start; -+} -+ -+static int bch2_pow(u64 n, u64 p, u64 *res) -+{ -+ *res = 1; -+ -+ while (p--) { -+ if (*res > div_u64(U64_MAX, n)) -+ return -ERANGE; -+ *res *= n; -+ } -+ return 0; -+} -+ -+static int parse_unit_suffix(const char *cp, u64 *res) -+{ -+ const char *start = cp; -+ u64 base = 1024; -+ unsigned u; -+ int ret; -+ -+ if (*cp == ' ') -+ cp++; -+ -+ for (u = 1; u < strlen(si_units); u++) -+ if (*cp == si_units[u]) { -+ cp++; -+ goto got_unit; -+ } -+ -+ for (u = 0; u < ARRAY_SIZE(units_2); u++) -+ if (!strncmp(cp, units_2[u], strlen(units_2[u]))) { -+ cp += strlen(units_2[u]); -+ goto got_unit; -+ } -+ -+ for (u = 0; u < ARRAY_SIZE(units_10); u++) -+ if (!strncmp(cp, units_10[u], strlen(units_10[u]))) { -+ cp += strlen(units_10[u]); -+ base = 1000; -+ goto got_unit; -+ } -+ -+ *res = 1; -+ return 0; -+got_unit: -+ ret = bch2_pow(base, u, res); -+ if (ret) -+ return ret; -+ -+ return cp - start; -+} -+ -+#define parse_or_ret(cp, _f) \ -+do { \ -+ int ret = _f; \ -+ if (ret < 0) \ -+ return ret; \ -+ cp += ret; \ -+} while (0) -+ -+static int __bch2_strtou64_h(const char *cp, u64 *res) -+{ -+ const char *start = cp; -+ u64 v = 0, b, f_n = 0, f_d = 1; -+ int ret; -+ -+ parse_or_ret(cp, 
parse_u64(cp, &v)); -+ -+ if (*cp == '.') { -+ cp++; -+ ret = parse_u64(cp, &f_n); -+ if (ret < 0) -+ return ret; -+ cp += ret; -+ -+ ret = bch2_pow(10, ret, &f_d); -+ if (ret) -+ return ret; -+ } -+ -+ parse_or_ret(cp, parse_unit_suffix(cp, &b)); -+ -+ if (v > div_u64(U64_MAX, b)) -+ return -ERANGE; -+ v *= b; -+ -+ if (f_n > div_u64(U64_MAX, b)) -+ return -ERANGE; -+ -+ f_n = div_u64(f_n * b, f_d); -+ if (v + f_n < v) -+ return -ERANGE; -+ v += f_n; -+ -+ *res = v; -+ return cp - start; -+} -+ -+static int __bch2_strtoh(const char *cp, u64 *res, -+ u64 t_max, bool t_signed) -+{ -+ bool positive = *cp != '-'; -+ u64 v = 0; -+ -+ if (*cp == '+' || *cp == '-') -+ cp++; -+ -+ parse_or_ret(cp, __bch2_strtou64_h(cp, &v)); -+ -+ if (*cp == '\n') -+ cp++; -+ if (*cp) -+ return -EINVAL; -+ -+ if (positive) { -+ if (v > t_max) -+ return -ERANGE; -+ } else { -+ if (v && !t_signed) -+ return -ERANGE; -+ -+ if (v > t_max + 1) -+ return -ERANGE; -+ v = -v; -+ } -+ -+ *res = v; -+ return 0; -+} -+ -+#define STRTO_H(name, type) \ -+int bch2_ ## name ## _h(const char *cp, type *res) \ -+{ \ -+ u64 v = 0; \ -+ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ -+ ANYSINT_MAX(type) != ((type) ~0ULL)); \ -+ *res = v; \ -+ return ret; \ -+} -+ -+STRTO_H(strtoint, int) -+STRTO_H(strtouint, unsigned int) -+STRTO_H(strtoll, long long) -+STRTO_H(strtoull, unsigned long long) -+STRTO_H(strtou64, u64) -+ -+u64 bch2_read_flag_list(char *opt, const char * const list[]) -+{ -+ u64 ret = 0; -+ char *p, *s, *d = kstrdup(opt, GFP_KERNEL); -+ -+ if (!d) -+ return -ENOMEM; -+ -+ s = strim(d); -+ -+ while ((p = strsep(&s, ","))) { -+ int flag = match_string(list, -1, p); -+ -+ if (flag < 0) { -+ ret = -1; -+ break; -+ } -+ -+ ret |= 1 << flag; -+ } -+ -+ kfree(d); -+ -+ return ret; -+} -+ -+bool bch2_is_zero(const void *_p, size_t n) -+{ -+ const char *p = _p; -+ size_t i; -+ -+ for (i = 0; i < n; i++) -+ if (p[i]) -+ return false; -+ return true; -+} -+ -+void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) -+{ -+ while (nr_bits) -+ prt_char(out, '0' + ((v >> --nr_bits) & 1)); -+} -+ -+void bch2_print_string_as_lines(const char *prefix, const char *lines) -+{ -+ const char *p; -+ -+ if (!lines) { -+ printk("%s (null)\n", prefix); -+ return; -+ } -+ -+ console_lock(); -+ while (1) { -+ p = strchrnul(lines, '\n'); -+ printk("%s%.*s\n", prefix, (int) (p - lines), lines); -+ if (!*p) -+ break; -+ lines = p + 1; -+ } -+ console_unlock(); -+} -+ -+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) -+{ -+#ifdef CONFIG_STACKTRACE -+ unsigned nr_entries = 0; -+ int ret = 0; -+ -+ stack->nr = 0; -+ ret = darray_make_room(stack, 32); -+ if (ret) -+ return ret; -+ -+ if (!down_read_trylock(&task->signal->exec_update_lock)) -+ return -1; -+ -+ do { -+ nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0); -+ } while (nr_entries == stack->size && -+ !(ret = darray_make_room(stack, stack->size * 2))); -+ -+ stack->nr = nr_entries; -+ up_read(&task->signal->exec_update_lock); -+ -+ return ret; -+#else -+ return 0; -+#endif -+} -+ -+void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) -+{ -+ unsigned long *i; -+ -+ darray_for_each(*stack, i) { -+ prt_printf(out, "[<0>] %pB", (void *) *i); -+ prt_newline(out); -+ } -+} -+ -+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task) -+{ -+ bch_stacktrace stack = { 0 }; -+ int ret = bch2_save_backtrace(&stack, task); -+ -+ bch2_prt_backtrace(out, &stack); -+ darray_exit(&stack); -+ return ret; -+} -+ 
-+/* time stats: */ -+ -+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -+static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) -+{ -+ unsigned i = 0; -+ -+ while (i < ARRAY_SIZE(q->entries)) { -+ struct bch2_quantile_entry *e = q->entries + i; -+ -+ if (unlikely(!e->step)) { -+ e->m = v; -+ e->step = max_t(unsigned, v / 2, 1024); -+ } else if (e->m > v) { -+ e->m = e->m >= e->step -+ ? e->m - e->step -+ : 0; -+ } else if (e->m < v) { -+ e->m = e->m + e->step > e->m -+ ? e->m + e->step -+ : U32_MAX; -+ } -+ -+ if ((e->m > v ? e->m - v : v - e->m) < e->step) -+ e->step = max_t(unsigned, e->step / 2, 1); -+ -+ if (v >= e->m) -+ break; -+ -+ i = eytzinger0_child(i, v > e->m); -+ } -+} -+ -+static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, -+ u64 start, u64 end) -+{ -+ u64 duration, freq; -+ -+ if (time_after64(end, start)) { -+ duration = end - start; -+ mean_and_variance_update(&stats->duration_stats, duration); -+ mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); -+ stats->max_duration = max(stats->max_duration, duration); -+ stats->min_duration = min(stats->min_duration, duration); -+ bch2_quantiles_update(&stats->quantiles, duration); -+ } -+ -+ if (time_after64(end, stats->last_event)) { -+ freq = end - stats->last_event; -+ mean_and_variance_update(&stats->freq_stats, freq); -+ mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); -+ stats->max_freq = max(stats->max_freq, freq); -+ stats->min_freq = min(stats->min_freq, freq); -+ stats->last_event = end; -+ } -+} -+ -+static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, -+ struct bch2_time_stat_buffer *b) -+{ -+ struct bch2_time_stat_buffer_entry *i; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&stats->lock, flags); -+ for (i = b->entries; -+ i < b->entries + ARRAY_SIZE(b->entries); -+ i++) -+ bch2_time_stats_update_one(stats, i->start, i->end); -+ spin_unlock_irqrestore(&stats->lock, flags); -+ -+ b->nr = 0; -+} -+ -+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) -+{ -+ unsigned long flags; -+ -+ WARN_RATELIMIT(!stats->min_duration || !stats->min_freq, -+ "time_stats: min_duration = %llu, min_freq = %llu", -+ stats->min_duration, stats->min_freq); -+ -+ if (!stats->buffer) { -+ spin_lock_irqsave(&stats->lock, flags); -+ bch2_time_stats_update_one(stats, start, end); -+ -+ if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && -+ stats->duration_stats.n > 1024) -+ stats->buffer = -+ alloc_percpu_gfp(struct bch2_time_stat_buffer, -+ GFP_ATOMIC); -+ spin_unlock_irqrestore(&stats->lock, flags); -+ } else { -+ struct bch2_time_stat_buffer *b; -+ -+ preempt_disable(); -+ b = this_cpu_ptr(stats->buffer); -+ -+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); -+ b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { -+ .start = start, -+ .end = end -+ }; -+ -+ if (unlikely(b->nr == ARRAY_SIZE(b->entries))) -+ bch2_time_stats_clear_buffer(stats, b); -+ preempt_enable(); -+ } -+} -+#endif -+ -+static const struct time_unit { -+ const char *name; -+ u64 nsecs; -+} time_units[] = { -+ { "ns", 1 }, -+ { "us", NSEC_PER_USEC }, -+ { "ms", NSEC_PER_MSEC }, -+ { "s", NSEC_PER_SEC }, -+ { "m", (u64) NSEC_PER_SEC * 60}, -+ { "h", (u64) NSEC_PER_SEC * 3600}, -+ { "eon", U64_MAX }, -+}; -+ -+static const struct time_unit *pick_time_units(u64 ns) -+{ -+ const struct time_unit *u; -+ -+ for (u = time_units; -+ u + 1 < time_units + ARRAY_SIZE(time_units) && -+ ns >= u[1].nsecs << 1; -+ u++) -+ ; -+ -+ 
return u; -+} -+ -+void bch2_pr_time_units(struct printbuf *out, u64 ns) -+{ -+ const struct time_unit *u = pick_time_units(ns); -+ -+ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); -+} -+ -+static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) -+{ -+ const struct time_unit *u = pick_time_units(ns); -+ -+ prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); -+ prt_tab_rjust(out); -+ prt_printf(out, "%s", u->name); -+} -+ -+#define TABSTOP_SIZE 12 -+ -+static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) -+{ -+ prt_str(out, name); -+ prt_tab(out); -+ bch2_pr_time_units_aligned(out, ns); -+ prt_newline(out); -+} -+ -+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) -+{ -+ const struct time_unit *u; -+ s64 f_mean = 0, d_mean = 0; -+ u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; -+ int i; -+ /* -+ * avoid divide by zero -+ */ -+ if (stats->freq_stats.n) { -+ f_mean = mean_and_variance_get_mean(stats->freq_stats); -+ f_stddev = mean_and_variance_get_stddev(stats->freq_stats); -+ d_mean = mean_and_variance_get_mean(stats->duration_stats); -+ d_stddev = mean_and_variance_get_stddev(stats->duration_stats); -+ } -+ -+ printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); -+ prt_printf(out, "count:"); -+ prt_tab(out); -+ prt_printf(out, "%llu ", -+ stats->duration_stats.n); -+ printbuf_tabstop_pop(out); -+ prt_newline(out); -+ -+ printbuf_tabstops_reset(out); -+ -+ printbuf_tabstop_push(out, out->indent + 20); -+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2); -+ printbuf_tabstop_push(out, 0); -+ printbuf_tabstop_push(out, TABSTOP_SIZE + 2); -+ -+ prt_tab(out); -+ prt_printf(out, "since mount"); -+ prt_tab_rjust(out); -+ prt_tab(out); -+ prt_printf(out, "recent"); -+ prt_tab_rjust(out); -+ prt_newline(out); -+ -+ printbuf_tabstops_reset(out); -+ printbuf_tabstop_push(out, out->indent + 20); -+ printbuf_tabstop_push(out, TABSTOP_SIZE); -+ printbuf_tabstop_push(out, 2); -+ printbuf_tabstop_push(out, TABSTOP_SIZE); -+ -+ prt_printf(out, "duration of events"); -+ prt_newline(out); -+ printbuf_indent_add(out, 2); -+ -+ pr_name_and_units(out, "min:", stats->min_duration); -+ pr_name_and_units(out, "max:", stats->max_duration); -+ -+ prt_printf(out, "mean:"); -+ prt_tab(out); -+ bch2_pr_time_units_aligned(out, d_mean); -+ prt_tab(out); -+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); -+ prt_newline(out); -+ -+ prt_printf(out, "stddev:"); -+ prt_tab(out); -+ bch2_pr_time_units_aligned(out, d_stddev); -+ prt_tab(out); -+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); -+ -+ printbuf_indent_sub(out, 2); -+ prt_newline(out); -+ -+ prt_printf(out, "time between events"); -+ prt_newline(out); -+ printbuf_indent_add(out, 2); -+ -+ pr_name_and_units(out, "min:", stats->min_freq); -+ pr_name_and_units(out, "max:", stats->max_freq); -+ -+ prt_printf(out, "mean:"); -+ prt_tab(out); -+ bch2_pr_time_units_aligned(out, f_mean); -+ prt_tab(out); -+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); -+ prt_newline(out); -+ -+ prt_printf(out, "stddev:"); -+ prt_tab(out); -+ bch2_pr_time_units_aligned(out, f_stddev); -+ prt_tab(out); -+ bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); -+ -+ printbuf_indent_sub(out, 2); -+ prt_newline(out); -+ -+ printbuf_tabstops_reset(out); -+ -+ i = eytzinger0_first(NR_QUANTILES); -+ u = 
pick_time_units(stats->quantiles.entries[i].m); -+ -+ prt_printf(out, "quantiles (%s):\t", u->name); -+ eytzinger0_for_each(i, NR_QUANTILES) { -+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; -+ -+ q = max(stats->quantiles.entries[i].m, last_q); -+ prt_printf(out, "%llu ", -+ div_u64(q, u->nsecs)); -+ if (is_last) -+ prt_newline(out); -+ last_q = q; -+ } -+} -+ -+void bch2_time_stats_exit(struct bch2_time_stats *stats) -+{ -+ free_percpu(stats->buffer); -+} -+ -+void bch2_time_stats_init(struct bch2_time_stats *stats) -+{ -+ memset(stats, 0, sizeof(*stats)); -+ stats->duration_stats_weighted.weight = 8; -+ stats->freq_stats_weighted.weight = 8; -+ stats->min_duration = U64_MAX; -+ stats->min_freq = U64_MAX; -+ spin_lock_init(&stats->lock); -+} -+ -+/* ratelimit: */ -+ -+/** -+ * bch2_ratelimit_delay() - return how long to delay until the next time to do -+ * some work -+ * -+ * @d - the struct bch_ratelimit to update -+ * -+ * Returns the amount of time to delay by, in jiffies -+ */ -+u64 bch2_ratelimit_delay(struct bch_ratelimit *d) -+{ -+ u64 now = local_clock(); -+ -+ return time_after64(d->next, now) -+ ? nsecs_to_jiffies(d->next - now) -+ : 0; -+} -+ -+/** -+ * bch2_ratelimit_increment() - increment @d by the amount of work done -+ * -+ * @d - the struct bch_ratelimit to update -+ * @done - the amount of work done, in arbitrary units -+ */ -+void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) -+{ -+ u64 now = local_clock(); -+ -+ d->next += div_u64(done * NSEC_PER_SEC, d->rate); -+ -+ if (time_before64(now + NSEC_PER_SEC, d->next)) -+ d->next = now + NSEC_PER_SEC; -+ -+ if (time_after64(now - NSEC_PER_SEC * 2, d->next)) -+ d->next = now - NSEC_PER_SEC * 2; -+} -+ -+/* pd controller: */ -+ -+/* -+ * Updates pd_controller. Attempts to scale inputed values to units per second. -+ * @target: desired value -+ * @actual: current value -+ * -+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing -+ * it makes actual go down. 
-+ */ -+void bch2_pd_controller_update(struct bch_pd_controller *pd, -+ s64 target, s64 actual, int sign) -+{ -+ s64 proportional, derivative, change; -+ -+ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; -+ -+ if (seconds_since_update == 0) -+ return; -+ -+ pd->last_update = jiffies; -+ -+ proportional = actual - target; -+ proportional *= seconds_since_update; -+ proportional = div_s64(proportional, pd->p_term_inverse); -+ -+ derivative = actual - pd->last_actual; -+ derivative = div_s64(derivative, seconds_since_update); -+ derivative = ewma_add(pd->smoothed_derivative, derivative, -+ (pd->d_term / seconds_since_update) ?: 1); -+ derivative = derivative * pd->d_term; -+ derivative = div_s64(derivative, pd->p_term_inverse); -+ -+ change = proportional + derivative; -+ -+ /* Don't increase rate if not keeping up */ -+ if (change > 0 && -+ pd->backpressure && -+ time_after64(local_clock(), -+ pd->rate.next + NSEC_PER_MSEC)) -+ change = 0; -+ -+ change *= (sign * -1); -+ -+ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, -+ 1, UINT_MAX); -+ -+ pd->last_actual = actual; -+ pd->last_derivative = derivative; -+ pd->last_proportional = proportional; -+ pd->last_change = change; -+ pd->last_target = target; -+} -+ -+void bch2_pd_controller_init(struct bch_pd_controller *pd) -+{ -+ pd->rate.rate = 1024; -+ pd->last_update = jiffies; -+ pd->p_term_inverse = 6000; -+ pd->d_term = 30; -+ pd->d_smooth = pd->d_term; -+ pd->backpressure = 1; -+} -+ -+void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) -+{ -+ if (!out->nr_tabstops) -+ printbuf_tabstop_push(out, 20); -+ -+ prt_printf(out, "rate:"); -+ prt_tab(out); -+ prt_human_readable_s64(out, pd->rate.rate); -+ prt_newline(out); -+ -+ prt_printf(out, "target:"); -+ prt_tab(out); -+ prt_human_readable_u64(out, pd->last_target); -+ prt_newline(out); -+ -+ prt_printf(out, "actual:"); -+ prt_tab(out); -+ prt_human_readable_u64(out, pd->last_actual); -+ prt_newline(out); -+ -+ prt_printf(out, "proportional:"); -+ prt_tab(out); -+ prt_human_readable_s64(out, pd->last_proportional); -+ prt_newline(out); -+ -+ prt_printf(out, "derivative:"); -+ prt_tab(out); -+ prt_human_readable_s64(out, pd->last_derivative); -+ prt_newline(out); -+ -+ prt_printf(out, "change:"); -+ prt_tab(out); -+ prt_human_readable_s64(out, pd->last_change); -+ prt_newline(out); -+ -+ prt_printf(out, "next io:"); -+ prt_tab(out); -+ prt_printf(out, "%llims", div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); -+ prt_newline(out); -+} -+ -+/* misc: */ -+ -+void bch2_bio_map(struct bio *bio, void *base, size_t size) -+{ -+ while (size) { -+ struct page *page = is_vmalloc_addr(base) -+ ? 
vmalloc_to_page(base) -+ : virt_to_page(base); -+ unsigned offset = offset_in_page(base); -+ unsigned len = min_t(size_t, PAGE_SIZE - offset, size); -+ -+ BUG_ON(!bio_add_page(bio, page, len, offset)); -+ size -= len; -+ base += len; -+ } -+} -+ -+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) -+{ -+ while (size) { -+ struct page *page = alloc_pages(gfp_mask, 0); -+ unsigned len = min_t(size_t, PAGE_SIZE, size); -+ -+ if (!page) -+ return -ENOMEM; -+ -+ if (unlikely(!bio_add_page(bio, page, len, 0))) { -+ __free_page(page); -+ break; -+ } -+ -+ size -= len; -+ } -+ -+ return 0; -+} -+ -+size_t bch2_rand_range(size_t max) -+{ -+ size_t rand; -+ -+ if (!max) -+ return 0; -+ -+ do { -+ rand = get_random_long(); -+ rand &= roundup_pow_of_two(max) - 1; -+ } while (rand >= max); -+ -+ return rand; -+} -+ -+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ -+ __bio_for_each_segment(bv, dst, iter, dst_iter) { -+ void *dstp = kmap_local_page(bv.bv_page); -+ -+ memcpy(dstp + bv.bv_offset, src, bv.bv_len); -+ kunmap_local(dstp); -+ -+ src += bv.bv_len; -+ } -+} -+ -+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ -+ __bio_for_each_segment(bv, src, iter, src_iter) { -+ void *srcp = kmap_local_page(bv.bv_page); -+ -+ memcpy(dst, srcp + bv.bv_offset, bv.bv_len); -+ kunmap_local(srcp); -+ -+ dst += bv.bv_len; -+ } -+} -+ -+static int alignment_ok(const void *base, size_t align) -+{ -+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || -+ ((unsigned long)base & (align - 1)) == 0; -+} -+ -+static void u32_swap(void *a, void *b, size_t size) -+{ -+ u32 t = *(u32 *)a; -+ *(u32 *)a = *(u32 *)b; -+ *(u32 *)b = t; -+} -+ -+static void u64_swap(void *a, void *b, size_t size) -+{ -+ u64 t = *(u64 *)a; -+ *(u64 *)a = *(u64 *)b; -+ *(u64 *)b = t; -+} -+ -+static void generic_swap(void *a, void *b, size_t size) -+{ -+ char t; -+ -+ do { -+ t = *(char *)a; -+ *(char *)a++ = *(char *)b; -+ *(char *)b++ = t; -+ } while (--size > 0); -+} -+ -+static inline int do_cmp(void *base, size_t n, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ size_t l, size_t r) -+{ -+ return cmp_func(base + inorder_to_eytzinger0(l, n) * size, -+ base + inorder_to_eytzinger0(r, n) * size, -+ size); -+} -+ -+static inline void do_swap(void *base, size_t n, size_t size, -+ void (*swap_func)(void *, void *, size_t), -+ size_t l, size_t r) -+{ -+ swap_func(base + inorder_to_eytzinger0(l, n) * size, -+ base + inorder_to_eytzinger0(r, n) * size, -+ size); -+} -+ -+void eytzinger0_sort(void *base, size_t n, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t)) -+{ -+ int i, c, r; -+ -+ if (!swap_func) { -+ if (size == 4 && alignment_ok(base, 4)) -+ swap_func = u32_swap; -+ else if (size == 8 && alignment_ok(base, 8)) -+ swap_func = u64_swap; -+ else -+ swap_func = generic_swap; -+ } -+ -+ /* heapify */ -+ for (i = n / 2 - 1; i >= 0; --i) { -+ for (r = i; r * 2 + 1 < n; r = c) { -+ c = r * 2 + 1; -+ -+ if (c + 1 < n && -+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) -+ c++; -+ -+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) -+ break; -+ -+ do_swap(base, n, size, swap_func, r, c); -+ } -+ } -+ -+ /* sort */ -+ for (i = n - 1; i > 0; --i) { -+ do_swap(base, n, size, swap_func, 0, i); -+ -+ for (r = 0; r * 2 + 1 < i; r = c) { -+ c = r * 2 + 1; -+ -+ if (c + 1 < i && 
-+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) -+ c++; -+ -+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) -+ break; -+ -+ do_swap(base, n, size, swap_func, r, c); -+ } -+ } -+} -+ -+void sort_cmp_size(void *base, size_t num, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t size)) -+{ -+ /* pre-scale counters for performance */ -+ int i = (num/2 - 1) * size, n = num * size, c, r; -+ -+ if (!swap_func) { -+ if (size == 4 && alignment_ok(base, 4)) -+ swap_func = u32_swap; -+ else if (size == 8 && alignment_ok(base, 8)) -+ swap_func = u64_swap; -+ else -+ swap_func = generic_swap; -+ } -+ -+ /* heapify */ -+ for ( ; i >= 0; i -= size) { -+ for (r = i; r * 2 + size < n; r = c) { -+ c = r * 2 + size; -+ if (c < n - size && -+ cmp_func(base + c, base + c + size, size) < 0) -+ c += size; -+ if (cmp_func(base + r, base + c, size) >= 0) -+ break; -+ swap_func(base + r, base + c, size); -+ } -+ } -+ -+ /* sort */ -+ for (i = n - size; i > 0; i -= size) { -+ swap_func(base, base + i, size); -+ for (r = 0; r * 2 + size < i; r = c) { -+ c = r * 2 + size; -+ if (c < i - size && -+ cmp_func(base + c, base + c + size, size) < 0) -+ c += size; -+ if (cmp_func(base + r, base + c, size) >= 0) -+ break; -+ swap_func(base + r, base + c, size); -+ } -+ } -+} -+ -+static void mempool_free_vp(void *element, void *pool_data) -+{ -+ size_t size = (size_t) pool_data; -+ -+ vpfree(element, size); -+} -+ -+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) -+{ -+ size_t size = (size_t) pool_data; -+ -+ return vpmalloc(size, gfp_mask); -+} -+ -+int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) -+{ -+ return size < PAGE_SIZE -+ ? mempool_init_kmalloc_pool(pool, min_nr, size) -+ : mempool_init(pool, min_nr, mempool_alloc_vp, -+ mempool_free_vp, (void *) size); -+} -+ -+#if 0 -+void eytzinger1_test(void) -+{ -+ unsigned inorder, eytz, size; -+ -+ pr_info("1 based eytzinger test:"); -+ -+ for (size = 2; -+ size < 65536; -+ size++) { -+ unsigned extra = eytzinger1_extra(size); -+ -+ if (!(size % 4096)) -+ pr_info("tree size %u", size); -+ -+ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); -+ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); -+ -+ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); -+ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); -+ -+ inorder = 1; -+ eytzinger1_for_each(eytz, size) { -+ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); -+ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); -+ BUG_ON(eytz != eytzinger1_last(size) && -+ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); -+ -+ inorder++; -+ } -+ } -+} -+ -+void eytzinger0_test(void) -+{ -+ -+ unsigned inorder, eytz, size; -+ -+ pr_info("0 based eytzinger test:"); -+ -+ for (size = 1; -+ size < 65536; -+ size++) { -+ unsigned extra = eytzinger0_extra(size); -+ -+ if (!(size % 4096)) -+ pr_info("tree size %u", size); -+ -+ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); -+ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); -+ -+ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); -+ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); -+ -+ inorder = 0; -+ eytzinger0_for_each(eytz, size) { -+ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); -+ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); -+ BUG_ON(eytz != eytzinger0_last(size) && -+ eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); -+ -+ 
inorder++; -+ } -+ } -+} -+ -+static inline int cmp_u16(const void *_l, const void *_r, size_t size) -+{ -+ const u16 *l = _l, *r = _r; -+ -+ return (*l > *r) - (*r - *l); -+} -+ -+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) -+{ -+ int i, c1 = -1, c2 = -1; -+ ssize_t r; -+ -+ r = eytzinger0_find_le(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ if (r >= 0) -+ c1 = test_array[r]; -+ -+ for (i = 0; i < nr; i++) -+ if (test_array[i] <= search && test_array[i] > c2) -+ c2 = test_array[i]; -+ -+ if (c1 != c2) { -+ eytzinger0_for_each(i, nr) -+ pr_info("[%3u] = %12u", i, test_array[i]); -+ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", -+ i, r, c1, c2); -+ } -+} -+ -+void eytzinger0_find_test(void) -+{ -+ unsigned i, nr, allocated = 1 << 12; -+ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); -+ -+ for (nr = 1; nr < allocated; nr++) { -+ pr_info("testing %u elems", nr); -+ -+ get_random_bytes(test_array, nr * sizeof(test_array[0])); -+ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); -+ -+ /* verify array is sorted correctly: */ -+ eytzinger0_for_each(i, nr) -+ BUG_ON(i != eytzinger0_last(nr) && -+ test_array[i] > test_array[eytzinger0_next(i, nr)]); -+ -+ for (i = 0; i < U16_MAX; i += 1 << 12) -+ eytzinger0_find_test_val(test_array, nr, i); -+ -+ for (i = 0; i < nr; i++) { -+ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); -+ eytzinger0_find_test_val(test_array, nr, test_array[i]); -+ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); -+ } -+ } -+ -+ kfree(test_array); -+} -+#endif -+ -+/* -+ * Accumulate percpu counters onto one cpu's copy - only valid when access -+ * against any percpu counter is guarded against -+ */ -+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) -+{ -+ u64 *ret; -+ int cpu; -+ -+ /* access to pcpu vars has to be blocked by other locking */ -+ preempt_disable(); -+ ret = this_cpu_ptr(p); -+ preempt_enable(); -+ -+ for_each_possible_cpu(cpu) { -+ u64 *i = per_cpu_ptr(p, cpu); -+ -+ if (i != ret) { -+ acc_u64s(ret, i, nr); -+ memset(i, 0, nr * sizeof(u64)); -+ } -+ } -+ -+ return ret; -+} -diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h -new file mode 100644 -index 000000000..19cc6bfe9 ---- /dev/null -+++ b/fs/bcachefs/util.h -@@ -0,0 +1,851 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_UTIL_H -+#define _BCACHEFS_UTIL_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "darray.h" -+ -+struct closure; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) -+#endif -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+#define CPU_BIG_ENDIAN 0 -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+#define CPU_BIG_ENDIAN 1 -+#endif -+ -+/* type hackery */ -+ -+#define type_is_exact(_val, _type) \ -+ __builtin_types_compatible_p(typeof(_val), _type) -+ -+#define type_is(_val, _type) \ -+ (__builtin_types_compatible_p(typeof(_val), _type) || \ -+ __builtin_types_compatible_p(typeof(_val), const _type)) -+ -+/* Userspace doesn't align allocations as nicely as the kernel allocators: */ -+static inline size_t buf_pages(void *p, size_t len) -+{ -+ return DIV_ROUND_UP(len + -+ ((unsigned long) p & (PAGE_SIZE - 1)), -+ PAGE_SIZE); -+} -+ -+static inline void vpfree(void *p, size_t size) -+{ -+ if (is_vmalloc_addr(p)) -+ vfree(p); -+ else 
-+ free_pages((unsigned long) p, get_order(size)); -+} -+ -+static inline void *vpmalloc(size_t size, gfp_t gfp_mask) -+{ -+ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, -+ get_order(size)) ?: -+ __vmalloc(size, gfp_mask); -+} -+ -+static inline void kvpfree(void *p, size_t size) -+{ -+ if (size < PAGE_SIZE) -+ kfree(p); -+ else -+ vpfree(p, size); -+} -+ -+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) -+{ -+ return size < PAGE_SIZE -+ ? kmalloc(size, gfp_mask) -+ : vpmalloc(size, gfp_mask); -+} -+ -+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); -+ -+#define HEAP(type) \ -+struct { \ -+ size_t size, used; \ -+ type *data; \ -+} -+ -+#define DECLARE_HEAP(type, name) HEAP(type) name -+ -+#define init_heap(heap, _size, gfp) \ -+({ \ -+ (heap)->used = 0; \ -+ (heap)->size = (_size); \ -+ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ -+ (gfp)); \ -+}) -+ -+#define free_heap(heap) \ -+do { \ -+ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ -+ (heap)->data = NULL; \ -+} while (0) -+ -+#define heap_set_backpointer(h, i, _fn) \ -+do { \ -+ void (*fn)(typeof(h), size_t) = _fn; \ -+ if (fn) \ -+ fn(h, i); \ -+} while (0) -+ -+#define heap_swap(h, i, j, set_backpointer) \ -+do { \ -+ swap((h)->data[i], (h)->data[j]); \ -+ heap_set_backpointer(h, i, set_backpointer); \ -+ heap_set_backpointer(h, j, set_backpointer); \ -+} while (0) -+ -+#define heap_peek(h) \ -+({ \ -+ EBUG_ON(!(h)->used); \ -+ (h)->data[0]; \ -+}) -+ -+#define heap_full(h) ((h)->used == (h)->size) -+ -+#define heap_sift_down(h, i, cmp, set_backpointer) \ -+do { \ -+ size_t _c, _j = i; \ -+ \ -+ for (; _j * 2 + 1 < (h)->used; _j = _c) { \ -+ _c = _j * 2 + 1; \ -+ if (_c + 1 < (h)->used && \ -+ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ -+ _c++; \ -+ \ -+ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ -+ break; \ -+ heap_swap(h, _c, _j, set_backpointer); \ -+ } \ -+} while (0) -+ -+#define heap_sift_up(h, i, cmp, set_backpointer) \ -+do { \ -+ while (i) { \ -+ size_t p = (i - 1) / 2; \ -+ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ -+ break; \ -+ heap_swap(h, i, p, set_backpointer); \ -+ i = p; \ -+ } \ -+} while (0) -+ -+#define __heap_add(h, d, cmp, set_backpointer) \ -+({ \ -+ size_t _i = (h)->used++; \ -+ (h)->data[_i] = d; \ -+ heap_set_backpointer(h, _i, set_backpointer); \ -+ \ -+ heap_sift_up(h, _i, cmp, set_backpointer); \ -+ _i; \ -+}) -+ -+#define heap_add(h, d, cmp, set_backpointer) \ -+({ \ -+ bool _r = !heap_full(h); \ -+ if (_r) \ -+ __heap_add(h, d, cmp, set_backpointer); \ -+ _r; \ -+}) -+ -+#define heap_add_or_replace(h, new, cmp, set_backpointer) \ -+do { \ -+ if (!heap_add(h, new, cmp, set_backpointer) && \ -+ cmp(h, new, heap_peek(h)) >= 0) { \ -+ (h)->data[0] = new; \ -+ heap_set_backpointer(h, 0, set_backpointer); \ -+ heap_sift_down(h, 0, cmp, set_backpointer); \ -+ } \ -+} while (0) -+ -+#define heap_del(h, i, cmp, set_backpointer) \ -+do { \ -+ size_t _i = (i); \ -+ \ -+ BUG_ON(_i >= (h)->used); \ -+ (h)->used--; \ -+ if ((_i) < (h)->used) { \ -+ heap_swap(h, _i, (h)->used, set_backpointer); \ -+ heap_sift_up(h, _i, cmp, set_backpointer); \ -+ heap_sift_down(h, _i, cmp, set_backpointer); \ -+ } \ -+} while (0) -+ -+#define heap_pop(h, d, cmp, set_backpointer) \ -+({ \ -+ bool _r = (h)->used; \ -+ if (_r) { \ -+ (d) = (h)->data[0]; \ -+ heap_del(h, 0, cmp, set_backpointer); \ -+ } \ -+ _r; \ -+}) -+ -+#define heap_resort(heap, cmp, set_backpointer) \ -+do { \ -+ ssize_t _i; \ -+ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 
0; --_i) \ -+ heap_sift_down(heap, _i, cmp, set_backpointer); \ -+} while (0) -+ -+#define ANYSINT_MAX(t) \ -+ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) -+ -+#include "printbuf.h" -+ -+#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__) -+#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) -+#define printbuf_str(_buf) bch2_printbuf_str(_buf) -+#define printbuf_exit(_buf) bch2_printbuf_exit(_buf) -+ -+#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf) -+#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf) -+#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) -+ -+#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) -+#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) -+ -+#define prt_newline(_out) bch2_prt_newline(_out) -+#define prt_tab(_out) bch2_prt_tab(_out) -+#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out) -+ -+#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__) -+#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v)) -+#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__) -+#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__) -+#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__) -+#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__) -+#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__) -+#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) -+ -+void bch2_pr_time_units(struct printbuf *, u64); -+ -+#ifdef __KERNEL__ -+static inline void pr_time(struct printbuf *out, u64 time) -+{ -+ prt_printf(out, "%llu", time); -+} -+#else -+#include -+static inline void pr_time(struct printbuf *out, u64 _time) -+{ -+ char time_str[64]; -+ time_t time = _time; -+ struct tm *tm = localtime(&time); -+ size_t err = strftime(time_str, sizeof(time_str), "%c", tm); -+ if (!err) -+ prt_printf(out, "(formatting error)"); -+ else -+ prt_printf(out, "%s", time_str); -+} -+#endif -+ -+#ifdef __KERNEL__ -+static inline void uuid_unparse_lower(u8 *uuid, char *out) -+{ -+ sprintf(out, "%pUb", uuid); -+} -+#else -+#include -+#endif -+ -+static inline void pr_uuid(struct printbuf *out, u8 *uuid) -+{ -+ char uuid_str[40]; -+ -+ uuid_unparse_lower(uuid, uuid_str); -+ prt_printf(out, "%s", uuid_str); -+} -+ -+int bch2_strtoint_h(const char *, int *); -+int bch2_strtouint_h(const char *, unsigned int *); -+int bch2_strtoll_h(const char *, long long *); -+int bch2_strtoull_h(const char *, unsigned long long *); -+int bch2_strtou64_h(const char *, u64 *); -+ -+static inline int bch2_strtol_h(const char *cp, long *res) -+{ -+#if BITS_PER_LONG == 32 -+ return bch2_strtoint_h(cp, (int *) res); -+#else -+ return bch2_strtoll_h(cp, (long long *) res); -+#endif -+} -+ -+static inline int bch2_strtoul_h(const char *cp, long *res) -+{ -+#if BITS_PER_LONG == 32 -+ return bch2_strtouint_h(cp, (unsigned int *) res); -+#else -+ return bch2_strtoull_h(cp, (unsigned long long *) res); -+#endif -+} -+ -+#define strtoi_h(cp, res) \ -+ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ -+ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ -+ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ -+ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ -+ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ -+ : type_is(*res, unsigned long long) ? 
bch2_strtoull_h(cp, (void *) res)\ -+ : -EINVAL) -+ -+#define strtoul_safe(cp, var) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r) \ -+ var = _v; \ -+ _r; \ -+}) -+ -+#define strtoul_safe_clamp(cp, var, min, max) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r) \ -+ var = clamp_t(typeof(var), _v, min, max); \ -+ _r; \ -+}) -+ -+#define strtoul_safe_restrict(cp, var, min, max) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r && _v >= min && _v <= max) \ -+ var = _v; \ -+ else \ -+ _r = -EINVAL; \ -+ _r; \ -+}) -+ -+#define snprint(out, var) \ -+ prt_printf(out, \ -+ type_is(var, int) ? "%i\n" \ -+ : type_is(var, unsigned) ? "%u\n" \ -+ : type_is(var, long) ? "%li\n" \ -+ : type_is(var, unsigned long) ? "%lu\n" \ -+ : type_is(var, s64) ? "%lli\n" \ -+ : type_is(var, u64) ? "%llu\n" \ -+ : type_is(var, char *) ? "%s\n" \ -+ : "%i\n", var) -+ -+bool bch2_is_zero(const void *, size_t); -+ -+u64 bch2_read_flag_list(char *, const char * const[]); -+ -+void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); -+ -+void bch2_print_string_as_lines(const char *prefix, const char *lines); -+ -+typedef DARRAY(unsigned long) bch_stacktrace; -+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *); -+void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); -+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *); -+ -+#define NR_QUANTILES 15 -+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) -+ -+struct bch2_quantiles { -+ struct bch2_quantile_entry { -+ u64 m; -+ u64 step; -+ } entries[NR_QUANTILES]; -+}; -+ -+struct bch2_time_stat_buffer { -+ unsigned nr; -+ struct bch2_time_stat_buffer_entry { -+ u64 start; -+ u64 end; -+ } entries[32]; -+}; -+ -+struct bch2_time_stats { -+ spinlock_t lock; -+ /* all fields are in nanoseconds */ -+ u64 max_duration; -+ u64 min_duration; -+ u64 max_freq; -+ u64 min_freq; -+ u64 last_event; -+ struct bch2_quantiles quantiles; -+ -+ struct mean_and_variance duration_stats; -+ struct mean_and_variance_weighted duration_stats_weighted; -+ struct mean_and_variance freq_stats; -+ struct mean_and_variance_weighted freq_stats_weighted; -+ struct bch2_time_stat_buffer __percpu *buffer; -+}; -+ -+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); -+#else -+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} -+#endif -+ -+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) -+{ -+ __bch2_time_stats_update(stats, start, local_clock()); -+} -+ -+void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); -+ -+void bch2_time_stats_exit(struct bch2_time_stats *); -+void bch2_time_stats_init(struct bch2_time_stats *); -+ -+#define ewma_add(ewma, val, weight) \ -+({ \ -+ typeof(ewma) _ewma = (ewma); \ -+ typeof(weight) _weight = (weight); \ -+ \ -+ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ -+}) -+ -+struct bch_ratelimit { -+ /* Next time we want to do some work, in nanoseconds */ -+ u64 next; -+ -+ /* -+ * Rate at which we want to do work, in units per nanosecond -+ * The units here correspond to the units passed to -+ * bch2_ratelimit_increment() -+ */ -+ unsigned rate; -+}; -+ -+static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) -+{ -+ d->next = 
local_clock(); -+} -+ -+u64 bch2_ratelimit_delay(struct bch_ratelimit *); -+void bch2_ratelimit_increment(struct bch_ratelimit *, u64); -+ -+struct bch_pd_controller { -+ struct bch_ratelimit rate; -+ unsigned long last_update; -+ -+ s64 last_actual; -+ s64 smoothed_derivative; -+ -+ unsigned p_term_inverse; -+ unsigned d_smooth; -+ unsigned d_term; -+ -+ /* for exporting to sysfs (no effect on behavior) */ -+ s64 last_derivative; -+ s64 last_proportional; -+ s64 last_change; -+ s64 last_target; -+ -+ /* -+ * If true, the rate will not increase if bch2_ratelimit_delay() -+ * is not being called often enough. -+ */ -+ bool backpressure; -+}; -+ -+void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); -+void bch2_pd_controller_init(struct bch_pd_controller *); -+void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); -+ -+#define sysfs_pd_controller_attribute(name) \ -+ rw_attribute(name##_rate); \ -+ rw_attribute(name##_rate_bytes); \ -+ rw_attribute(name##_rate_d_term); \ -+ rw_attribute(name##_rate_p_term_inverse); \ -+ read_attribute(name##_rate_debug) -+ -+#define sysfs_pd_controller_files(name) \ -+ &sysfs_##name##_rate, \ -+ &sysfs_##name##_rate_bytes, \ -+ &sysfs_##name##_rate_d_term, \ -+ &sysfs_##name##_rate_p_term_inverse, \ -+ &sysfs_##name##_rate_debug -+ -+#define sysfs_pd_controller_show(name, var) \ -+do { \ -+ sysfs_hprint(name##_rate, (var)->rate.rate); \ -+ sysfs_print(name##_rate_bytes, (var)->rate.rate); \ -+ sysfs_print(name##_rate_d_term, (var)->d_term); \ -+ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ -+ \ -+ if (attr == &sysfs_##name##_rate_debug) \ -+ bch2_pd_controller_debug_to_text(out, var); \ -+} while (0) -+ -+#define sysfs_pd_controller_store(name, var) \ -+do { \ -+ sysfs_strtoul_clamp(name##_rate, \ -+ (var)->rate.rate, 1, UINT_MAX); \ -+ sysfs_strtoul_clamp(name##_rate_bytes, \ -+ (var)->rate.rate, 1, UINT_MAX); \ -+ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ -+ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ -+ (var)->p_term_inverse, 1, INT_MAX); \ -+} while (0) -+ -+#define container_of_or_null(ptr, type, member) \ -+({ \ -+ typeof(ptr) _ptr = ptr; \ -+ _ptr ? 
container_of(_ptr, type, member) : NULL; \ -+}) -+ -+/* Does linear interpolation between powers of two */ -+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) -+{ -+ unsigned fract = x & ~(~0 << fract_bits); -+ -+ x >>= fract_bits; -+ x = 1 << x; -+ x += (x * fract) >> fract_bits; -+ -+ return x; -+} -+ -+void bch2_bio_map(struct bio *bio, void *base, size_t); -+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); -+ -+static inline sector_t bdev_sectors(struct block_device *bdev) -+{ -+ return bdev->bd_inode->i_size >> 9; -+} -+ -+#define closure_bio_submit(bio, cl) \ -+do { \ -+ closure_get(cl); \ -+ submit_bio(bio); \ -+} while (0) -+ -+#define kthread_wait(cond) \ -+({ \ -+ int _ret = 0; \ -+ \ -+ while (1) { \ -+ set_current_state(TASK_INTERRUPTIBLE); \ -+ if (kthread_should_stop()) { \ -+ _ret = -1; \ -+ break; \ -+ } \ -+ \ -+ if (cond) \ -+ break; \ -+ \ -+ schedule(); \ -+ } \ -+ set_current_state(TASK_RUNNING); \ -+ _ret; \ -+}) -+ -+#define kthread_wait_freezable(cond) \ -+({ \ -+ int _ret = 0; \ -+ while (1) { \ -+ set_current_state(TASK_INTERRUPTIBLE); \ -+ if (kthread_should_stop()) { \ -+ _ret = -1; \ -+ break; \ -+ } \ -+ \ -+ if (cond) \ -+ break; \ -+ \ -+ schedule(); \ -+ try_to_freeze(); \ -+ } \ -+ set_current_state(TASK_RUNNING); \ -+ _ret; \ -+}) -+ -+size_t bch2_rand_range(size_t); -+ -+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); -+void memcpy_from_bio(void *, struct bio *, struct bvec_iter); -+ -+static inline void memcpy_u64s_small(void *dst, const void *src, -+ unsigned u64s) -+{ -+ u64 *d = dst; -+ const u64 *s = src; -+ -+ while (u64s--) -+ *d++ = *s++; -+} -+ -+static inline void __memcpy_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+#ifdef CONFIG_X86_64 -+ long d0, d1, d2; -+ -+ asm volatile("rep ; movsq" -+ : "=&c" (d0), "=&D" (d1), "=&S" (d2) -+ : "0" (u64s), "1" (dst), "2" (src) -+ : "memory"); -+#else -+ u64 *d = dst; -+ const u64 *s = src; -+ -+ while (u64s--) -+ *d++ = *s++; -+#endif -+} -+ -+static inline void memcpy_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(!(dst >= src + u64s * sizeof(u64) || -+ dst + u64s * sizeof(u64) <= src)); -+ -+ __memcpy_u64s(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_down(void *dst, const void *src, -+ unsigned u64s) -+{ -+ __memcpy_u64s(dst, src, u64s); -+} -+ -+static inline void memmove_u64s_down(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst > src); -+ -+ __memmove_u64s_down(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_down_small(void *dst, const void *src, -+ unsigned u64s) -+{ -+ memcpy_u64s_small(dst, src, u64s); -+} -+ -+static inline void memmove_u64s_down_small(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst > src); -+ -+ __memmove_u64s_down_small(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_up_small(void *_dst, const void *_src, -+ unsigned u64s) -+{ -+ u64 *dst = (u64 *) _dst + u64s; -+ u64 *src = (u64 *) _src + u64s; -+ -+ while (u64s--) -+ *--dst = *--src; -+} -+ -+static inline void memmove_u64s_up_small(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst < src); -+ -+ __memmove_u64s_up_small(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_up(void *_dst, const void *_src, -+ unsigned u64s) -+{ -+ u64 *dst = (u64 *) _dst + u64s - 1; -+ u64 *src = (u64 *) _src + u64s - 1; -+ -+#ifdef CONFIG_X86_64 -+ long d0, d1, d2; -+ -+ asm volatile("std ;\n" -+ "rep ; movsq\n" -+ "cld ;\n" -+ : "=&c" (d0), "=&D" (d1), "=&S" (d2) -+ : "0" 
(u64s), "1" (dst), "2" (src) -+ : "memory"); -+#else -+ while (u64s--) -+ *dst-- = *src--; -+#endif -+} -+ -+static inline void memmove_u64s_up(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst < src); -+ -+ __memmove_u64s_up(dst, src, u64s); -+} -+ -+static inline void memmove_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+ if (dst < src) -+ __memmove_u64s_down(dst, src, u64s); -+ else -+ __memmove_u64s_up(dst, src, u64s); -+} -+ -+/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */ -+static inline void memset_u64s_tail(void *s, int c, unsigned bytes) -+{ -+ unsigned rem = round_up(bytes, sizeof(u64)) - bytes; -+ -+ memset(s + bytes, c, rem); -+} -+ -+void sort_cmp_size(void *base, size_t num, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t)); -+ -+/* just the memmove, doesn't update @_nr */ -+#define __array_insert_item(_array, _nr, _pos) \ -+ memmove(&(_array)[(_pos) + 1], \ -+ &(_array)[(_pos)], \ -+ sizeof((_array)[0]) * ((_nr) - (_pos))) -+ -+#define array_insert_item(_array, _nr, _pos, _new_item) \ -+do { \ -+ __array_insert_item(_array, _nr, _pos); \ -+ (_nr)++; \ -+ (_array)[(_pos)] = (_new_item); \ -+} while (0) -+ -+#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ -+do { \ -+ (_nr) -= (_nr_to_remove); \ -+ memmove(&(_array)[(_pos)], \ -+ &(_array)[(_pos) + (_nr_to_remove)], \ -+ sizeof((_array)[0]) * ((_nr) - (_pos))); \ -+} while (0) -+ -+#define array_remove_item(_array, _nr, _pos) \ -+ array_remove_items(_array, _nr, _pos, 1) -+ -+static inline void __move_gap(void *array, size_t element_size, -+ size_t nr, size_t size, -+ size_t old_gap, size_t new_gap) -+{ -+ size_t gap_end = old_gap + size - nr; -+ -+ if (new_gap < old_gap) { -+ size_t move = old_gap - new_gap; -+ -+ memmove(array + element_size * (gap_end - move), -+ array + element_size * (old_gap - move), -+ element_size * move); -+ } else if (new_gap > old_gap) { -+ size_t move = new_gap - old_gap; -+ -+ memmove(array + element_size * old_gap, -+ array + element_size * gap_end, -+ element_size * move); -+ } -+} -+ -+/* Move the gap in a gap buffer: */ -+#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ -+ __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) -+ -+#define bubble_sort(_base, _nr, _cmp) \ -+do { \ -+ ssize_t _i, _end; \ -+ bool _swapped = true; \ -+ \ -+ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ -+ _swapped = false; \ -+ for (_i = 0; _i < _end; _i++) \ -+ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ -+ swap((_base)[_i], (_base)[_i + 1]); \ -+ _swapped = true; \ -+ } \ -+ } \ -+} while (0) -+ -+static inline u64 percpu_u64_get(u64 __percpu *src) -+{ -+ u64 ret = 0; -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ ret += *per_cpu_ptr(src, cpu); -+ return ret; -+} -+ -+static inline void percpu_u64_set(u64 __percpu *dst, u64 src) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ *per_cpu_ptr(dst, cpu) = 0; -+ this_cpu_write(*dst, src); -+} -+ -+static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) -+{ -+ unsigned i; -+ -+ for (i = 0; i < nr; i++) -+ acc[i] += src[i]; -+} -+ -+static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, -+ unsigned nr) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ acc_u64s(acc, per_cpu_ptr(src, cpu), nr); -+} -+ -+static inline void percpu_memset(void __percpu *p, int c, size_t bytes) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ memset(per_cpu_ptr(p, cpu), c, 
bytes); -+} -+ -+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); -+ -+#define cmp_int(l, r) ((l > r) - (l < r)) -+ -+static inline int u8_cmp(u8 l, u8 r) -+{ -+ return cmp_int(l, r); -+} -+ -+static inline int cmp_le32(__le32 l, __le32 r) -+{ -+ return cmp_int(le32_to_cpu(l), le32_to_cpu(r)); -+} -+ -+#include -+ -+#endif /* _BCACHEFS_UTIL_H */ -diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c -new file mode 100644 -index 000000000..2a2ab86ed ---- /dev/null -+++ b/fs/bcachefs/varint.c -@@ -0,0 +1,123 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_VALGRIND -+#include -+#endif -+ -+#include "varint.h" -+ -+/** -+ * bch2_varint_encode - encode a variable length integer -+ * @out - destination to encode to -+ * @v - unsigned integer to encode -+ * -+ * Returns the size in bytes of the encoded integer - at most 9 bytes -+ */ -+int bch2_varint_encode(u8 *out, u64 v) -+{ -+ unsigned bits = fls64(v|1); -+ unsigned bytes = DIV_ROUND_UP(bits, 7); -+ __le64 v_le; -+ -+ if (likely(bytes < 9)) { -+ v <<= bytes; -+ v |= ~(~0 << (bytes - 1)); -+ v_le = cpu_to_le64(v); -+ memcpy(out, &v_le, bytes); -+ } else { -+ *out++ = 255; -+ bytes = 9; -+ put_unaligned_le64(v, out); -+ } -+ -+ return bytes; -+} -+ -+/** -+ * bch2_varint_decode - encode a variable length integer -+ * @in - varint to decode -+ * @end - end of buffer to decode from -+ * @out - on success, decoded integer -+ * -+ * Returns the size in bytes of the decoded integer - or -1 on failure (would -+ * have read past the end of the buffer) -+ */ -+int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) -+{ -+ unsigned bytes = likely(in < end) -+ ? ffz(*in & 255) + 1 -+ : 1; -+ u64 v; -+ -+ if (unlikely(in + bytes > end)) -+ return -1; -+ -+ if (likely(bytes < 9)) { -+ __le64 v_le = 0; -+ -+ memcpy(&v_le, in, bytes); -+ v = le64_to_cpu(v_le); -+ v >>= bytes; -+ } else { -+ v = get_unaligned_le64(++in); -+ } -+ -+ *out = v; -+ return bytes; -+} -+ -+/** -+ * bch2_varint_encode_fast - fast version of bch2_varint_encode -+ * -+ * This version assumes it's always safe to write 8 bytes to @out, even if the -+ * encoded integer would be smaller. -+ */ -+int bch2_varint_encode_fast(u8 *out, u64 v) -+{ -+ unsigned bits = fls64(v|1); -+ unsigned bytes = DIV_ROUND_UP(bits, 7); -+ -+ if (likely(bytes < 9)) { -+ v <<= bytes; -+ v |= ~(~0 << (bytes - 1)); -+ } else { -+ *out++ = 255; -+ bytes = 9; -+ } -+ -+ put_unaligned_le64(v, out); -+ return bytes; -+} -+ -+/** -+ * bch2_varint_decode_fast - fast version of bch2_varint_decode -+ * -+ * This version assumes that it is safe to read at most 8 bytes past the end of -+ * @end (we still return an error if the varint extends past @end). 
-+ */ -+int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) -+{ -+#ifdef CONFIG_VALGRIND -+ VALGRIND_MAKE_MEM_DEFINED(in, 8); -+#endif -+ u64 v = get_unaligned_le64(in); -+ unsigned bytes = ffz(*in) + 1; -+ -+ if (unlikely(in + bytes > end)) -+ return -1; -+ -+ if (likely(bytes < 9)) { -+ v >>= bytes; -+ v &= ~(~0ULL << (7 * bytes)); -+ } else { -+ v = get_unaligned_le64(++in); -+ } -+ -+ *out = v; -+ return bytes; -+} -diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h -new file mode 100644 -index 000000000..92a182fb3 ---- /dev/null -+++ b/fs/bcachefs/varint.h -@@ -0,0 +1,11 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_VARINT_H -+#define _BCACHEFS_VARINT_H -+ -+int bch2_varint_encode(u8 *, u64); -+int bch2_varint_decode(const u8 *, const u8 *, u64 *); -+ -+int bch2_varint_encode_fast(u8 *, u64); -+int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); -+ -+#endif /* _BCACHEFS_VARINT_H */ -diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h -new file mode 100644 -index 000000000..53a694d71 ---- /dev/null -+++ b/fs/bcachefs/vstructs.h -@@ -0,0 +1,63 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _VSTRUCTS_H -+#define _VSTRUCTS_H -+ -+#include "util.h" -+ -+/* -+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this -+ * assumes u64 is little endian: -+ */ -+#define __vstruct_u64s(_s) \ -+({ \ -+ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ -+ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ -+ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \ -+ : ((__force u8) ((_s)->u64s))); \ -+}) -+ -+#define __vstruct_bytes(_type, _u64s) \ -+({ \ -+ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ -+ \ -+ (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ -+}) -+ -+#define vstruct_bytes(_s) \ -+ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) -+ -+#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ -+ (round_up(__vstruct_bytes(_type, _u64s), \ -+ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) -+ -+#define vstruct_blocks(_s, _sector_block_bits) \ -+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) -+ -+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ -+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ -+ __vstruct_u64s(_s) + (_u64s)) -+ -+#define vstruct_sectors(_s, _sector_block_bits) \ -+ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) -+ -+#define vstruct_next(_s) \ -+ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) -+#define vstruct_last(_s) \ -+ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) -+#define vstruct_end(_s) \ -+ ((void *) ((_s)->_data + __vstruct_u64s(_s))) -+ -+#define vstruct_for_each(_s, _i) \ -+ for (_i = (_s)->start; \ -+ _i < vstruct_last(_s); \ -+ _i = vstruct_next(_i)) -+ -+#define vstruct_for_each_safe(_s, _i, _t) \ -+ for (_i = (_s)->start; \ -+ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ -+ _i = _t) -+ -+#define vstruct_idx(_s, _idx) \ -+ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) -+ -+#endif /* _VSTRUCTS_H */ -diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c -new file mode 100644 -index 000000000..6f6b3caf0 ---- /dev/null -+++ b/fs/bcachefs/xattr.c -@@ -0,0 +1,649 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "extents.h" -+#include "fs.h" -+#include "rebalance.h" -+#include "str_hash.h" -+#include 
"xattr.h" -+ -+#include -+#include -+#include -+ -+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); -+ -+static u64 bch2_xattr_hash(const struct bch_hash_info *info, -+ const struct xattr_search_key *key) -+{ -+ struct bch_str_hash_ctx ctx; -+ -+ bch2_str_hash_init(&ctx, info); -+ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); -+ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); -+ -+ return bch2_str_hash_end(&ctx, info); -+} -+ -+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) -+{ -+ return bch2_xattr_hash(info, key); -+} -+ -+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -+{ -+ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); -+ -+ return bch2_xattr_hash(info, -+ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); -+} -+ -+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) -+{ -+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); -+ const struct xattr_search_key *r = _r; -+ -+ return l.v->x_type != r->type || -+ l.v->x_name_len != r->name.len || -+ memcmp(l.v->x_name, r->name.name, r->name.len); -+} -+ -+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); -+ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); -+ -+ return l.v->x_type != r.v->x_type || -+ l.v->x_name_len != r.v->x_name_len || -+ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); -+} -+ -+const struct bch_hash_desc bch2_xattr_hash_desc = { -+ .btree_id = BTREE_ID_xattrs, -+ .key_type = KEY_TYPE_xattr, -+ .hash_key = xattr_hash_key, -+ .hash_bkey = xattr_hash_bkey, -+ .cmp_key = xattr_cmp_key, -+ .cmp_bkey = xattr_cmp_bkey, -+}; -+ -+int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ enum bkey_invalid_flags flags, -+ struct printbuf *err) -+{ -+ const struct xattr_handler *handler; -+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); -+ -+ if (bkey_val_u64s(k.k) < -+ xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len))) { -+ prt_printf(err, "value too small (%zu < %u)", -+ bkey_val_u64s(k.k), -+ xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len))); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ /* XXX why +4 ? 
*/ -+ if (bkey_val_u64s(k.k) > -+ xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len) + 4)) { -+ prt_printf(err, "value too big (%zu > %u)", -+ bkey_val_u64s(k.k), -+ xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len) + 4)); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ handler = bch2_xattr_type_to_handler(xattr.v->x_type); -+ if (!handler) { -+ prt_printf(err, "invalid type (%u)", xattr.v->x_type); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { -+ prt_printf(err, "xattr name has invalid characters"); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ return 0; -+} -+ -+void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct xattr_handler *handler; -+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); -+ -+ handler = bch2_xattr_type_to_handler(xattr.v->x_type); -+ if (handler && handler->prefix) -+ prt_printf(out, "%s", handler->prefix); -+ else if (handler) -+ prt_printf(out, "(type %u)", xattr.v->x_type); -+ else -+ prt_printf(out, "(unknown type %u)", xattr.v->x_type); -+ -+ prt_printf(out, "%.*s:%.*s", -+ xattr.v->x_name_len, -+ xattr.v->x_name, -+ le16_to_cpu(xattr.v->x_val_len), -+ (char *) xattr_val(xattr.v)); -+} -+ -+static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, -+ const char *name, void *buffer, size_t size, int type) -+{ -+ struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); -+ struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); -+ struct btree_iter iter; -+ struct bkey_s_c_xattr xattr; -+ struct bkey_s_c k; -+ int ret; -+ -+ ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, -+ inode_inum(inode), &search, 0); -+ if (ret) -+ goto err1; -+ -+ k = bch2_btree_iter_peek_slot(&iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err2; -+ -+ xattr = bkey_s_c_to_xattr(k); -+ ret = le16_to_cpu(xattr.v->x_val_len); -+ if (buffer) { -+ if (ret > size) -+ ret = -ERANGE; -+ else -+ memcpy(buffer, xattr_val(xattr.v), ret); -+ } -+err2: -+ bch2_trans_iter_exit(trans, &iter); -+err1: -+ return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret; -+} -+ -+int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, -+ struct bch_inode_unpacked *inode_u, -+ const struct bch_hash_info *hash_info, -+ const char *name, const void *value, size_t size, -+ int type, int flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter inode_iter = { NULL }; -+ int ret; -+ -+ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); -+ if (ret) -+ return ret; -+ -+ inode_u->bi_ctime = bch2_current_time(c); -+ -+ ret = bch2_inode_write(trans, &inode_iter, inode_u); -+ bch2_trans_iter_exit(trans, &inode_iter); -+ -+ if (ret) -+ return ret; -+ -+ if (value) { -+ struct bkey_i_xattr *xattr; -+ unsigned namelen = strlen(name); -+ unsigned u64s = BKEY_U64s + -+ xattr_val_u64s(namelen, size); -+ -+ if (u64s > U8_MAX) -+ return -ERANGE; -+ -+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(xattr)) -+ return PTR_ERR(xattr); -+ -+ bkey_xattr_init(&xattr->k_i); -+ xattr->k.u64s = u64s; -+ xattr->v.x_type = type; -+ xattr->v.x_name_len = namelen; -+ xattr->v.x_val_len = cpu_to_le16(size); -+ memcpy(xattr->v.x_name, name, namelen); -+ memcpy(xattr_val(&xattr->v), value, size); -+ -+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, -+ inum, &xattr->k_i, -+ (flags & XATTR_CREATE ? 
BCH_HASH_SET_MUST_CREATE : 0)| -+ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); -+ } else { -+ struct xattr_search_key search = -+ X_SEARCH(type, name, strlen(name)); -+ -+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, -+ hash_info, inum, &search); -+ } -+ -+ if (bch2_err_matches(ret, ENOENT)) -+ ret = flags & XATTR_REPLACE ? -ENODATA : 0; -+ -+ return ret; -+} -+ -+struct xattr_buf { -+ char *buf; -+ size_t len; -+ size_t used; -+}; -+ -+static int __bch2_xattr_emit(const char *prefix, -+ const char *name, size_t name_len, -+ struct xattr_buf *buf) -+{ -+ const size_t prefix_len = strlen(prefix); -+ const size_t total_len = prefix_len + name_len + 1; -+ -+ if (buf->buf) { -+ if (buf->used + total_len > buf->len) -+ return -ERANGE; -+ -+ memcpy(buf->buf + buf->used, prefix, prefix_len); -+ memcpy(buf->buf + buf->used + prefix_len, -+ name, name_len); -+ buf->buf[buf->used + prefix_len + name_len] = '\0'; -+ } -+ -+ buf->used += total_len; -+ return 0; -+} -+ -+static int bch2_xattr_emit(struct dentry *dentry, -+ const struct bch_xattr *xattr, -+ struct xattr_buf *buf) -+{ -+ const struct xattr_handler *handler = -+ bch2_xattr_type_to_handler(xattr->x_type); -+ -+ return handler && (!handler->list || handler->list(dentry)) -+ ? __bch2_xattr_emit(handler->prefix ?: handler->name, -+ xattr->x_name, xattr->x_name_len, buf) -+ : 0; -+} -+ -+static int bch2_xattr_list_bcachefs(struct bch_fs *c, -+ struct bch_inode_unpacked *inode, -+ struct xattr_buf *buf, -+ bool all) -+{ -+ const char *prefix = all ? "bcachefs_effective." : "bcachefs."; -+ unsigned id; -+ int ret = 0; -+ u64 v; -+ -+ for (id = 0; id < Inode_opt_nr; id++) { -+ v = bch2_inode_opt_get(inode, id); -+ if (!v) -+ continue; -+ -+ if (!all && -+ !(inode->bi_fields_set & (1 << id))) -+ continue; -+ -+ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], -+ strlen(bch2_inode_opts[id]), buf); -+ if (ret) -+ break; -+ } -+ -+ return ret; -+} -+ -+ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -+{ -+ struct bch_fs *c = dentry->d_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; -+ u64 offset = 0, inum = inode->ei_inode.bi_inum; -+ u32 snapshot; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ iter = (struct btree_iter) { NULL }; -+ -+ ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); -+ if (ret) -+ goto err; -+ -+ for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, -+ SPOS(inum, offset, snapshot), -+ POS(inum, U64_MAX), 0, k, ret) { -+ if (k.k->type != KEY_TYPE_xattr) -+ continue; -+ -+ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); -+ if (ret) -+ break; -+ } -+ -+ offset = iter.pos.offset; -+ bch2_trans_iter_exit(&trans, &iter); -+err: -+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ goto out; -+ -+ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); -+ if (ret) -+ goto out; -+ -+ ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); -+ if (ret) -+ goto out; -+ -+ return buf.used; -+out: -+ return bch2_err_class(ret); -+} -+ -+static int bch2_xattr_get_handler(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ struct bch_inode_info *inode = 
to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ int ret = bch2_trans_do(c, NULL, NULL, 0, -+ bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags)); -+ -+ return bch2_err_class(ret); -+} -+ -+static int bch2_xattr_set_handler(const struct xattr_handler *handler, -+ struct mnt_idmap *idmap, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, const void *value, -+ size_t size, int flags) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); -+ struct bch_inode_unpacked inode_u; -+ struct btree_trans trans; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ ret = commit_do(&trans, NULL, NULL, 0, -+ bch2_xattr_set(&trans, inode_inum(inode), &inode_u, -+ &hash, name, value, size, -+ handler->flags, flags)); -+ if (!ret) -+ bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); -+ bch2_trans_exit(&trans); -+ -+ return bch2_err_class(ret); -+} -+ -+static const struct xattr_handler bch_xattr_user_handler = { -+ .prefix = XATTR_USER_PREFIX, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_USER, -+}; -+ -+static bool bch2_xattr_trusted_list(struct dentry *dentry) -+{ -+ return capable(CAP_SYS_ADMIN); -+} -+ -+static const struct xattr_handler bch_xattr_trusted_handler = { -+ .prefix = XATTR_TRUSTED_PREFIX, -+ .list = bch2_xattr_trusted_list, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, -+}; -+ -+static const struct xattr_handler bch_xattr_security_handler = { -+ .prefix = XATTR_SECURITY_PREFIX, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_SECURITY, -+}; -+ -+#ifndef NO_BCACHEFS_FS -+ -+static int opt_to_inode_opt(int id) -+{ -+ switch (id) { -+#define x(name, ...) 
\ -+ case Opt_##name: return Inode_opt_##name; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ return -1; -+ } -+} -+ -+static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size, -+ bool all) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_opts opts = -+ bch2_inode_opts_to_opts(&inode->ei_inode); -+ const struct bch_option *opt; -+ int id, inode_opt_id; -+ struct printbuf out = PRINTBUF; -+ int ret; -+ u64 v; -+ -+ id = bch2_opt_lookup(name); -+ if (id < 0 || !bch2_opt_is_inode_opt(id)) -+ return -EINVAL; -+ -+ inode_opt_id = opt_to_inode_opt(id); -+ if (inode_opt_id < 0) -+ return -EINVAL; -+ -+ opt = bch2_opt_table + id; -+ -+ if (!bch2_opt_defined_by_id(&opts, id)) -+ return -ENODATA; -+ -+ if (!all && -+ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) -+ return -ENODATA; -+ -+ v = bch2_opt_get_by_id(&opts, id); -+ bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); -+ -+ ret = out.pos; -+ -+ if (out.allocation_failure) { -+ ret = -ENOMEM; -+ } else if (buffer) { -+ if (out.pos > size) -+ ret = -ERANGE; -+ else -+ memcpy(buffer, out.buf, out.pos); -+ } -+ -+ printbuf_exit(&out); -+ return ret; -+} -+ -+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, -+ name, buffer, size, false); -+} -+ -+struct inode_opt_set { -+ int id; -+ u64 v; -+ bool defined; -+}; -+ -+static int inode_opt_set_fn(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct inode_opt_set *s = p; -+ -+ if (s->defined) -+ bi->bi_fields_set |= 1U << s->id; -+ else -+ bi->bi_fields_set &= ~(1U << s->id); -+ -+ bch2_inode_opt_set(bi, s->id, s->v); -+ -+ return 0; -+} -+ -+static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, -+ struct mnt_idmap *idmap, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, const void *value, -+ size_t size, int flags) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ const struct bch_option *opt; -+ char *buf; -+ struct inode_opt_set s; -+ int opt_id, inode_opt_id, ret; -+ -+ opt_id = bch2_opt_lookup(name); -+ if (opt_id < 0) -+ return -EINVAL; -+ -+ opt = bch2_opt_table + opt_id; -+ -+ inode_opt_id = opt_to_inode_opt(opt_id); -+ if (inode_opt_id < 0) -+ return -EINVAL; -+ -+ s.id = inode_opt_id; -+ -+ if (value) { -+ u64 v = 0; -+ -+ buf = kmalloc(size + 1, GFP_KERNEL); -+ if (!buf) -+ return -ENOMEM; -+ memcpy(buf, value, size); -+ buf[size] = '\0'; -+ -+ ret = bch2_opt_parse(c, opt, buf, &v, NULL); -+ kfree(buf); -+ -+ if (ret < 0) -+ return ret; -+ -+ ret = bch2_opt_check_may_set(c, opt_id, v); -+ if (ret < 0) -+ return ret; -+ -+ s.v = v + 1; -+ s.defined = true; -+ } else { -+ if (!IS_ROOT(dentry)) { -+ struct bch_inode_info *dir = -+ to_bch_ei(d_inode(dentry->d_parent)); -+ -+ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); -+ } else { -+ s.v = 0; -+ } -+ -+ s.defined = false; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ if (inode_opt_id == Inode_opt_project) { -+ /* -+ * inode fields accessible via the xattr interface are stored -+ * with a +1 bias, so that 0 means unset: -+ */ -+ ret = bch2_set_projid(c, inode, s.v ? 
s.v - 1 : 0); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); -+err: -+ mutex_unlock(&inode->ei_update_lock); -+ -+ if (value && -+ (opt_id == Opt_background_compression || -+ opt_id == Opt_background_target)) -+ bch2_rebalance_add_work(c, inode->v.i_blocks); -+ -+ return bch2_err_class(ret); -+} -+ -+static const struct xattr_handler bch_xattr_bcachefs_handler = { -+ .prefix = "bcachefs.", -+ .get = bch2_xattr_bcachefs_get, -+ .set = bch2_xattr_bcachefs_set, -+}; -+ -+static int bch2_xattr_bcachefs_get_effective( -+ const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, -+ name, buffer, size, true); -+} -+ -+static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { -+ .prefix = "bcachefs_effective.", -+ .get = bch2_xattr_bcachefs_get_effective, -+ .set = bch2_xattr_bcachefs_set, -+}; -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+const struct xattr_handler *bch2_xattr_handlers[] = { -+ &bch_xattr_user_handler, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ &nop_posix_acl_access, -+ &nop_posix_acl_default, -+#endif -+ &bch_xattr_trusted_handler, -+ &bch_xattr_security_handler, -+#ifndef NO_BCACHEFS_FS -+ &bch_xattr_bcachefs_handler, -+ &bch_xattr_bcachefs_effective_handler, -+#endif -+ NULL -+}; -+ -+static const struct xattr_handler *bch_xattr_handler_map[] = { -+ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, -+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = -+ &nop_posix_acl_access, -+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = -+ &nop_posix_acl_default, -+ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, -+ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, -+}; -+ -+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) -+{ -+ return type < ARRAY_SIZE(bch_xattr_handler_map) -+ ? 
bch_xattr_handler_map[type] -+ : NULL; -+} -diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h -new file mode 100644 -index 000000000..f5a52e3a6 ---- /dev/null -+++ b/fs/bcachefs/xattr.h -@@ -0,0 +1,50 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_XATTR_H -+#define _BCACHEFS_XATTR_H -+ -+#include "str_hash.h" -+ -+extern const struct bch_hash_desc bch2_xattr_hash_desc; -+ -+int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, -+ enum bkey_invalid_flags, struct printbuf *); -+void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_xattr ((struct bkey_ops) { \ -+ .key_invalid = bch2_xattr_invalid, \ -+ .val_to_text = bch2_xattr_to_text, \ -+ .min_val_size = 8, \ -+}) -+ -+static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) -+{ -+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + -+ name_len + val_len, sizeof(u64)); -+} -+ -+#define xattr_val(_xattr) \ -+ ((void *) (_xattr)->x_name + (_xattr)->x_name_len) -+ -+struct xattr_search_key { -+ u8 type; -+ struct qstr name; -+}; -+ -+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ -+ { .type = _type, .name = QSTR_INIT(_name, _len) }) -+ -+struct dentry; -+struct xattr_handler; -+struct bch_hash_info; -+struct bch_inode_info; -+ -+/* Exported for cmd_migrate.c in tools: */ -+int bch2_xattr_set(struct btree_trans *, subvol_inum, -+ struct bch_inode_unpacked *, const struct bch_hash_info *, -+ const char *, const void *, size_t, int, int); -+ -+ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -+ -+extern const struct xattr_handler *bch2_xattr_handlers[]; -+ -+#endif /* _BCACHEFS_XATTR_H */ -diff --git a/fs/dcache.c b/fs/dcache.c -index 52e6d5fda..dbdafa261 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -3249,11 +3249,10 @@ void d_genocide(struct dentry *parent) - - EXPORT_SYMBOL(d_genocide); - --void d_tmpfile(struct file *file, struct inode *inode) -+void d_mark_tmpfile(struct file *file, struct inode *inode) - { - struct dentry *dentry = file->f_path.dentry; - -- inode_dec_link_count(inode); - BUG_ON(dentry->d_name.name != dentry->d_iname || - !hlist_unhashed(&dentry->d_u.d_alias) || - !d_unlinked(dentry)); -@@ -3263,6 +3262,15 @@ void d_tmpfile(struct file *file, struct inode *inode) - (unsigned long long)inode->i_ino); - spin_unlock(&dentry->d_lock); - spin_unlock(&dentry->d_parent->d_lock); -+} -+EXPORT_SYMBOL(d_mark_tmpfile); -+ -+void d_tmpfile(struct file *file, struct inode *inode) -+{ -+ struct dentry *dentry = file->f_path.dentry; -+ -+ inode_dec_link_count(inode); -+ d_mark_tmpfile(file, inode); - d_instantiate(dentry, inode); - } - EXPORT_SYMBOL(d_tmpfile); -diff --git a/fs/inode.c b/fs/inode.c -index 67611a360..968931eb4 100644 ---- a/fs/inode.c -+++ b/fs/inode.c -@@ -56,8 +56,23 @@ - - static unsigned int i_hash_mask __read_mostly; - static unsigned int i_hash_shift __read_mostly; --static struct hlist_head *inode_hashtable __read_mostly; --static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); -+static struct hlist_bl_head *inode_hashtable __read_mostly; -+ -+static unsigned long hash(struct super_block *sb, unsigned long hashval) -+{ -+ unsigned long tmp; -+ -+ tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / -+ L1_CACHE_BYTES; -+ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); -+ return tmp & i_hash_mask; -+} -+ -+static inline struct hlist_bl_head *i_hash_head(struct super_block *sb, -+ unsigned int hashval) -+{ -+ return inode_hashtable + hash(sb, 
hashval); -+} - - /* - * Empty aops. Can be used for the cases where the user does not -@@ -416,7 +431,7 @@ EXPORT_SYMBOL(address_space_init_once); - void inode_init_once(struct inode *inode) - { - memset(inode, 0, sizeof(*inode)); -- INIT_HLIST_NODE(&inode->i_hash); -+ INIT_HLIST_BL_NODE(&inode->i_hash); - INIT_LIST_HEAD(&inode->i_devices); - INIT_LIST_HEAD(&inode->i_io_list); - INIT_LIST_HEAD(&inode->i_wb_list); -@@ -505,14 +520,15 @@ static inline void inode_sb_list_del(struct inode *inode) - } - } - --static unsigned long hash(struct super_block *sb, unsigned long hashval) -+/* -+ * Ensure that we store the hash head in the inode when we insert the inode into -+ * the hlist_bl_head... -+ */ -+static inline void -+__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b) - { -- unsigned long tmp; -- -- tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / -- L1_CACHE_BYTES; -- tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); -- return tmp & i_hash_mask; -+ hlist_bl_add_head_rcu(&inode->i_hash, b); -+ inode->i_hash_head = b; - } - - /** -@@ -525,13 +541,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) - */ - void __insert_inode_hash(struct inode *inode, unsigned long hashval) - { -- struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); -+ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); - -- spin_lock(&inode_hash_lock); -+ hlist_bl_lock(b); - spin_lock(&inode->i_lock); -- hlist_add_head_rcu(&inode->i_hash, b); -+ __insert_inode_hash_head(inode, b); - spin_unlock(&inode->i_lock); -- spin_unlock(&inode_hash_lock); -+ hlist_bl_unlock(b); - } - EXPORT_SYMBOL(__insert_inode_hash); - -@@ -543,11 +559,44 @@ EXPORT_SYMBOL(__insert_inode_hash); - */ - void __remove_inode_hash(struct inode *inode) - { -- spin_lock(&inode_hash_lock); -- spin_lock(&inode->i_lock); -- hlist_del_init_rcu(&inode->i_hash); -- spin_unlock(&inode->i_lock); -- spin_unlock(&inode_hash_lock); -+ struct hlist_bl_head *b = inode->i_hash_head; -+ -+ /* -+ * There are some callers that come through here without synchronisation -+ * and potentially with multiple references to the inode. Hence we have -+ * to handle the case that we might race with a remove and insert to a -+ * different list. Coda, in particular, seems to have a userspace API -+ * that can directly trigger "unhash/rehash to different list" behaviour -+ * without any serialisation at all. -+ * -+ * Hence we have to handle the situation where the inode->i_hash_head -+ * might point to a different list than what we expect, indicating that -+ * we raced with another unhash and potentially a new insertion. This -+ * means we have to retest the head once we have everything locked up -+ * and loop again if it doesn't match. -+ */ -+ while (b) { -+ hlist_bl_lock(b); -+ spin_lock(&inode->i_lock); -+ if (b != inode->i_hash_head) { -+ hlist_bl_unlock(b); -+ b = inode->i_hash_head; -+ spin_unlock(&inode->i_lock); -+ continue; -+ } -+ /* -+ * Need to set the pprev pointer to NULL after list removal so -+ * that both RCU traversals and hlist_bl_unhashed() work -+ * correctly at this point. 
-+ */ -+ hlist_bl_del_rcu(&inode->i_hash); -+ inode->i_hash.pprev = NULL; -+ inode->i_hash_head = NULL; -+ spin_unlock(&inode->i_lock); -+ hlist_bl_unlock(b); -+ break; -+ } -+ - } - EXPORT_SYMBOL(__remove_inode_hash); - -@@ -896,26 +945,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) - return freed; - } - --static void __wait_on_freeing_inode(struct inode *inode); -+static void __wait_on_freeing_inode(struct hlist_bl_head *b, -+ struct inode *inode); - /* - * Called with the inode lock held. - */ - static struct inode *find_inode(struct super_block *sb, -- struct hlist_head *head, -+ struct hlist_bl_head *b, - int (*test)(struct inode *, void *), - void *data) - { -+ struct hlist_bl_node *node; - struct inode *inode = NULL; - - repeat: -- hlist_for_each_entry(inode, head, i_hash) { -+ hlist_bl_for_each_entry(inode, node, b, i_hash) { - if (inode->i_sb != sb) - continue; - if (!test(inode, data)) - continue; - spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE)) { -- __wait_on_freeing_inode(inode); -+ __wait_on_freeing_inode(b, inode); - goto repeat; - } - if (unlikely(inode->i_state & I_CREATING)) { -@@ -934,19 +985,20 @@ static struct inode *find_inode(struct super_block *sb, - * iget_locked for details. - */ - static struct inode *find_inode_fast(struct super_block *sb, -- struct hlist_head *head, unsigned long ino) -+ struct hlist_bl_head *b, unsigned long ino) - { -+ struct hlist_bl_node *node; - struct inode *inode = NULL; - - repeat: -- hlist_for_each_entry(inode, head, i_hash) { -+ hlist_bl_for_each_entry(inode, node, b, i_hash) { - if (inode->i_ino != ino) - continue; - if (inode->i_sb != sb) - continue; - spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE)) { -- __wait_on_freeing_inode(inode); -+ __wait_on_freeing_inode(b, inode); - goto repeat; - } - if (unlikely(inode->i_state & I_CREATING)) { -@@ -1196,25 +1248,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories); - * return it locked, hashed, and with the I_NEW flag set. The file system gets - * to fill it in before unlocking it via unlock_new_inode(). - * -- * Note both @test and @set are called with the inode_hash_lock held, so can't -- * sleep. -+ * Note both @test and @set are called with the inode hash chain lock held, -+ * so can't sleep. - */ - struct inode *inode_insert5(struct inode *inode, unsigned long hashval, - int (*test)(struct inode *, void *), - int (*set)(struct inode *, void *), void *data) - { -- struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); -+ struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); - struct inode *old; - - again: -- spin_lock(&inode_hash_lock); -- old = find_inode(inode->i_sb, head, test, data); -+ hlist_bl_lock(b); -+ old = find_inode(inode->i_sb, b, test, data); - if (unlikely(old)) { - /* - * Uhhuh, somebody else created the same inode under us. - * Use the old inode instead of the preallocated one. 
- */ -- spin_unlock(&inode_hash_lock); -+ hlist_bl_unlock(b); - if (IS_ERR(old)) - return NULL; - wait_on_inode(old); -@@ -1236,7 +1288,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, - */ - spin_lock(&inode->i_lock); - inode->i_state |= I_NEW; -- hlist_add_head_rcu(&inode->i_hash, head); -+ __insert_inode_hash_head(inode, b); - spin_unlock(&inode->i_lock); - - /* -@@ -1246,7 +1298,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, - if (list_empty(&inode->i_sb_list)) - inode_sb_list_add(inode); - unlock: -- spin_unlock(&inode_hash_lock); -+ hlist_bl_unlock(b); - - return inode; - } -@@ -1307,12 +1359,12 @@ EXPORT_SYMBOL(iget5_locked); - */ - struct inode *iget_locked(struct super_block *sb, unsigned long ino) - { -- struct hlist_head *head = inode_hashtable + hash(sb, ino); -+ struct hlist_bl_head *b = i_hash_head(sb, ino); - struct inode *inode; - again: -- spin_lock(&inode_hash_lock); -- inode = find_inode_fast(sb, head, ino); -- spin_unlock(&inode_hash_lock); -+ hlist_bl_lock(b); -+ inode = find_inode_fast(sb, b, ino); -+ hlist_bl_unlock(b); - if (inode) { - if (IS_ERR(inode)) - return NULL; -@@ -1328,17 +1380,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) - if (inode) { - struct inode *old; - -- spin_lock(&inode_hash_lock); -+ hlist_bl_lock(b); - /* We released the lock, so.. */ -- old = find_inode_fast(sb, head, ino); -+ old = find_inode_fast(sb, b, ino); - if (!old) { - inode->i_ino = ino; - spin_lock(&inode->i_lock); - inode->i_state = I_NEW; -- hlist_add_head_rcu(&inode->i_hash, head); -+ __insert_inode_hash_head(inode, b); - spin_unlock(&inode->i_lock); - inode_sb_list_add(inode); -- spin_unlock(&inode_hash_lock); -+ hlist_bl_unlock(b); - - /* Return the locked inode with I_NEW set, the - * caller is responsible for filling in the contents -@@ -1351,7 +1403,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) - * us. Use the old inode instead of the one we just - * allocated. - */ -- spin_unlock(&inode_hash_lock); -+ hlist_bl_unlock(b); - destroy_inode(inode); - if (IS_ERR(old)) - return NULL; -@@ -1375,10 +1427,11 @@ EXPORT_SYMBOL(iget_locked); - */ - static int test_inode_iunique(struct super_block *sb, unsigned long ino) - { -- struct hlist_head *b = inode_hashtable + hash(sb, ino); -+ struct hlist_bl_head *b = i_hash_head(sb, ino); -+ struct hlist_bl_node *node; - struct inode *inode; - -- hlist_for_each_entry_rcu(inode, b, i_hash) { -+ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { - if (inode->i_ino == ino && inode->i_sb == sb) - return 0; - } -@@ -1462,12 +1515,12 @@ EXPORT_SYMBOL(igrab); - struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), void *data) - { -- struct hlist_head *head = inode_hashtable + hash(sb, hashval); -+ struct hlist_bl_head *b = i_hash_head(sb, hashval); - struct inode *inode; - -- spin_lock(&inode_hash_lock); -- inode = find_inode(sb, head, test, data); -- spin_unlock(&inode_hash_lock); -+ hlist_bl_lock(b); -+ inode = find_inode(sb, b, test, data); -+ hlist_bl_unlock(b); - - return IS_ERR(inode) ? 
NULL : inode; - } -@@ -1517,12 +1570,12 @@ EXPORT_SYMBOL(ilookup5); - */ - struct inode *ilookup(struct super_block *sb, unsigned long ino) - { -- struct hlist_head *head = inode_hashtable + hash(sb, ino); -+ struct hlist_bl_head *b = i_hash_head(sb, ino); - struct inode *inode; - again: -- spin_lock(&inode_hash_lock); -- inode = find_inode_fast(sb, head, ino); -- spin_unlock(&inode_hash_lock); -+ hlist_bl_lock(b); -+ inode = find_inode_fast(sb, b, ino); -+ hlist_bl_unlock(b); - - if (inode) { - if (IS_ERR(inode)) -@@ -1566,12 +1619,13 @@ struct inode *find_inode_nowait(struct super_block *sb, - void *), - void *data) - { -- struct hlist_head *head = inode_hashtable + hash(sb, hashval); -+ struct hlist_bl_head *b = i_hash_head(sb, hashval); -+ struct hlist_bl_node *node; - struct inode *inode, *ret_inode = NULL; - int mval; - -- spin_lock(&inode_hash_lock); -- hlist_for_each_entry(inode, head, i_hash) { -+ hlist_bl_lock(b); -+ hlist_bl_for_each_entry(inode, node, b, i_hash) { - if (inode->i_sb != sb) - continue; - mval = match(inode, hashval, data); -@@ -1582,7 +1636,7 @@ struct inode *find_inode_nowait(struct super_block *sb, - goto out; - } - out: -- spin_unlock(&inode_hash_lock); -+ hlist_bl_unlock(b); - return ret_inode; - } - EXPORT_SYMBOL(find_inode_nowait); -@@ -1611,13 +1665,14 @@ EXPORT_SYMBOL(find_inode_nowait); - struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, - int (*test)(struct inode *, void *), void *data) - { -- struct hlist_head *head = inode_hashtable + hash(sb, hashval); -+ struct hlist_bl_head *b = i_hash_head(sb, hashval); -+ struct hlist_bl_node *node; - struct inode *inode; - - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), - "suspicious find_inode_rcu() usage"); - -- hlist_for_each_entry_rcu(inode, head, i_hash) { -+ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { - if (inode->i_sb == sb && - !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && - test(inode, data)) -@@ -1649,13 +1704,14 @@ EXPORT_SYMBOL(find_inode_rcu); - struct inode *find_inode_by_ino_rcu(struct super_block *sb, - unsigned long ino) - { -- struct hlist_head *head = inode_hashtable + hash(sb, ino); -+ struct hlist_bl_head *b = i_hash_head(sb, ino); -+ struct hlist_bl_node *node; - struct inode *inode; - - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), - "suspicious find_inode_by_ino_rcu() usage"); - -- hlist_for_each_entry_rcu(inode, head, i_hash) { -+ hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { - if (inode->i_ino == ino && - inode->i_sb == sb && - !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) -@@ -1669,39 +1725,42 @@ int insert_inode_locked(struct inode *inode) - { - struct super_block *sb = inode->i_sb; - ino_t ino = inode->i_ino; -- struct hlist_head *head = inode_hashtable + hash(sb, ino); -+ struct hlist_bl_head *b = i_hash_head(sb, ino); - - while (1) { -- struct inode *old = NULL; -- spin_lock(&inode_hash_lock); -- hlist_for_each_entry(old, head, i_hash) { -- if (old->i_ino != ino) -+ struct hlist_bl_node *node; -+ struct inode *old = NULL, *t; -+ -+ hlist_bl_lock(b); -+ hlist_bl_for_each_entry(t, node, b, i_hash) { -+ if (t->i_ino != ino) - continue; -- if (old->i_sb != sb) -+ if (t->i_sb != sb) - continue; -- spin_lock(&old->i_lock); -- if (old->i_state & (I_FREEING|I_WILL_FREE)) { -- spin_unlock(&old->i_lock); -+ spin_lock(&t->i_lock); -+ if (t->i_state & (I_FREEING|I_WILL_FREE)) { -+ spin_unlock(&t->i_lock); - continue; - } -+ old = t; - break; - } - if (likely(!old)) { - spin_lock(&inode->i_lock); - inode->i_state |= I_NEW | I_CREATING; 
-- hlist_add_head_rcu(&inode->i_hash, head); -+ __insert_inode_hash_head(inode, b); - spin_unlock(&inode->i_lock); -- spin_unlock(&inode_hash_lock); -+ hlist_bl_unlock(b); - return 0; - } - if (unlikely(old->i_state & I_CREATING)) { - spin_unlock(&old->i_lock); -- spin_unlock(&inode_hash_lock); -+ hlist_bl_unlock(b); - return -EBUSY; - } - __iget(old); - spin_unlock(&old->i_lock); -- spin_unlock(&inode_hash_lock); -+ hlist_bl_unlock(b); - wait_on_inode(old); - if (unlikely(!inode_unhashed(old))) { - iput(old); -@@ -2226,17 +2285,18 @@ EXPORT_SYMBOL(inode_needs_sync); - * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list - * will DTRT. - */ --static void __wait_on_freeing_inode(struct inode *inode) -+static void __wait_on_freeing_inode(struct hlist_bl_head *b, -+ struct inode *inode) - { - wait_queue_head_t *wq; - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); - wq = bit_waitqueue(&inode->i_state, __I_NEW); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); -- spin_unlock(&inode_hash_lock); -+ hlist_bl_unlock(b); - schedule(); - finish_wait(wq, &wait.wq_entry); -- spin_lock(&inode_hash_lock); -+ hlist_bl_lock(b); - } - - static __initdata unsigned long ihash_entries; -@@ -2262,7 +2322,7 @@ void __init inode_init_early(void) - - inode_hashtable = - alloc_large_system_hash("Inode-cache", -- sizeof(struct hlist_head), -+ sizeof(struct hlist_bl_head), - ihash_entries, - 14, - HASH_EARLY | HASH_ZERO, -@@ -2288,7 +2348,7 @@ void __init inode_init(void) - - inode_hashtable = - alloc_large_system_hash("Inode-cache", -- sizeof(struct hlist_head), -+ sizeof(struct hlist_bl_head), - ihash_entries, - 14, - HASH_ZERO, -diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c -index aa8967cca..72d32603f 100644 ---- a/fs/iomap/buffered-io.c -+++ b/fs/iomap/buffered-io.c -@@ -292,8 +292,12 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, - gfp_t orig_gfp = gfp; - unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); - -- if (ctx->bio) -- submit_bio(ctx->bio); -+ if (ctx->bio) { -+ if (iomap->flags & IOMAP_F_NOSUBMIT) -+ bio_endio(ctx->bio); -+ else -+ submit_bio(ctx->bio); -+ } - - if (ctx->rac) /* same as readahead_gfp_mask */ - gfp |= __GFP_NORETRY | __GFP_NOWARN; -@@ -346,7 +350,10 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) - folio_set_error(folio); - - if (ctx.bio) { -- submit_bio(ctx.bio); -+ if (iter.iomap.flags & IOMAP_F_NOSUBMIT) -+ bio_endio(ctx.bio); -+ else -+ submit_bio(ctx.bio); - WARN_ON_ONCE(!ctx.cur_folio_in_bio); - } else { - WARN_ON_ONCE(ctx.cur_folio_in_bio); -@@ -418,8 +425,12 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) - while (iomap_iter(&iter, ops) > 0) - iter.processed = iomap_readahead_iter(&iter, &ctx); - -- if (ctx.bio) -- submit_bio(ctx.bio); -+ if (ctx.bio) { -+ if (iter.iomap.flags & IOMAP_F_NOSUBMIT) -+ bio_endio(ctx.bio); -+ else -+ submit_bio(ctx.bio); -+ } - if (ctx.cur_folio) { - if (!ctx.cur_folio_in_bio) - folio_unlock(ctx.cur_folio); -@@ -536,11 +547,17 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, - { - struct bio_vec bvec; - struct bio bio; -+ int ret = 0; - - bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); - bio_add_folio_nofail(&bio, folio, plen, poff); -- return submit_bio_wait(&bio); -+ -+ if (iomap->flags & IOMAP_F_NOSUBMIT) -+ bio_endio(&bio); -+ else -+ ret = submit_bio_wait(&bio); -+ return ret; - } - - 
static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, -@@ -1489,7 +1506,10 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, - return error; - } - -- submit_bio(ioend->io_bio); -+ if (wpc->iomap.flags & IOMAP_F_NOSUBMIT) -+ bio_endio(ioend->io_bio); -+ else -+ submit_bio(ioend->io_bio); - return 0; - } - -@@ -1527,8 +1547,9 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, - * traversal in iomap_finish_ioend(). - */ - static struct bio * --iomap_chain_bio(struct bio *prev) -+iomap_chain_bio(struct iomap_writepage_ctx *wpc) - { -+ struct bio *prev = wpc->ioend->io_bio; - struct bio *new; - - new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); -@@ -1537,7 +1558,11 @@ iomap_chain_bio(struct bio *prev) - - bio_chain(prev, new); - bio_get(prev); /* for iomap_finish_ioend */ -- submit_bio(prev); -+ -+ if (wpc->iomap.flags & IOMAP_F_NOSUBMIT) -+ bio_endio(prev); -+ else -+ submit_bio(prev); - return new; - } - -@@ -1584,7 +1609,7 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, - } - - if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { -- wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); -+ wpc->ioend->io_bio = iomap_chain_bio(wpc); - bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); - } - -diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c -index 18c8f168b..f0003446f 100644 ---- a/fs/xfs/xfs_iomap.c -+++ b/fs/xfs/xfs_iomap.c -@@ -99,6 +99,9 @@ xfs_bmbt_to_iomap( - struct xfs_mount *mp = ip->i_mount; - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - -+ if (xfs_has_nodataio(mp)) -+ iomap_flags |= IOMAP_F_NOSUBMIT; -+ - if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) - return xfs_alert_fsblock_zero(ip, imap); - -diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h -index e2866e7fa..29ecb5643 100644 ---- a/fs/xfs/xfs_mount.h -+++ b/fs/xfs/xfs_mount.h -@@ -284,6 +284,7 @@ typedef struct xfs_mount { - #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ - - /* Mount features */ -+#define XFS_FEAT_NODATAIO (1ULL << 47) /* skip all data I/O */ - #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ - #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ - #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ -@@ -353,6 +354,7 @@ __XFS_HAS_FEAT(large_extent_counts, NREXT64) - * bit inodes and read-only state, are kept as operational state rather than - * features. 
- */ -+__XFS_HAS_FEAT(nodataio, NODATAIO) - __XFS_HAS_FEAT(noattr2, NOATTR2) - __XFS_HAS_FEAT(noalign, NOALIGN) - __XFS_HAS_FEAT(allocsize, ALLOCSIZE) -diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c -index 818510243..b6cdce43c 100644 ---- a/fs/xfs/xfs_super.c -+++ b/fs/xfs/xfs_super.c -@@ -121,7 +121,7 @@ enum { - Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, - Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, - Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, -- Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, -+ Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_nodataio, - }; - - static const struct fs_parameter_spec xfs_fs_parameters[] = { -@@ -166,6 +166,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { - fsparam_flag("nodiscard", Opt_nodiscard), - fsparam_flag("dax", Opt_dax), - fsparam_enum("dax", Opt_dax_enum, dax_param_enums), -+ fsparam_flag("nodataio", Opt_nodataio), - {} - }; - -@@ -1396,6 +1397,9 @@ xfs_fs_parse_param( - xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); - parsing_mp->m_features |= XFS_FEAT_NOATTR2; - return 0; -+ case Opt_nodataio: -+ parsing_mp->m_features |= XFS_FEAT_NODATAIO; -+ return 0; - default: - xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); - return -EINVAL; -diff --git a/include/linux/bio.h b/include/linux/bio.h -index 11984ed29..debbd8fcb 100644 ---- a/include/linux/bio.h -+++ b/include/linux/bio.h -@@ -488,7 +488,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, - extern void bio_copy_data(struct bio *dst, struct bio *src); - extern void bio_free_pages(struct bio *bio); - void guard_bio_eod(struct bio *bio); --void zero_fill_bio(struct bio *bio); -+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); -+ -+static inline void zero_fill_bio(struct bio *bio) -+{ -+ zero_fill_bio_iter(bio, bio->bi_iter); -+} - - static inline void bio_release_pages(struct bio *bio, bool mark_dirty) - { -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 87d94be78..61ffaaba4 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -846,6 +846,7 @@ extern const char *blk_op_str(enum req_op op); - - int blk_status_to_errno(blk_status_t status); - blk_status_t errno_to_blk_status(int errno); -+const char *blk_status_to_str(blk_status_t status); - - /* only poll the hardware once, don't continue until a completion was found */ - #define BLK_POLL_ONESHOT (1 << 0) -diff --git a/drivers/md/bcache/closure.h b/include/linux/closure.h -similarity index 93% -rename from drivers/md/bcache/closure.h -rename to include/linux/closure.h -index c88cdc4ae..722a586bb 100644 ---- a/drivers/md/bcache/closure.h -+++ b/include/linux/closure.h -@@ -155,7 +155,7 @@ struct closure { - - atomic_t remaining; - --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -+#ifdef CONFIG_DEBUG_CLOSURES - #define CLOSURE_MAGIC_DEAD 0xc054dead - #define CLOSURE_MAGIC_ALIVE 0xc054a11e - -@@ -172,6 +172,11 @@ void __closure_wake_up(struct closure_waitlist *list); - bool closure_wait(struct closure_waitlist *list, struct closure *cl); - void __closure_sync(struct closure *cl); - -+static inline unsigned closure_nr_remaining(struct closure *cl) -+{ -+ return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK; -+} -+ - /** - * closure_sync - sleep until a closure a closure has nothing left to wait on - * -@@ -180,19 +185,17 @@ void __closure_sync(struct closure *cl); - */ - static inline void closure_sync(struct closure *cl) - { -- if 
((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) -+ if (closure_nr_remaining(cl) != 1) - __closure_sync(cl); - } - --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -+#ifdef CONFIG_DEBUG_CLOSURES - --void closure_debug_init(void); - void closure_debug_create(struct closure *cl); - void closure_debug_destroy(struct closure *cl); - - #else - --static inline void closure_debug_init(void) {} - static inline void closure_debug_create(struct closure *cl) {} - static inline void closure_debug_destroy(struct closure *cl) {} - -@@ -200,21 +203,21 @@ static inline void closure_debug_destroy(struct closure *cl) {} - - static inline void closure_set_ip(struct closure *cl) - { --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -+#ifdef CONFIG_DEBUG_CLOSURES - cl->ip = _THIS_IP_; - #endif - } - - static inline void closure_set_ret_ip(struct closure *cl) - { --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -+#ifdef CONFIG_DEBUG_CLOSURES - cl->ip = _RET_IP_; - #endif - } - - static inline void closure_set_waiting(struct closure *cl, unsigned long f) - { --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -+#ifdef CONFIG_DEBUG_CLOSURES - cl->waiting_on = f; - #endif - } -@@ -243,6 +246,7 @@ static inline void closure_queue(struct closure *cl) - */ - BUILD_BUG_ON(offsetof(struct closure, fn) - != offsetof(struct work_struct, func)); -+ - if (wq) { - INIT_WORK(&cl->work, cl->work.func); - BUG_ON(!queue_work(wq, &cl->work)); -@@ -255,7 +259,7 @@ static inline void closure_queue(struct closure *cl) - */ - static inline void closure_get(struct closure *cl) - { --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -+#ifdef CONFIG_DEBUG_CLOSURES - BUG_ON((atomic_inc_return(&cl->remaining) & - CLOSURE_REMAINING_MASK) <= 1); - #else -@@ -271,7 +275,7 @@ static inline void closure_get(struct closure *cl) - */ - static inline void closure_init(struct closure *cl, struct closure *parent) - { -- memset(cl, 0, sizeof(struct closure)); -+ cl->fn = NULL; - cl->parent = parent; - if (parent) - closure_get(parent); -@@ -375,4 +379,26 @@ static inline void closure_call(struct closure *cl, closure_fn fn, - continue_at_nobarrier(cl, fn, wq); - } - -+#define __closure_wait_event(waitlist, _cond) \ -+do { \ -+ struct closure cl; \ -+ \ -+ closure_init_stack(&cl); \ -+ \ -+ while (1) { \ -+ closure_wait(waitlist, &cl); \ -+ if (_cond) \ -+ break; \ -+ closure_sync(&cl); \ -+ } \ -+ closure_wake_up(waitlist); \ -+ closure_sync(&cl); \ -+} while (0) -+ -+#define closure_wait_event(waitlist, _cond) \ -+do { \ -+ if (!(_cond)) \ -+ __closure_wait_event(waitlist, _cond); \ -+} while (0) -+ - #endif /* _LINUX_CLOSURE_H */ -diff --git a/include/linux/dcache.h b/include/linux/dcache.h -index 6b351e009..3da2f0545 100644 ---- a/include/linux/dcache.h -+++ b/include/linux/dcache.h -@@ -251,6 +251,7 @@ extern struct dentry * d_make_root(struct inode *); - /* - the ramfs-type tree */ - extern void d_genocide(struct dentry *); - -+extern void d_mark_tmpfile(struct file *, struct inode *); - extern void d_tmpfile(struct file *, struct inode *); - - extern struct dentry *d_find_alias(struct inode *); -diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h -index 11fbd0ee1..f49a7d311 100644 ---- a/include/linux/exportfs.h -+++ b/include/linux/exportfs.h -@@ -98,6 +98,12 @@ enum fid_type { - */ - FILEID_FAT_WITH_PARENT = 0x72, - -+ /* -+ * 64 bit inode number, 32 bit subvolume, 32 bit generation number: -+ */ -+ FILEID_BCACHEFS_WITHOUT_PARENT = 0x80, -+ FILEID_BCACHEFS_WITH_PARENT = 0x81, -+ - /* - * 128 bit child FID (struct lu_fid) - * 128 bit parent FID (struct lu_fid) -diff --git 
a/include/linux/fs.h b/include/linux/fs.h -index 562f2623c..810fa0812 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -660,7 +660,8 @@ struct inode { - unsigned long dirtied_when; /* jiffies of first dirtying */ - unsigned long dirtied_time_when; - -- struct hlist_node i_hash; -+ struct hlist_bl_node i_hash; -+ struct hlist_bl_head *i_hash_head; - struct list_head i_io_list; /* backing dev IO list */ - #ifdef CONFIG_CGROUP_WRITEBACK - struct bdi_writeback *i_wb; /* the associated cgroup wb */ -@@ -726,7 +727,7 @@ static inline unsigned int i_blocksize(const struct inode *node) - - static inline int inode_unhashed(struct inode *inode) - { -- return hlist_unhashed(&inode->i_hash); -+ return hlist_bl_unhashed(&inode->i_hash); - } - - /* -@@ -737,7 +738,7 @@ static inline int inode_unhashed(struct inode *inode) - */ - static inline void inode_fake_hash(struct inode *inode) - { -- hlist_add_fake(&inode->i_hash); -+ hlist_bl_add_fake(&inode->i_hash); - } - - /* -@@ -2729,11 +2730,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, - * This must be used for allocating filesystems specific inodes to set - * up the inode reclaim context correctly. - */ --static inline void * --alloc_inode_sb(struct super_block *sb, struct kmem_cache *cache, gfp_t gfp) --{ -- return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp); --} -+#define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) - - extern void __insert_inode_hash(struct inode *, unsigned long hashval); - static inline void insert_inode_hash(struct inode *inode) -@@ -2744,7 +2741,7 @@ static inline void insert_inode_hash(struct inode *inode) - extern void __remove_inode_hash(struct inode *); - static inline void remove_inode_hash(struct inode *inode) - { -- if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) -+ if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash)) - __remove_inode_hash(inode); - } - -diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h -index 107613f7d..c74b73769 100644 ---- a/include/linux/generic-radix-tree.h -+++ b/include/linux/generic-radix-tree.h -@@ -38,6 +38,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -116,6 +117,11 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) - - #define __genradix_cast(_radix) (typeof((_radix)->type[0]) *) - #define __genradix_obj_size(_radix) sizeof((_radix)->type[0]) -+#define __genradix_objs_per_page(_radix) \ -+ (PAGE_SIZE / sizeof((_radix)->type[0])) -+#define __genradix_page_remainder(_radix) \ -+ (PAGE_SIZE % sizeof((_radix)->type[0])) -+ - #define __genradix_idx_to_offset(_radix, _idx) \ - __idx_to_offset(_idx, __genradix_obj_size(_radix)) - -@@ -179,11 +185,35 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t); - #define genradix_iter_peek(_iter, _radix) \ - (__genradix_cast(_radix) \ - __genradix_iter_peek(_iter, &(_radix)->tree, \ -- PAGE_SIZE / __genradix_obj_size(_radix))) -+ __genradix_objs_per_page(_radix))) -+ -+void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *, -+ size_t, size_t); -+ -+/** -+ * genradix_iter_peek - get first entry at or below iterator's current -+ * position -+ * @_iter: a genradix_iter -+ * @_radix: genradix being iterated over -+ * -+ * If no more entries exist at or below @_iter's current position, returns NULL -+ */ -+#define genradix_iter_peek_prev(_iter, _radix) \ -+ (__genradix_cast(_radix) \ -+ __genradix_iter_peek_prev(_iter, &(_radix)->tree, \ 
-+ __genradix_objs_per_page(_radix), \ -+ __genradix_obj_size(_radix) + \ -+ __genradix_page_remainder(_radix))) - - static inline void __genradix_iter_advance(struct genradix_iter *iter, - size_t obj_size) - { -+ if (iter->offset + obj_size < iter->offset) { -+ iter->offset = SIZE_MAX; -+ iter->pos = SIZE_MAX; -+ return; -+ } -+ - iter->offset += obj_size; - - if (!is_power_of_2(obj_size) && -@@ -196,6 +226,25 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, - #define genradix_iter_advance(_iter, _radix) \ - __genradix_iter_advance(_iter, __genradix_obj_size(_radix)) - -+static inline void __genradix_iter_rewind(struct genradix_iter *iter, -+ size_t obj_size) -+{ -+ if (iter->offset == 0 || -+ iter->offset == SIZE_MAX) { -+ iter->offset = SIZE_MAX; -+ return; -+ } -+ -+ if ((iter->offset & (PAGE_SIZE - 1)) == 0) -+ iter->offset -= PAGE_SIZE % obj_size; -+ -+ iter->offset -= obj_size; -+ iter->pos--; -+} -+ -+#define genradix_iter_rewind(_iter, _radix) \ -+ __genradix_iter_rewind(_iter, __genradix_obj_size(_radix)) -+ - #define genradix_for_each_from(_radix, _iter, _p, _start) \ - for (_iter = genradix_iter_init(_radix, _start); \ - (_p = genradix_iter_peek(&_iter, _radix)) != NULL; \ -@@ -213,6 +262,23 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, - #define genradix_for_each(_radix, _iter, _p) \ - genradix_for_each_from(_radix, _iter, _p, 0) - -+#define genradix_last_pos(_radix) \ -+ (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1) -+ -+/** -+ * genradix_for_each_reverse - iterate over entry in a genradix, reverse order -+ * @_radix: genradix to iterate over -+ * @_iter: a genradix_iter to track current position -+ * @_p: pointer to genradix entry type -+ * -+ * On every iteration, @_p will point to the current entry, and @_iter.pos -+ * will be the current entry's index. -+ */ -+#define genradix_for_each_reverse(_radix, _iter, _p) \ -+ for (_iter = genradix_iter_init(_radix, genradix_last_pos(_radix));\ -+ (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\ -+ genradix_iter_rewind(&_iter, _radix)) -+ - int __genradix_prealloc(struct __genradix *, size_t, gfp_t); - - /** -diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h -index 6583a5867..3fbe62476 100644 ---- a/include/linux/gfp_types.h -+++ b/include/linux/gfp_types.h -@@ -21,44 +21,78 @@ typedef unsigned int __bitwise gfp_t; - * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c - */ - -+enum { -+ ___GFP_DMA_BIT, -+ ___GFP_HIGHMEM_BIT, -+ ___GFP_DMA32_BIT, -+ ___GFP_MOVABLE_BIT, -+ ___GFP_RECLAIMABLE_BIT, -+ ___GFP_HIGH_BIT, -+ ___GFP_IO_BIT, -+ ___GFP_FS_BIT, -+ ___GFP_ZERO_BIT, -+ ___GFP_UNUSED_BIT, /* 0x200u unused */ -+ ___GFP_DIRECT_RECLAIM_BIT, -+ ___GFP_KSWAPD_RECLAIM_BIT, -+ ___GFP_WRITE_BIT, -+ ___GFP_NOWARN_BIT, -+ ___GFP_RETRY_MAYFAIL_BIT, -+ ___GFP_NOFAIL_BIT, -+ ___GFP_NORETRY_BIT, -+ ___GFP_MEMALLOC_BIT, -+ ___GFP_COMP_BIT, -+ ___GFP_NOMEMALLOC_BIT, -+ ___GFP_HARDWALL_BIT, -+ ___GFP_THISNODE_BIT, -+ ___GFP_ACCOUNT_BIT, -+ ___GFP_ZEROTAGS_BIT, -+#ifdef CONFIG_KASAN_HW_TAGS -+ ___GFP_SKIP_ZERO_BIT, -+ ___GFP_SKIP_KASAN_BIT, -+#endif -+#ifdef CONFIG_LOCKDEP -+ ___GFP_NOLOCKDEP_BIT, -+#endif -+ ___GFP_LAST_BIT -+}; -+ - /* Plain integer GFP bitmasks. Do not use this directly. 
*/ --#define ___GFP_DMA 0x01u --#define ___GFP_HIGHMEM 0x02u --#define ___GFP_DMA32 0x04u --#define ___GFP_MOVABLE 0x08u --#define ___GFP_RECLAIMABLE 0x10u --#define ___GFP_HIGH 0x20u --#define ___GFP_IO 0x40u --#define ___GFP_FS 0x80u --#define ___GFP_ZERO 0x100u -+#define ___GFP_DMA BIT(___GFP_DMA_BIT) -+#define ___GFP_HIGHMEM BIT(___GFP_HIGHMEM_BIT) -+#define ___GFP_DMA32 BIT(___GFP_DMA32_BIT) -+#define ___GFP_MOVABLE BIT(___GFP_MOVABLE_BIT) -+#define ___GFP_RECLAIMABLE BIT(___GFP_RECLAIMABLE_BIT) -+#define ___GFP_HIGH BIT(___GFP_HIGH_BIT) -+#define ___GFP_IO BIT(___GFP_IO_BIT) -+#define ___GFP_FS BIT(___GFP_FS_BIT) -+#define ___GFP_ZERO BIT(___GFP_ZERO_BIT) - /* 0x200u unused */ --#define ___GFP_DIRECT_RECLAIM 0x400u --#define ___GFP_KSWAPD_RECLAIM 0x800u --#define ___GFP_WRITE 0x1000u --#define ___GFP_NOWARN 0x2000u --#define ___GFP_RETRY_MAYFAIL 0x4000u --#define ___GFP_NOFAIL 0x8000u --#define ___GFP_NORETRY 0x10000u --#define ___GFP_MEMALLOC 0x20000u --#define ___GFP_COMP 0x40000u --#define ___GFP_NOMEMALLOC 0x80000u --#define ___GFP_HARDWALL 0x100000u --#define ___GFP_THISNODE 0x200000u --#define ___GFP_ACCOUNT 0x400000u --#define ___GFP_ZEROTAGS 0x800000u -+#define ___GFP_DIRECT_RECLAIM BIT(___GFP_DIRECT_RECLAIM_BIT) -+#define ___GFP_KSWAPD_RECLAIM BIT(___GFP_KSWAPD_RECLAIM_BIT) -+#define ___GFP_WRITE BIT(___GFP_WRITE_BIT) -+#define ___GFP_NOWARN BIT(___GFP_NOWARN_BIT) -+#define ___GFP_RETRY_MAYFAIL BIT(___GFP_RETRY_MAYFAIL_BIT) -+#define ___GFP_NOFAIL BIT(___GFP_NOFAIL_BIT) -+#define ___GFP_NORETRY BIT(___GFP_NORETRY_BIT) -+#define ___GFP_MEMALLOC BIT(___GFP_MEMALLOC_BIT) -+#define ___GFP_COMP BIT(___GFP_COMP_BIT) -+#define ___GFP_NOMEMALLOC BIT(___GFP_NOMEMALLOC_BIT) -+#define ___GFP_HARDWALL BIT(___GFP_HARDWALL_BIT) -+#define ___GFP_THISNODE BIT(___GFP_THISNODE_BIT) -+#define ___GFP_ACCOUNT BIT(___GFP_ACCOUNT_BIT) -+#define ___GFP_ZEROTAGS BIT(___GFP_ZEROTAGS_BIT) - #ifdef CONFIG_KASAN_HW_TAGS --#define ___GFP_SKIP_ZERO 0x1000000u --#define ___GFP_SKIP_KASAN 0x2000000u -+#define ___GFP_SKIP_ZERO BIT(___GFP_SKIP_ZERO_BIT) -+#define ___GFP_SKIP_KASAN BIT(___GFP_SKIP_KASAN_BIT) - #else - #define ___GFP_SKIP_ZERO 0 - #define ___GFP_SKIP_KASAN 0 - #endif - #ifdef CONFIG_LOCKDEP --#define ___GFP_NOLOCKDEP 0x4000000u -+#define ___GFP_NOLOCKDEP BIT(___GFP_NOLOCKDEP_BIT) - #else - #define ___GFP_NOLOCKDEP 0 - #endif --/* If the above are modified, __GFP_BITS_SHIFT may need updating */ - - /* - * Physical address zone modifiers (see linux/mmzone.h - low four bits) -@@ -249,7 +283,7 @@ typedef unsigned int __bitwise gfp_t; - #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) - - /* Room for N __GFP_FOO bits */ --#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP)) -+#define __GFP_BITS_SHIFT ___GFP_LAST_BIT - #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) - - /** -diff --git a/include/linux/iomap.h b/include/linux/iomap.h -index e2b836c2e..a774d074b 100644 ---- a/include/linux/iomap.h -+++ b/include/linux/iomap.h -@@ -60,6 +60,7 @@ struct vm_fault; - #define IOMAP_F_MERGED (1U << 3) - #define IOMAP_F_BUFFER_HEAD (1U << 4) - #define IOMAP_F_XATTR (1U << 5) -+#define IOMAP_F_NOSUBMIT (1U << 6) - - /* - * Flags set by the core iomap code during operations: -diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h -index ae1b54144..8ee2bf5af 100644 ---- a/include/linux/list_bl.h -+++ b/include/linux/list_bl.h -@@ -143,6 +143,28 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n) - } - } - -+/** -+ * hlist_bl_add_fake - create a fake list 
consisting of a single headless node -+ * @n: Node to make a fake list out of -+ * -+ * This makes @n appear to be its own predecessor on a headless hlist. -+ * The point of this is to allow things like hlist_bl_del() to work correctly -+ * in cases where there is no list. -+ */ -+static inline void hlist_bl_add_fake(struct hlist_bl_node *n) -+{ -+ n->pprev = &n->next; -+} -+ -+/** -+ * hlist_fake: Is this node a fake hlist_bl? -+ * @h: Node to check for being a self-referential fake hlist. -+ */ -+static inline bool hlist_bl_fake(struct hlist_bl_node *n) -+{ -+ return n->pprev == &n->next; -+} -+ - static inline void hlist_bl_lock(struct hlist_bl_head *b) - { - bit_spin_lock(0, (unsigned long *)b); -diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h -index 310f85903..2fdfd9129 100644 ---- a/include/linux/lockdep.h -+++ b/include/linux/lockdep.h -@@ -344,6 +344,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); - #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) - #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) - -+int lock_class_is_held(struct lock_class_key *key); -+ - /* - * Must use lock_map_aquire_try() with override maps to avoid - * lockdep thinking they participate in the block chain. -@@ -442,6 +444,8 @@ extern int lockdep_is_held(const void *); - #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) - #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) - -+static inline int lock_class_is_held(struct lock_class_key *key) { return 0; } -+ - #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ - struct lockdep_map __maybe_unused _name = {} - -@@ -689,4 +693,10 @@ lockdep_rcu_suspicious(const char *file, const int line, const char *s) - } - #endif - -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+void lockdep_set_no_check_recursion(struct lockdep_map *map); -+#else -+static inline void lockdep_set_no_check_recursion(struct lockdep_map *map) {} -+#endif -+ - #endif /* __LINUX_LOCKDEP_H */ -diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h -index 2ebc323d3..aa6bddac2 100644 ---- a/include/linux/lockdep_types.h -+++ b/include/linux/lockdep_types.h -@@ -137,7 +137,7 @@ struct lock_class { - u8 wait_type_inner; - u8 wait_type_outer; - u8 lock_type; -- /* u8 hole; */ -+ u8 no_check_recursion; - - #ifdef CONFIG_LOCK_STAT - unsigned long contention_point[LOCKSTAT_POINTS]; -diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h -new file mode 100644 -index 000000000..647505010 ---- /dev/null -+++ b/include/linux/mean_and_variance.h -@@ -0,0 +1,198 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef MEAN_AND_VARIANCE_H_ -+#define MEAN_AND_VARIANCE_H_ -+ -+#include -+#include -+#include -+#include -+ -+#define SQRT_U64_MAX 4294967295ULL -+ -+/* -+ * u128_u: u128 user mode, because not all architectures support a real int128 -+ * type -+ */ -+ -+#ifdef __SIZEOF_INT128__ -+ -+typedef struct { -+ unsigned __int128 v; -+} __aligned(16) u128_u; -+ -+static inline u128_u u64_to_u128(u64 a) -+{ -+ return (u128_u) { .v = a }; -+} -+ -+static inline u64 u128_lo(u128_u a) -+{ -+ return a.v; -+} -+ -+static inline u64 u128_hi(u128_u a) -+{ -+ return a.v >> 64; -+} -+ -+static inline u128_u u128_add(u128_u a, u128_u b) -+{ -+ a.v += b.v; -+ return a; -+} -+ -+static inline u128_u u128_sub(u128_u a, u128_u b) -+{ -+ a.v -= b.v; -+ return a; -+} -+ -+static inline u128_u u128_shl(u128_u a, s8 shift) -+{ -+ a.v <<= shift; -+ return a; -+} -+ -+static 
inline u128_u u128_square(u64 a) -+{ -+ u128_u b = u64_to_u128(a); -+ -+ b.v *= b.v; -+ return b; -+} -+ -+#else -+ -+typedef struct { -+ u64 hi, lo; -+} __aligned(16) u128_u; -+ -+/* conversions */ -+ -+static inline u128_u u64_to_u128(u64 a) -+{ -+ return (u128_u) { .lo = a }; -+} -+ -+static inline u64 u128_lo(u128_u a) -+{ -+ return a.lo; -+} -+ -+static inline u64 u128_hi(u128_u a) -+{ -+ return a.hi; -+} -+ -+/* arithmetic */ -+ -+static inline u128_u u128_add(u128_u a, u128_u b) -+{ -+ u128_u c; -+ -+ c.lo = a.lo + b.lo; -+ c.hi = a.hi + b.hi + (c.lo < a.lo); -+ return c; -+} -+ -+static inline u128_u u128_sub(u128_u a, u128_u b) -+{ -+ u128_u c; -+ -+ c.lo = a.lo - b.lo; -+ c.hi = a.hi - b.hi - (c.lo > a.lo); -+ return c; -+} -+ -+static inline u128_u u128_shl(u128_u i, s8 shift) -+{ -+ u128_u r; -+ -+ r.lo = i.lo << shift; -+ if (shift < 64) -+ r.hi = (i.hi << shift) | (i.lo >> (64 - shift)); -+ else { -+ r.hi = i.lo << (shift - 64); -+ r.lo = 0; -+ } -+ return r; -+} -+ -+static inline u128_u u128_square(u64 i) -+{ -+ u128_u r; -+ u64 h = i >> 32, l = i & U32_MAX; -+ -+ r = u128_shl(u64_to_u128(h*h), 64); -+ r = u128_add(r, u128_shl(u64_to_u128(h*l), 32)); -+ r = u128_add(r, u128_shl(u64_to_u128(l*h), 32)); -+ r = u128_add(r, u64_to_u128(l*l)); -+ return r; -+} -+ -+#endif -+ -+static inline u128_u u64s_to_u128(u64 hi, u64 lo) -+{ -+ u128_u c = u64_to_u128(hi); -+ -+ c = u128_shl(c, 64); -+ c = u128_add(c, u64_to_u128(lo)); -+ return c; -+} -+ -+u128_u u128_div(u128_u n, u64 d); -+ -+struct mean_and_variance { -+ s64 n; -+ s64 sum; -+ u128_u sum_squares; -+}; -+ -+/* expontentially weighted variant */ -+struct mean_and_variance_weighted { -+ bool init; -+ u8 weight; /* base 2 logarithim */ -+ s64 mean; -+ u64 variance; -+}; -+ -+/** -+ * fast_divpow2() - fast approximation for n / (1 << d) -+ * @n: numerator -+ * @d: the power of 2 denominator. -+ * -+ * note: this rounds towards 0. -+ */ -+static inline s64 fast_divpow2(s64 n, u8 d) -+{ -+ return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; -+} -+ -+/** -+ * mean_and_variance_update() - update a mean_and_variance struct @s1 with a new sample @v1 -+ * and return it. -+ * @s1: the mean_and_variance to update. -+ * @v1: the new sample. -+ * -+ * see linked pdf equation 12. 
-+ */ -+static inline void -+mean_and_variance_update(struct mean_and_variance *s, s64 v) -+{ -+ s->n++; -+ s->sum += v; -+ s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v))); -+} -+ -+s64 mean_and_variance_get_mean(struct mean_and_variance s); -+u64 mean_and_variance_get_variance(struct mean_and_variance s1); -+u32 mean_and_variance_get_stddev(struct mean_and_variance s); -+ -+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); -+ -+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); -+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); -+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); -+ -+#endif // MEAN_AND_VAIRANCE_H_ -diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h -index 8d07116ca..b61438313 100644 ---- a/include/linux/nodemask.h -+++ b/include/linux/nodemask.h -@@ -93,10 +93,10 @@ - #include - #include - #include -+#include - #include - #include - --typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; - extern nodemask_t _unused_nodemask_arg_; - - /** -diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h -new file mode 100644 -index 000000000..84c2f47c4 ---- /dev/null -+++ b/include/linux/nodemask_types.h -@@ -0,0 +1,9 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef __LINUX_NODEMASK_TYPES_H -+#define __LINUX_NODEMASK_TYPES_H -+ -+#include -+ -+typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; -+ -+#endif /* __LINUX_NODEMASK_TYPES_H */ -diff --git a/include/linux/prandom.h b/include/linux/prandom.h -index f2ed5b72b..f7f1e5251 100644 ---- a/include/linux/prandom.h -+++ b/include/linux/prandom.h -@@ -10,7 +10,6 @@ - - #include - #include --#include - #include - - struct rnd_state { -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 609bde814..a82f63541 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -20,7 +20,7 @@ - #include - #include - #include --#include -+#include - #include - #include - #include -@@ -870,6 +870,7 @@ struct task_struct { - - struct mm_struct *mm; - struct mm_struct *active_mm; -+ struct address_space *faults_disabled_mapping; - - int exit_state; - int exit_code; -@@ -1162,7 +1163,7 @@ struct task_struct { - #endif - - #ifdef CONFIG_LOCKDEP --# define MAX_LOCK_DEPTH 48UL -+# define MAX_LOCK_DEPTH 63UL - u64 curr_chain_key; - int lockdep_depth; - unsigned int lockdep_recursion; -diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h -index 515d7fcb9..cc02410f2 100644 ---- a/include/linux/seq_buf.h -+++ b/include/linux/seq_buf.h -@@ -161,4 +161,6 @@ seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); - - void seq_buf_do_printk(struct seq_buf *s, const char *lvl); - -+void seq_buf_human_readable_u64(struct seq_buf *, u64); -+ - #endif /* _LINUX_SEQ_BUF_H */ -diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h -index 224293b2d..a15a45d06 100644 ---- a/include/linux/shrinker.h -+++ b/include/linux/shrinker.h -@@ -5,6 +5,8 @@ - #include - #include - -+struct seq_buf; -+ - /* - * This struct is used to pass information from page reclaim to the shrinkers. - * We consolidate the values for easier extension later. 
-@@ -61,10 +63,12 @@ struct shrink_control { - * @flags determine the shrinker abilities, like numa awareness - */ - struct shrinker { -+ const char *name; - unsigned long (*count_objects)(struct shrinker *, - struct shrink_control *sc); - unsigned long (*scan_objects)(struct shrinker *, - struct shrink_control *sc); -+ void (*to_text)(struct seq_buf *, struct shrinker *); - - long batch; /* reclaim batch size, 0 = default */ - int seeks; /* seeks to recreate an obj */ -@@ -78,11 +82,13 @@ struct shrinker { - #endif - #ifdef CONFIG_SHRINKER_DEBUG - int debugfs_id; -- const char *name; - struct dentry *debugfs_entry; - #endif - /* objs pending delete, per node */ - atomic_long_t *nr_deferred; -+ -+ atomic_long_t objects_requested_to_free; -+ atomic_long_t objects_freed; - }; - #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ - -@@ -104,6 +110,7 @@ extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker, - extern void unregister_shrinker(struct shrinker *shrinker); - extern void free_prealloced_shrinker(struct shrinker *shrinker); - extern void synchronize_shrinkers(void); -+void shrinkers_to_text(struct seq_buf *); - - #ifdef CONFIG_SHRINKER_DEBUG - extern int shrinker_debugfs_add(struct shrinker *shrinker); -diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h -index 789ab3004..1cc137402 100644 ---- a/include/linux/string_helpers.h -+++ b/include/linux/string_helpers.h -@@ -17,15 +17,14 @@ static inline bool string_is_terminated(const char *s, int len) - return memchr(s, '\0', len) ? true : false; - } - --/* Descriptions of the types of units to -- * print in */ --enum string_size_units { -- STRING_UNITS_10, /* use powers of 10^3 (standard SI) */ -- STRING_UNITS_2, /* use binary powers of 2^10 */ -+enum string_size_flags { -+ STRING_SIZE_BASE2 = (1 << 0), -+ STRING_SIZE_NOSPACE = (1 << 1), -+ STRING_SIZE_NOBYTES = (1 << 2), - }; - --void string_get_size(u64 size, u64 blk_size, enum string_size_units units, -- char *buf, int len); -+int string_get_size(u64 size, u64 blk_size, enum string_size_flags flags, -+ char *buf, int len); - - int parse_int_array_user(const char __user *from, size_t count, int **array); - -diff --git a/init/init_task.c b/init/init_task.c -index ff6c4b9bf..f703116e0 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -85,6 +85,7 @@ struct task_struct init_task - .nr_cpus_allowed= NR_CPUS, - .mm = NULL, - .active_mm = &init_mm, -+ .faults_disabled_mapping = NULL, - .restart_block = { - .fn = do_no_restart_syscall, - }, -diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c -index 111607d91..b6c3a8788 100644 ---- a/kernel/locking/lockdep.c -+++ b/kernel/locking/lockdep.c -@@ -3056,6 +3056,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) - - class = hlock_class(prev); - -+ if (class->no_check_recursion) -+ continue; -+ - if (class->cmp_fn && - class->cmp_fn(prev->instance, next->instance) < 0) - continue; -@@ -3121,6 +3124,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, - return 2; - } - -+ if (hlock_class(prev) == hlock_class(next) && -+ hlock_class(prev)->no_check_recursion) -+ return 2; -+ - if (prev->class_idx == next->class_idx) { - struct lock_class *class = hlock_class(prev); - -@@ -6607,6 +6614,26 @@ void debug_check_no_locks_held(void) - } - EXPORT_SYMBOL_GPL(debug_check_no_locks_held); - -+#ifdef CONFIG_LOCKDEP -+int lock_class_is_held(struct lock_class_key *key) -+{ -+ struct task_struct *curr = current; -+ struct held_lock *hlock; -+ -+ if 
(unlikely(!debug_locks)) -+ return 0; -+ -+ for (hlock = curr->held_locks; -+ hlock < curr->held_locks + curr->lockdep_depth; -+ hlock++) -+ if (hlock->instance->key == key) -+ return 1; -+ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(lock_class_is_held); -+#endif -+ - #ifdef __KERNEL__ - void debug_show_all_locks(void) - { -@@ -6720,3 +6747,22 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) - warn_rcu_exit(rcu); - } - EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); -+ -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+void lockdep_set_no_check_recursion(struct lockdep_map *lock) -+{ -+ struct lock_class *class = lock->class_cache[0]; -+ unsigned long flags; -+ -+ raw_local_irq_save(flags); -+ lockdep_recursion_inc(); -+ -+ if (!class) -+ class = register_lock_class(lock, 0, 0); -+ if (class) -+ class->no_check_recursion = true; -+ lockdep_recursion_finish(); -+ raw_local_irq_restore(flags); -+} -+EXPORT_SYMBOL_GPL(lockdep_set_no_check_recursion); -+#endif -diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c -index d973fe604..2deeeca3e 100644 ---- a/kernel/locking/mutex.c -+++ b/kernel/locking/mutex.c -@@ -1126,6 +1126,9 @@ EXPORT_SYMBOL(ww_mutex_lock_interruptible); - #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ - #endif /* !CONFIG_PREEMPT_RT */ - -+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_begin); -+EXPORT_TRACEPOINT_SYMBOL_GPL(contention_end); -+ - /** - * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 - * @cnt: the atomic which we are to dec -diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c -index d5610ad52..b752ec5cc 100644 ---- a/kernel/locking/osq_lock.c -+++ b/kernel/locking/osq_lock.c -@@ -203,6 +203,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) - - return false; - } -+EXPORT_SYMBOL_GPL(osq_lock); - - void osq_unlock(struct optimistic_spin_queue *lock) - { -@@ -230,3 +231,4 @@ void osq_unlock(struct optimistic_spin_queue *lock) - if (next) - WRITE_ONCE(next->locked, 1); - } -+EXPORT_SYMBOL_GPL(osq_unlock); -diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c -index 9ed5ce989..4f6582487 100644 ---- a/kernel/stacktrace.c -+++ b/kernel/stacktrace.c -@@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, - put_task_stack(tsk); - return c.len; - } -+EXPORT_SYMBOL_GPL(stack_trace_save_tsk); - - /** - * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array -@@ -301,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, - save_stack_trace_tsk(task, &trace); - return trace.nr_entries; - } -+EXPORT_SYMBOL_GPL(stack_trace_save_tsk); - - /** - * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array -diff --git a/lib/Kconfig b/lib/Kconfig -index 5c2da561c..f78bc8b42 100644 ---- a/lib/Kconfig -+++ b/lib/Kconfig -@@ -505,6 +505,9 @@ config ASSOCIATIVE_ARRAY - - for more information. - -+config CLOSURES -+ bool -+ - config HAS_IOMEM - bool - depends on !NO_IOMEM -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index d6798513a..69a3e33d1 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug -@@ -1710,6 +1710,15 @@ config DEBUG_NOTIFIERS - This is a relatively cheap check but if you care about maximum - performance, say N. - -+config DEBUG_CLOSURES -+ bool "Debug closures (bcache async widgits)" -+ depends on CLOSURES -+ select DEBUG_FS -+ help -+ Keeps all active closures in a linked list and provides a debugfs -+ interface to list them, which makes it possible to see asynchronous -+ operations that get stuck. 
-+ - config BUG_ON_DATA_CORRUPTION - bool "Trigger a BUG when data corruption is detected" - select DEBUG_LIST -@@ -2196,6 +2205,15 @@ config CPUMASK_KUNIT_TEST - - If unsure, say N. - -+config MEAN_AND_VARIANCE_UNIT_TEST -+ tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS -+ depends on KUNIT -+ select MEAN_AND_VARIANCE -+ default KUNIT_ALL_TESTS -+ help -+ This option enables the kunit tests for mean_and_variance module. -+ If unsure, say N. -+ - config TEST_LIST_SORT - tristate "Linked list sorting test" if !KUNIT_ALL_TESTS - depends on KUNIT -diff --git a/lib/Makefile b/lib/Makefile -index 1ffae65bb..5ac5d72ba 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -254,6 +254,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o - - obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o - -+obj-$(CONFIG_CLOSURES) += closure.o -+ - obj-$(CONFIG_DQL) += dynamic_queue_limits.o - - obj-$(CONFIG_GLOB) += glob.o -diff --git a/drivers/md/bcache/closure.c b/lib/closure.c -similarity index 85% -rename from drivers/md/bcache/closure.c -rename to lib/closure.c -index d8d9394a6..2958169ce 100644 ---- a/drivers/md/bcache/closure.c -+++ b/lib/closure.c -@@ -6,19 +6,20 @@ - * Copyright 2012 Google, Inc. - */ - -+#include - #include --#include -+#include -+#include - #include - #include - --#include "closure.h" -- - static inline void closure_put_after_sub(struct closure *cl, int flags) - { - int r = flags & CLOSURE_REMAINING_MASK; - -- BUG_ON(flags & CLOSURE_GUARD_MASK); -- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); -+ if ((flags & CLOSURE_GUARD_MASK) || -+ (!r && (flags & ~CLOSURE_DESTRUCTOR))) -+ panic("closure_put_after_sub: bogus flags %x remaining %i", flags, r); - - if (!r) { - if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { -@@ -45,6 +46,7 @@ void closure_sub(struct closure *cl, int v) - { - closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); - } -+EXPORT_SYMBOL(closure_sub); - - /* - * closure_put - decrement a closure's refcount -@@ -53,6 +55,7 @@ void closure_put(struct closure *cl) - { - closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); - } -+EXPORT_SYMBOL(closure_put); - - /* - * closure_wake_up - wake up all closures on a wait list, without memory barrier -@@ -74,6 +77,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) - closure_sub(cl, CLOSURE_WAITING + 1); - } - } -+EXPORT_SYMBOL(__closure_wake_up); - - /** - * closure_wait - add a closure to a waitlist -@@ -93,6 +97,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) - - return true; - } -+EXPORT_SYMBOL(closure_wait); - - struct closure_syncer { - struct task_struct *task; -@@ -127,8 +132,9 @@ void __sched __closure_sync(struct closure *cl) - - __set_current_state(TASK_RUNNING); - } -+EXPORT_SYMBOL(__closure_sync); - --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -+#ifdef CONFIG_DEBUG_CLOSURES - - static LIST_HEAD(closure_list); - static DEFINE_SPINLOCK(closure_list_lock); -@@ -144,6 +150,7 @@ void closure_debug_create(struct closure *cl) - list_add(&cl->all, &closure_list); - spin_unlock_irqrestore(&closure_list_lock, flags); - } -+EXPORT_SYMBOL(closure_debug_create); - - void closure_debug_destroy(struct closure *cl) - { -@@ -156,8 +163,7 @@ void closure_debug_destroy(struct closure *cl) - list_del(&cl->all); - spin_unlock_irqrestore(&closure_list_lock, flags); - } -- --static struct dentry *closure_debug; -+EXPORT_SYMBOL(closure_debug_destroy); - - static int debug_show(struct seq_file *f, void *data) - { -@@ -181,7 +187,7 @@ static int debug_show(struct seq_file *f, void *data) - 
seq_printf(f, " W %pS\n", - (void *) cl->waiting_on); - -- seq_printf(f, "\n"); -+ seq_puts(f, "\n"); - } - - spin_unlock_irq(&closure_list_lock); -@@ -190,18 +196,11 @@ static int debug_show(struct seq_file *f, void *data) - - DEFINE_SHOW_ATTRIBUTE(debug); - --void __init closure_debug_init(void) -+static int __init closure_debug_init(void) - { -- if (!IS_ERR_OR_NULL(bcache_debug)) -- /* -- * it is unnecessary to check return value of -- * debugfs_create_file(), we should not care -- * about this. -- */ -- closure_debug = debugfs_create_file( -- "closures", 0400, bcache_debug, NULL, &debug_fops); -+ debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops); -+ return 0; - } --#endif -+late_initcall(closure_debug_init) - --MODULE_AUTHOR("Kent Overstreet "); --MODULE_LICENSE("GPL"); -+#endif -diff --git a/lib/errname.c b/lib/errname.c -index 67739b174..dd1b99855 100644 ---- a/lib/errname.c -+++ b/lib/errname.c -@@ -228,3 +228,4 @@ const char *errname(int err) - - return err > 0 ? name + 1 : name; - } -+EXPORT_SYMBOL(errname); -diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c -index f25eb111c..41f1bcdc4 100644 ---- a/lib/generic-radix-tree.c -+++ b/lib/generic-radix-tree.c -@@ -1,4 +1,5 @@ - -+#include - #include - #include - #include -@@ -166,6 +167,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter, - struct genradix_root *r; - struct genradix_node *n; - unsigned level, i; -+ -+ if (iter->offset == SIZE_MAX) -+ return NULL; -+ - restart: - r = READ_ONCE(radix->root); - if (!r) -@@ -184,10 +189,17 @@ void *__genradix_iter_peek(struct genradix_iter *iter, - (GENRADIX_ARY - 1); - - while (!n->children[i]) { -+ size_t objs_per_ptr = genradix_depth_size(level); -+ -+ if (iter->offset + objs_per_ptr < iter->offset) { -+ iter->offset = SIZE_MAX; -+ iter->pos = SIZE_MAX; -+ return NULL; -+ } -+ - i++; -- iter->offset = round_down(iter->offset + -- genradix_depth_size(level), -- genradix_depth_size(level)); -+ iter->offset = round_down(iter->offset + objs_per_ptr, -+ objs_per_ptr); - iter->pos = (iter->offset >> PAGE_SHIFT) * - objs_per_page; - if (i == GENRADIX_ARY) -@@ -201,6 +213,64 @@ void *__genradix_iter_peek(struct genradix_iter *iter, - } - EXPORT_SYMBOL(__genradix_iter_peek); - -+void *__genradix_iter_peek_prev(struct genradix_iter *iter, -+ struct __genradix *radix, -+ size_t objs_per_page, -+ size_t obj_size_plus_page_remainder) -+{ -+ struct genradix_root *r; -+ struct genradix_node *n; -+ unsigned level, i; -+ -+ if (iter->offset == SIZE_MAX) -+ return NULL; -+ -+restart: -+ r = READ_ONCE(radix->root); -+ if (!r) -+ return NULL; -+ -+ n = genradix_root_to_node(r); -+ level = genradix_root_to_depth(r); -+ -+ if (ilog2(iter->offset) >= genradix_depth_shift(level)) { -+ iter->offset = genradix_depth_size(level); -+ iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; -+ -+ iter->offset -= obj_size_plus_page_remainder; -+ iter->pos--; -+ } -+ -+ while (level) { -+ level--; -+ -+ i = (iter->offset >> genradix_depth_shift(level)) & -+ (GENRADIX_ARY - 1); -+ -+ while (!n->children[i]) { -+ size_t objs_per_ptr = genradix_depth_size(level); -+ -+ iter->offset = round_down(iter->offset, objs_per_ptr); -+ iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; -+ -+ if (!iter->offset) -+ return NULL; -+ -+ iter->offset -= obj_size_plus_page_remainder; -+ iter->pos--; -+ -+ if (!i) -+ goto restart; -+ --i; -+ } -+ -+ n = n->children[i]; -+ } -+ -+ return &n->data[iter->offset & (PAGE_SIZE - 1)]; -+} -+EXPORT_SYMBOL(__genradix_iter_peek_prev); -+ - static void 
genradix_free_recurse(struct genradix_node *n, unsigned level) - { - if (level) { -diff --git a/lib/iov_iter.c b/lib/iov_iter.c -index e4dc809d1..eb3dffb24 100644 ---- a/lib/iov_iter.c -+++ b/lib/iov_iter.c -@@ -566,24 +566,37 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i) - } - EXPORT_SYMBOL(iov_iter_zero); - --size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, -- struct iov_iter *i) -+size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, -+ size_t bytes, struct iov_iter *i) - { -- char *kaddr = kmap_atomic(page), *p = kaddr + offset; -- if (!page_copy_sane(page, offset, bytes)) { -- kunmap_atomic(kaddr); -+ size_t n, copied = 0; -+ -+ if (!page_copy_sane(page, offset, bytes)) - return 0; -- } -- if (WARN_ON_ONCE(!i->data_source)) { -- kunmap_atomic(kaddr); -+ if (WARN_ON_ONCE(!i->data_source)) - return 0; -- } -- iterate_and_advance(i, bytes, base, len, off, -- copyin(p + off, base, len), -- memcpy_from_iter(i, p + off, base, len) -- ) -- kunmap_atomic(kaddr); -- return bytes; -+ -+ do { -+ char *p; -+ -+ n = bytes - copied; -+ if (PageHighMem(page)) { -+ page += offset / PAGE_SIZE; -+ offset %= PAGE_SIZE; -+ n = min_t(size_t, n, PAGE_SIZE - offset); -+ } -+ -+ p = kmap_atomic(page) + offset; -+ iterate_and_advance(i, n, base, len, off, -+ copyin(p + off, base, len), -+ memcpy_from_iter(i, p + off, base, len) -+ ) -+ kunmap_atomic(p); -+ copied += n; -+ offset += n; -+ } while (PageHighMem(page) && copied != bytes && n > 0); -+ -+ return copied; - } - EXPORT_SYMBOL(copy_page_from_iter_atomic); - -diff --git a/lib/math/Kconfig b/lib/math/Kconfig -index 0634b428d..7530ae9a3 100644 ---- a/lib/math/Kconfig -+++ b/lib/math/Kconfig -@@ -15,3 +15,6 @@ config PRIME_NUMBERS - - config RATIONAL - tristate -+ -+config MEAN_AND_VARIANCE -+ tristate -diff --git a/lib/math/Makefile b/lib/math/Makefile -index bfac26ddf..2ef1487e0 100644 ---- a/lib/math/Makefile -+++ b/lib/math/Makefile -@@ -4,6 +4,8 @@ obj-y += div64.o gcd.o lcm.o int_pow.o int_sqrt.o reciprocal_div.o - obj-$(CONFIG_CORDIC) += cordic.o - obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o - obj-$(CONFIG_RATIONAL) += rational.o -+obj-$(CONFIG_MEAN_AND_VARIANCE) += mean_and_variance.o - - obj-$(CONFIG_TEST_DIV64) += test_div64.o - obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o -+obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o -diff --git a/lib/math/mean_and_variance.c b/lib/math/mean_and_variance.c -new file mode 100644 -index 000000000..eb5f2ba03 ---- /dev/null -+++ b/lib/math/mean_and_variance.c -@@ -0,0 +1,158 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Functions for incremental mean and variance. -+ * -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License version 2 as published by -+ * the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, but WITHOUT -+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -+ * more details. -+ * -+ * Copyright © 2022 Daniel B. Hill -+ * -+ * Author: Daniel B. Hill -+ * -+ * Description: -+ * -+ * This is includes some incremental algorithms for mean and variance calculation -+ * -+ * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf -+ * -+ * Create a struct and if it's the weighted variant set the w field (weight = 2^k). 
-+ * -+ * Use mean_and_variance[_weighted]_update() on the struct to update it's state. -+ * -+ * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance, some computation -+ * is deferred to these functions for performance reasons. -+ * -+ * see lib/math/mean_and_variance_test.c for examples of usage. -+ * -+ * DO NOT access the mean and variance fields of the weighted variants directly. -+ * DO NOT change the weight after calling update. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+u128_u u128_div(u128_u n, u64 d) -+{ -+ u128_u r; -+ u64 rem; -+ u64 hi = u128_hi(n); -+ u64 lo = u128_lo(n); -+ u64 h = hi & ((u64) U32_MAX << 32); -+ u64 l = (hi & (u64) U32_MAX) << 32; -+ -+ r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); -+ r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); -+ r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); -+ return r; -+} -+EXPORT_SYMBOL_GPL(u128_div); -+ -+/** -+ * mean_and_variance_get_mean() - get mean from @s -+ */ -+s64 mean_and_variance_get_mean(struct mean_and_variance s) -+{ -+ return s.n ? div64_u64(s.sum, s.n) : 0; -+} -+EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); -+ -+/** -+ * mean_and_variance_get_variance() - get variance from @s1 -+ * -+ * see linked pdf equation 12. -+ */ -+u64 mean_and_variance_get_variance(struct mean_and_variance s1) -+{ -+ if (s1.n) { -+ u128_u s2 = u128_div(s1.sum_squares, s1.n); -+ u64 s3 = abs(mean_and_variance_get_mean(s1)); -+ -+ return u128_lo(u128_sub(s2, u128_square(s3))); -+ } else { -+ return 0; -+ } -+} -+EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); -+ -+/** -+ * mean_and_variance_get_stddev() - get standard deviation from @s -+ */ -+u32 mean_and_variance_get_stddev(struct mean_and_variance s) -+{ -+ return int_sqrt64(mean_and_variance_get_variance(s)); -+} -+EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); -+ -+/** -+ * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() -+ * @s1: .. -+ * @s2: .. -+ * -+ * see linked pdf: function derived from equations 140-143 where alpha = 2^w. -+ * values are stored bitshifted for performance and added precision. -+ */ -+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) -+{ -+ // previous weighted variance. -+ u8 w = s->weight; -+ u64 var_w0 = s->variance; -+ // new value weighted. -+ s64 x_w = x << w; -+ s64 diff_w = x_w - s->mean; -+ s64 diff = fast_divpow2(diff_w, w); -+ // new mean weighted. 
-+ s64 u_w1 = s->mean + diff; -+ -+ if (!s->init) { -+ s->mean = x_w; -+ s->variance = 0; -+ } else { -+ s->mean = u_w1; -+ s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; -+ } -+ s->init = true; -+} -+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); -+ -+/** -+ * mean_and_variance_weighted_get_mean() - get mean from @s -+ */ -+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) -+{ -+ return fast_divpow2(s.mean, s.weight); -+} -+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); -+ -+/** -+ * mean_and_variance_weighted_get_variance() -- get variance from @s -+ */ -+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) -+{ -+ // always positive don't need fast divpow2 -+ return s.variance >> s.weight; -+} -+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); -+ -+/** -+ * mean_and_variance_weighted_get_stddev() - get standard deviation from @s -+ */ -+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) -+{ -+ return int_sqrt64(mean_and_variance_weighted_get_variance(s)); -+} -+EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); -+ -+MODULE_AUTHOR("Daniel B. Hill"); -+MODULE_LICENSE("GPL"); -diff --git a/lib/math/mean_and_variance_test.c b/lib/math/mean_and_variance_test.c -new file mode 100644 -index 000000000..f45591a16 ---- /dev/null -+++ b/lib/math/mean_and_variance_test.c -@@ -0,0 +1,239 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include -+#include -+ -+#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX) -+ -+static void mean_and_variance_basic_test(struct kunit *test) -+{ -+ struct mean_and_variance s = {}; -+ -+ mean_and_variance_update(&s, 2); -+ mean_and_variance_update(&s, 2); -+ -+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0); -+ KUNIT_EXPECT_EQ(test, s.n, 2); -+ -+ mean_and_variance_update(&s, 4); -+ mean_and_variance_update(&s, 4); -+ -+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1); -+ KUNIT_EXPECT_EQ(test, s.n, 4); -+} -+ -+/* -+ * Test values computed using a spreadsheet from the psuedocode at the bottom: -+ * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf -+ */ -+ -+static void mean_and_variance_weighted_test(struct kunit *test) -+{ -+ struct mean_and_variance_weighted s = { .weight = 2 }; -+ -+ mean_and_variance_weighted_update(&s, 10); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); -+ -+ mean_and_variance_weighted_update(&s, 20); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); -+ -+ mean_and_variance_weighted_update(&s, 30); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); -+ -+ s = (struct mean_and_variance_weighted) { .weight = 2 }; -+ -+ mean_and_variance_weighted_update(&s, -10); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); -+ -+ mean_and_variance_weighted_update(&s, -20); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); -+ -+ mean_and_variance_weighted_update(&s, -30); -+ KUNIT_EXPECT_EQ(test, 
mean_and_variance_weighted_get_mean(s), -16); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); -+} -+ -+static void mean_and_variance_weighted_advanced_test(struct kunit *test) -+{ -+ struct mean_and_variance_weighted s = { .weight = 8 }; -+ s64 i; -+ -+ for (i = 10; i <= 100; i += 10) -+ mean_and_variance_weighted_update(&s, i); -+ -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); -+ -+ s = (struct mean_and_variance_weighted) { .weight = 8 }; -+ -+ for (i = -10; i >= -100; i -= 10) -+ mean_and_variance_weighted_update(&s, i); -+ -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); -+} -+ -+static void do_mean_and_variance_test(struct kunit *test, -+ s64 initial_value, -+ s64 initial_n, -+ s64 n, -+ unsigned weight, -+ s64 *data, -+ s64 *mean, -+ s64 *stddev, -+ s64 *weighted_mean, -+ s64 *weighted_stddev) -+{ -+ struct mean_and_variance mv = {}; -+ struct mean_and_variance_weighted vw = { .weight = weight }; -+ -+ for (unsigned i = 0; i < initial_n; i++) { -+ mean_and_variance_update(&mv, initial_value); -+ mean_and_variance_weighted_update(&vw, initial_value); -+ -+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0); -+ } -+ -+ for (unsigned i = 0; i < n; i++) { -+ mean_and_variance_update(&mv, data[i]); -+ mean_and_variance_weighted_update(&vw, data[i]); -+ -+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]); -+ KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]); -+ } -+ -+ KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); -+} -+ -+/* Test behaviour with a single outlier, then back to steady state: */ -+static void mean_and_variance_test_1(struct kunit *test) -+{ -+ s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; -+ s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 }; -+ s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 }; -+ s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; -+ s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; -+ -+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, -+ d, mean, stddev, weighted_mean, weighted_stddev); -+} -+ -+static void mean_and_variance_test_2(struct kunit *test) -+{ -+ s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; -+ s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 }; -+ s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 }; -+ s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; -+ s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; -+ -+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, -+ d, mean, stddev, weighted_mean, weighted_stddev); -+} -+ -+/* Test behaviour where we switch from one steady state to another: */ -+static void mean_and_variance_test_3(struct kunit *test) -+{ -+ s64 d[] = { 100, 100, 100, 100, 100 }; -+ s64 mean[] = { 22, 32, 40, 46, 50 }; -+ s64 stddev[] = { 32, 39, 42, 44, 45 }; -+ s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; -+ s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; -+ -+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, -+ d, mean, stddev, weighted_mean, weighted_stddev); -+} -+ -+static 
void mean_and_variance_test_4(struct kunit *test) -+{ -+ s64 d[] = { 100, 100, 100, 100, 100 }; -+ s64 mean[] = { 10, 11, 12, 13, 14 }; -+ s64 stddev[] = { 9, 13, 15, 17, 19 }; -+ s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; -+ s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; -+ -+ do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, -+ d, mean, stddev, weighted_mean, weighted_stddev); -+} -+ -+static void mean_and_variance_fast_divpow2(struct kunit *test) -+{ -+ s64 i; -+ u8 d; -+ -+ for (i = 0; i < 100; i++) { -+ d = 0; -+ KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d)); -+ KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d)); -+ for (d = 1; d < 32; d++) { -+ KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)), -+ div_u64(i, 1 << d), "%lld %u", i, d); -+ KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)), -+ div_u64(i, 1 << d), "%lld %u", -i, d); -+ } -+ } -+} -+ -+static void mean_and_variance_u128_basic_test(struct kunit *test) -+{ -+ u128_u a = u64s_to_u128(0, U64_MAX); -+ u128_u a1 = u64s_to_u128(0, 1); -+ u128_u b = u64s_to_u128(1, 0); -+ u128_u c = u64s_to_u128(0, 1LLU << 63); -+ u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX); -+ -+ KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1); -+ KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0); -+ KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1); -+ KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0); -+ -+ KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX); -+ KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0); -+ -+ KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1); -+ KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0); -+ -+ KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1); -+ KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1); -+ -+ KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63); -+ -+ KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1); -+ KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX); -+ -+ KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1); -+ KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31); -+} -+ -+static struct kunit_case mean_and_variance_test_cases[] = { -+ KUNIT_CASE(mean_and_variance_fast_divpow2), -+ KUNIT_CASE(mean_and_variance_u128_basic_test), -+ KUNIT_CASE(mean_and_variance_basic_test), -+ KUNIT_CASE(mean_and_variance_weighted_test), -+ KUNIT_CASE(mean_and_variance_weighted_advanced_test), -+ KUNIT_CASE(mean_and_variance_test_1), -+ KUNIT_CASE(mean_and_variance_test_2), -+ KUNIT_CASE(mean_and_variance_test_3), -+ KUNIT_CASE(mean_and_variance_test_4), -+ {} -+}; -+ -+static struct kunit_suite mean_and_variance_test_suite = { -+ .name = "mean and variance tests", -+ .test_cases = mean_and_variance_test_cases -+}; -+ -+kunit_test_suite(mean_and_variance_test_suite); -+ -+MODULE_AUTHOR("Daniel B. 
Hill"); -+MODULE_LICENSE("GPL"); -diff --git a/lib/rhashtable.c b/lib/rhashtable.c -index 6ae2ba8e0..d3fce9c89 100644 ---- a/lib/rhashtable.c -+++ b/lib/rhashtable.c -@@ -360,9 +360,14 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht, - - ASSERT_RHT_MUTEX(ht); - -- new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); -- if (new_tbl == NULL) -+ new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL|__GFP_NOWARN); -+ if (new_tbl == NULL) { -+ WARN("rhashtable bucket table allocation failure for %ps", -+ (void *) ht->p.hashfn ?: -+ (void *) ht->p.obj_hashfn ?: -+ (void *) ht->p.obj_cmpfn); - return -ENOMEM; -+ } - - err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); - if (err) -diff --git a/lib/seq_buf.c b/lib/seq_buf.c -index 45c450f42..2b87e9219 100644 ---- a/lib/seq_buf.c -+++ b/lib/seq_buf.c -@@ -427,3 +427,13 @@ int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type, - } - return 0; - } -+ -+void seq_buf_human_readable_u64(struct seq_buf *s, u64 v) -+{ -+ char *buf; -+ size_t size = seq_buf_get_buf(s, &buf); -+ int wrote = string_get_size(v, 1, false, buf, size); -+ -+ seq_buf_commit(s, wrote); -+} -+EXPORT_SYMBOL(seq_buf_human_readable_u64); -diff --git a/lib/string_helpers.c b/lib/string_helpers.c -index d3b1dd718..c29dd105b 100644 ---- a/lib/string_helpers.c -+++ b/lib/string_helpers.c -@@ -19,11 +19,17 @@ - #include - #include - -+enum string_size_units { -+ STRING_UNITS_10, /* use powers of 10^3 (standard SI) */ -+ STRING_UNITS_2, /* use binary powers of 2^10 */ -+}; -+ - /** - * string_get_size - get the size in the specified units - * @size: The size to be converted in blocks - * @blk_size: Size of the block (use 1 for size in bytes) -- * @units: units to use (powers of 1000 or 1024) -+ * @flags: units to use (powers of 1000 or 1024), whether to include space -+ * separator - * @buf: buffer to format to - * @len: length of buffer - * -@@ -31,15 +37,19 @@ - * giving the size in the required units. @buf should have room for - * at least 9 bytes and will always be zero terminated. - * -+ * Return value: number of characters of output that would have been written -+ * (which may be greater than len, if output was truncated). - */ --void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, -- char *buf, int len) -+int string_get_size(u64 size, u64 blk_size, enum string_size_flags flags, -+ char *buf, int len) - { -+ enum string_size_units units = flags & flags & STRING_SIZE_BASE2 -+ ? STRING_UNITS_2 : STRING_UNITS_10; - static const char *const units_10[] = { -- "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" -+ "", "k", "M", "G", "T", "P", "E", "Z", "Y" - }; - static const char *const units_2[] = { -- "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" -+ "", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" - }; - static const char *const *const units_str[] = { - [STRING_UNITS_10] = units_10, -@@ -126,8 +136,10 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, - else - unit = units_str[units][i]; - -- snprintf(buf, len, "%u%s %s", (u32)size, -- tmp, unit); -+ return snprintf(buf, len, "%u%s%s%s%s", (u32)size, tmp, -+ (flags & STRING_SIZE_NOSPACE) ? "" : " ", -+ unit, -+ (flags & STRING_SIZE_NOBYTES) ? 
"" : "B"); - } - EXPORT_SYMBOL(string_get_size); - -diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c -index 9a68849a5..0b01ffca9 100644 ---- a/lib/test-string_helpers.c -+++ b/lib/test-string_helpers.c -@@ -507,8 +507,8 @@ static __init void __test_string_get_size(const u64 size, const u64 blk_size, - char buf10[string_get_size_maxbuf]; - char buf2[string_get_size_maxbuf]; - -- string_get_size(size, blk_size, STRING_UNITS_10, buf10, sizeof(buf10)); -- string_get_size(size, blk_size, STRING_UNITS_2, buf2, sizeof(buf2)); -+ string_get_size(size, blk_size, 0, buf10, sizeof(buf10)); -+ string_get_size(size, blk_size, STRING_SIZE_BASE2, buf2, sizeof(buf2)); - - test_string_get_size_check("STRING_UNITS_10", exp_result10, buf10, - size, blk_size); -diff --git a/mm/hugetlb.c b/mm/hugetlb.c -index 6da626bfb..4165e22b0 100644 ---- a/mm/hugetlb.c -+++ b/mm/hugetlb.c -@@ -3270,7 +3270,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) - if (i == h->max_huge_pages_node[nid]) - return; - -- string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); -+ string_get_size(huge_page_size(h), 1, STRING_SIZE_BASE2, buf, 32); - pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", - h->max_huge_pages_node[nid], buf, nid, i); - h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); -@@ -3332,7 +3332,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) - if (i < h->max_huge_pages) { - char buf[32]; - -- string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); -+ string_get_size(huge_page_size(h), 1, STRING_SIZE_BASE2, buf, 32); - pr_warn("HugeTLB: allocating %lu of page size %s failed. Only allocated %lu hugepages.\n", - h->max_huge_pages, buf, i); - h->max_huge_pages = i; -@@ -3378,7 +3378,7 @@ static void __init report_hugepages(void) - for_each_hstate(h) { - char buf[32]; - -- string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); -+ string_get_size(huge_page_size(h), 1, STRING_SIZE_BASE2, buf, 32); - pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", - buf, h->free_huge_pages); - pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", -@@ -4269,7 +4269,7 @@ static int __init hugetlb_init(void) - char buf[32]; - - string_get_size(huge_page_size(&default_hstate), -- 1, STRING_UNITS_2, buf, 32); -+ 1, STRING_SIZE_BASE2, buf, 32); - pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", - default_hstate.max_huge_pages, buf); - pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", -diff --git a/mm/madvise.c b/mm/madvise.c -index ec30f48f8..fa2f140d0 100644 ---- a/mm/madvise.c -+++ b/mm/madvise.c -@@ -1330,6 +1330,64 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, - madvise_vma_anon_name); - } - #endif /* CONFIG_ANON_VMA_NAME */ -+ -+static noinline unsigned long test_alloc(unsigned long in1, unsigned long in2, size_t size) -+{ -+ switch (in1) -+ { -+ case (1): -+ return __get_free_pages(GFP_KERNEL, 0); -+ case (2): -+ return (unsigned long)kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT); -+ default: -+ printk("test_alloc invoked with args in1=%lu in2=%lu\n", -+ in1, in2); -+ return 0; -+ } -+} -+ -+static noinline void test_free(unsigned long in1, unsigned long in2, unsigned long addr) -+{ -+ switch (in1) -+ { -+ case (1): -+ free_page(addr); -+ break; -+ case (2): -+ kfree((void*)addr); -+ break; -+ default: -+ printk("test_free invoked with args in1=%lu in2=%lu\n", -+ in1, in2); -+ break; -+ } -+} 
-+ -+#define MADV_TEST 25 -+static noinline int alloc_bench(unsigned long in1, unsigned long in2) -+{ -+ int i, batch, iter; -+ unsigned long addr[10]; -+ -+ for (iter = 0; iter < 1000000; iter++) { -+ size_t size = 8; -+ for (batch = 0; batch < 30; batch++) { -+ for (i = 0; i < 10; i++) { -+ addr[i] = test_alloc(in1, in2, size); -+ } -+ for (i = 0; i < 10; i++) { -+ test_free(in1, in2, addr[i]); -+ } -+ size += 8; -+ } -+ if (fatal_signal_pending(current)) -+ return -EINTR; -+ //cond_resched(); -+ } -+ -+ return 0; -+} -+ - /* - * The madvise(2) system call. - * -@@ -1409,6 +1467,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh - size_t len; - struct blk_plug plug; - -+ if (behavior == MADV_TEST) -+ return alloc_bench(start, len_in); -+ - if (!madvise_behavior_valid(behavior)) - return -EINVAL; - -diff --git a/mm/oom_kill.c b/mm/oom_kill.c -index 612b5597d..467cff51f 100644 ---- a/mm/oom_kill.c -+++ b/mm/oom_kill.c -@@ -168,27 +168,6 @@ static bool oom_unkillable_task(struct task_struct *p) - return false; - } - --/* -- * Check whether unreclaimable slab amount is greater than -- * all user memory(LRU pages). -- * dump_unreclaimable_slab() could help in the case that -- * oom due to too much unreclaimable slab used by kernel. --*/ --static bool should_dump_unreclaim_slab(void) --{ -- unsigned long nr_lru; -- -- nr_lru = global_node_page_state(NR_ACTIVE_ANON) + -- global_node_page_state(NR_INACTIVE_ANON) + -- global_node_page_state(NR_ACTIVE_FILE) + -- global_node_page_state(NR_INACTIVE_FILE) + -- global_node_page_state(NR_ISOLATED_ANON) + -- global_node_page_state(NR_ISOLATED_FILE) + -- global_node_page_state(NR_UNEVICTABLE); -- -- return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru); --} -- - /** - * oom_badness - heuristic function to determine which candidate task to kill - * @p: task struct of which task we should calculate -@@ -462,8 +441,6 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) - mem_cgroup_print_oom_meminfo(oc->memcg); - else { - __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask)); -- if (should_dump_unreclaim_slab()) -- dump_unreclaimable_slab(); - } - if (sysctl_oom_dump_tasks) - dump_tasks(oc); -diff --git a/mm/show_mem.c b/mm/show_mem.c -index 01f8e9905..94ebd86c8 100644 ---- a/mm/show_mem.c -+++ b/mm/show_mem.c -@@ -12,10 +12,12 @@ - #include - #include - #include -+#include - #include - #include - - #include "internal.h" -+#include "slab.h" - #include "swap.h" - - atomic_long_t _totalram_pages __read_mostly; -@@ -404,6 +406,7 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) - { - unsigned long total = 0, reserved = 0, highmem = 0; - struct zone *zone; -+ char *buf; - - printk("Mem-Info:\n"); - __show_free_areas(filter, nodemask, max_zone_idx); -@@ -426,4 +429,23 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) - #ifdef CONFIG_MEMORY_FAILURE - printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); - #endif -+ -+ buf = kmalloc(4096, GFP_ATOMIC); -+ if (buf) { -+ struct seq_buf s; -+ -+ printk("Unreclaimable slab info:\n"); -+ seq_buf_init(&s, buf, 4096); -+ dump_unreclaimable_slab(&s); -+ seq_buf_terminate(&s); -+ printk("%s", buf); -+ -+ printk("Shrinkers:\n"); -+ seq_buf_init(&s, buf, 4096); -+ shrinkers_to_text(&s); -+ seq_buf_terminate(&s); -+ printk("%s", buf); -+ -+ kfree(buf); -+ } - } -diff --git a/mm/slab.h b/mm/slab.h -index 9c0e09d0f..7bcf32b47 100644 ---- a/mm/slab.h -+++ 
b/mm/slab.h -@@ -817,10 +817,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) - if ((__n = get_node(__s, __node))) - - -+struct seq_buf; -+ - #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) --void dump_unreclaimable_slab(void); -+void dump_unreclaimable_slab(struct seq_buf *); - #else --static inline void dump_unreclaimable_slab(void) -+static inline void dump_unreclaimable_slab(struct seq_buf *out) - { - } - #endif -diff --git a/mm/slab_common.c b/mm/slab_common.c -index d1555ea29..fbd6b879d 100644 ---- a/mm/slab_common.c -+++ b/mm/slab_common.c -@@ -26,6 +26,7 @@ - #include - #include - #include -+#include - #include - - #include "internal.h" -@@ -1273,10 +1274,15 @@ static int slab_show(struct seq_file *m, void *p) - return 0; - } - --void dump_unreclaimable_slab(void) -+void dump_unreclaimable_slab(struct seq_buf *out) - { - struct kmem_cache *s; - struct slabinfo sinfo; -+ struct slab_by_mem { -+ struct kmem_cache *s; -+ size_t total, active; -+ } slabs_by_mem[10], n; -+ int i, nr = 0; - - /* - * Here acquiring slab_mutex is risky since we don't prefer to get -@@ -1286,24 +1292,52 @@ void dump_unreclaimable_slab(void) - * without acquiring the mutex. - */ - if (!mutex_trylock(&slab_mutex)) { -- pr_warn("excessive unreclaimable slab but cannot dump stats\n"); -+ seq_buf_puts(out, "excessive unreclaimable slab but cannot dump stats\n"); - return; - } - -- pr_info("Unreclaimable slab info:\n"); -- pr_info("Name Used Total\n"); -- - list_for_each_entry(s, &slab_caches, list) { - if (s->flags & SLAB_RECLAIM_ACCOUNT) - continue; - - get_slabinfo(s, &sinfo); - -- if (sinfo.num_objs > 0) -- pr_info("%-17s %10luKB %10luKB\n", s->name, -- (sinfo.active_objs * s->size) / 1024, -- (sinfo.num_objs * s->size) / 1024); -+ if (!sinfo.num_objs) -+ continue; -+ -+ n.s = s; -+ n.total = sinfo.num_objs * s->size; -+ n.active = sinfo.active_objs * s->size; -+ -+ for (i = 0; i < nr; i++) -+ if (n.total < slabs_by_mem[i].total) -+ break; -+ -+ if (nr < ARRAY_SIZE(slabs_by_mem)) { -+ memmove(&slabs_by_mem[i + 1], -+ &slabs_by_mem[i], -+ sizeof(slabs_by_mem[0]) * (nr - i)); -+ nr++; -+ } else if (i) { -+ i--; -+ memmove(&slabs_by_mem[0], -+ &slabs_by_mem[1], -+ sizeof(slabs_by_mem[0]) * i); -+ } else { -+ continue; -+ } -+ -+ slabs_by_mem[i] = n; -+ } -+ -+ for (i = nr - 1; i >= 0; --i) { -+ seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name); -+ seq_buf_human_readable_u64(out, slabs_by_mem[i].total); -+ seq_buf_printf(out, " active: "); -+ seq_buf_human_readable_u64(out, slabs_by_mem[i].active); -+ seq_buf_putc(out, '\n'); - } -+ - mutex_unlock(&slab_mutex); - } - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 445ce9324..19067fa9a 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -57,6 +57,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -702,7 +703,6 @@ static int __prealloc_shrinker(struct shrinker *shrinker) - return 0; - } - --#ifdef CONFIG_SHRINKER_DEBUG - int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) - { - va_list ap; -@@ -722,19 +722,12 @@ int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) - - return err; - } --#else --int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
--{ -- return __prealloc_shrinker(shrinker); --} --#endif - - void free_prealloced_shrinker(struct shrinker *shrinker) - { --#ifdef CONFIG_SHRINKER_DEBUG - kfree_const(shrinker->name); - shrinker->name = NULL; --#endif -+ - if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - down_write(&shrinker_rwsem); - unregister_memcg_shrinker(shrinker); -@@ -765,7 +758,6 @@ static int __register_shrinker(struct shrinker *shrinker) - return 0; - } - --#ifdef CONFIG_SHRINKER_DEBUG - int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) - { - va_list ap; -@@ -784,12 +776,6 @@ int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) - } - return err; - } --#else --int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) --{ -- return __register_shrinker(shrinker); --} --#endif - EXPORT_SYMBOL(register_shrinker); - - /* -@@ -815,6 +801,9 @@ void unregister_shrinker(struct shrinker *shrinker) - - kfree(shrinker->nr_deferred); - shrinker->nr_deferred = NULL; -+ -+ kfree_const(shrinker->name); -+ shrinker->name = NULL; - } - EXPORT_SYMBOL(unregister_shrinker); - -@@ -833,6 +822,80 @@ void synchronize_shrinkers(void) - } - EXPORT_SYMBOL(synchronize_shrinkers); - -+void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker) -+{ -+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; -+ -+ seq_buf_puts(out, shrinker->name); -+ seq_buf_putc(out, '\n'); -+ -+ seq_buf_printf(out, "objects: %lu\n", shrinker->count_objects(shrinker, &sc)); -+ seq_buf_printf(out, "requested to free: %lu\n", atomic_long_read(&shrinker->objects_requested_to_free)); -+ seq_buf_printf(out, "objects freed: %lu\n", atomic_long_read(&shrinker->objects_freed)); -+ -+ if (shrinker->to_text) { -+ shrinker->to_text(out, shrinker); -+ seq_buf_puts(out, "\n"); -+ } -+} -+ -+/** -+ * shrinkers_to_text - Report on shrinkers with highest usage -+ * -+ * This reports on the top 10 shrinkers, by object counts, in sorted order: -+ * intended to be used for OOM reporting. 
-+ */ -+void shrinkers_to_text(struct seq_buf *out) -+{ -+ struct shrinker *shrinker; -+ struct shrinker_by_mem { -+ struct shrinker *shrinker; -+ unsigned long mem; -+ } shrinkers_by_mem[10]; -+ int i, nr = 0; -+ -+ if (!down_read_trylock(&shrinker_rwsem)) { -+ seq_buf_puts(out, "(couldn't take shrinker lock)"); -+ return; -+ } -+ -+ list_for_each_entry(shrinker, &shrinker_list, list) { -+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; -+ unsigned long mem = shrinker->count_objects(shrinker, &sc); -+ -+ if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY) -+ continue; -+ -+ for (i = 0; i < nr; i++) -+ if (mem < shrinkers_by_mem[i].mem) -+ break; -+ -+ if (nr < ARRAY_SIZE(shrinkers_by_mem)) { -+ memmove(&shrinkers_by_mem[i + 1], -+ &shrinkers_by_mem[i], -+ sizeof(shrinkers_by_mem[0]) * (nr - i)); -+ nr++; -+ } else if (i) { -+ i--; -+ memmove(&shrinkers_by_mem[0], -+ &shrinkers_by_mem[1], -+ sizeof(shrinkers_by_mem[0]) * i); -+ } else { -+ continue; -+ } -+ -+ shrinkers_by_mem[i] = (struct shrinker_by_mem) { -+ .shrinker = shrinker, -+ .mem = mem, -+ }; -+ } -+ -+ for (i = nr - 1; i >= 0; --i) -+ shrinker_to_text(out, shrinkers_by_mem[i].shrinker); -+ -+ up_read(&shrinker_rwsem); -+} -+ - #define SHRINK_BATCH 128 - - static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, -@@ -899,12 +962,16 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - unsigned long ret; - unsigned long nr_to_scan = min(batch_size, total_scan); - -+ atomic_long_add(nr_to_scan, &shrinker->objects_requested_to_free); -+ - shrinkctl->nr_to_scan = nr_to_scan; - shrinkctl->nr_scanned = nr_to_scan; - ret = shrinker->scan_objects(shrinker, shrinkctl); - if (ret == SHRINK_STOP) - break; -+ - freed += ret; -+ atomic_long_add(ret, &shrinker->objects_freed); - - count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); - total_scan -= shrinkctl->nr_scanned; -diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include -index 7778cc97a..5341736f2 100644 ---- a/scripts/Kbuild.include -+++ b/scripts/Kbuild.include -@@ -277,3 +277,13 @@ ifneq ($(and $(filter notintermediate, $(.FEATURES)),$(filter-out 4.4,$(MAKE_VER - else - .SECONDARY: - endif -+ -+ # expand_parents(a/b/c) = a/b/c a/b a -+expand_parents2 = $(if $(subst .,,$(1)),$(call expand_parents,$(1)),) -+expand_parents = $(1) $(call expand_parents2,$(patsubst %/,%,$(dir $(1)))) -+ -+# flatten_dirs(a/b/c) = a_b_c a_b a -+flatten_dirs = $(subst /,_,$(call expand_parents,$(1))) -+ -+# eval_vars(X_,a/b/c) = $(X_a_b_c) $(X_a_b) $(X_a) -+eval_vars = $(foreach var,$(call flatten_dirs,$(2)),$($(1)$(var))) -diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib -index 68d0134bd..48ded392d 100644 ---- a/scripts/Makefile.lib -+++ b/scripts/Makefile.lib -@@ -148,7 +148,7 @@ _cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(target-stem).lds) - # - ifeq ($(CONFIG_GCOV_KERNEL),y) - _c_flags += $(if $(patsubst n%,, \ -- $(GCOV_PROFILE_$(basetarget).o)$(GCOV_PROFILE)$(CONFIG_GCOV_PROFILE_ALL)), \ -+ $(GCOV_PROFILE_$(basetarget).o)$(call eval_vars,GCOV_PROFILE_,$(src))$(GCOV_PROFILE)$(CONFIG_GCOV_PROFILE_ALL)), \ - $(CFLAGS_GCOV)) - endif - -diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c -index 653b92f6d..47978efe4 100644 ---- a/scripts/kallsyms.c -+++ b/scripts/kallsyms.c -@@ -204,6 +204,11 @@ static int symbol_in_range(const struct sym_entry *s, - return 0; - } - -+static bool string_starts_with(const char *s, const char *prefix) -+{ -+ return strncmp(s, prefix, strlen(prefix)) == 0; -+} -+ - static int symbol_valid(const struct 
sym_entry *s) - { - const char *name = sym_name(s); -@@ -211,6 +216,14 @@ static int symbol_valid(const struct sym_entry *s) - /* if --all-symbols is not specified, then symbols outside the text - * and inittext sections are discarded */ - if (!all_symbols) { -+ /* -+ * Symbols starting with __start and __stop are used to denote -+ * section boundaries, and should always be included: -+ */ -+ if (string_starts_with(name, "__start_") || -+ string_starts_with(name, "__stop_")) -+ return 1; -+ - if (symbol_in_range(s, text_ranges, - ARRAY_SIZE(text_ranges)) == 0) - return 0; --- -2.42.0 diff --git a/scripts/config.sh b/scripts/config.sh index 99ff444..562540c 100755 --- a/scripts/config.sh +++ b/scripts/config.sh @@ -8,8 +8,8 @@ scripts/config -k -e CONFIG_GENERIC_CPU scripts/config -e CACHY scripts/config -e SCHED_BORE -scripts/config -e HZ_300 --set-val HZ 1000 -scripts/config -d HZ_PERIODIC -d NO_HZ_FULL -e NO_HZ_IDLE -e NO_HZ -e NO_HZ_COMMON +scripts/config -e HZ_300 --set-val HZ 500 +scripts/config -d HZ_PERIODIC -d NO_HZ_IDLE -d CONTEXT_TRACKING_FORCE -e NO_HZ_FULL_NODEF -e NO_HZ_FULL -e NO_HZ -e NO_HZ_COMMON -e CONTEXT_TRACKING scripts/config -e PREEMPT_BUILD -d PREEMPT_NONE -d PREEMPT_VOLUNTARY -e PREEMPT -e PREEMPT_COUNT -e PREEMPTION -e PREEMPT_DYNAMIC scripts/config -d CC_OPTIMIZE_FOR_PERFORMANCE \ @@ -17,13 +17,21 @@ scripts/config -d CC_OPTIMIZE_FOR_PERFORMANCE \ scripts/config -m TCP_CONG_CUBIC \ -d DEFAULT_CUBIC \ - -e TCP_CONG_BBR2 \ - -e DEFAULT_BBR2 \ - --set-str DEFAULT_TCP_CONG bbr2 + -e TCP_CONG_BBR \ + -e DEFAULT_BBR \ + --set-str DEFAULT_TCP_CONG bbr + +scripts/config -m NET_SCH_FQ_CODEL \ + -e NET_SCH_FQ \ + -d DEFAULT_FQ_CODEL \ + -e DEFAULT_FQ \ + --set-str DEFAULT_NET_SCH fq scripts/config -e LRU_GEN -e LRU_GEN_ENABLED -d LRU_GEN_STATS -scripts/config -d TRANSPARENT_HUGEPAGE_ALWAYS -e TRANSPARENT_HUGEPAGE_MADVISE +scripts/config -d TRANSPARENT_HUGEPAGE_MADVISE -e TRANSPARENT_HUGEPAGE_ALWAYS + +scripts/config -e PER_VMA_LOCK -d PER_VMA_LOCK_STATS scripts/config -e DAMON \ -e DAMON_VADDR \ @@ -33,13 +41,6 @@ scripts/config -e DAMON \ -e DAMON_RECLAIM \ -e DAMON_LRU_SORT -scripts/config -d ZRAM_DEF_COMP_LZORLE \ - -e ZRAM_DEF_COMP_ZSTD \ - --set-str ZRAM_DEF_COMP zstd \ - -d ZSWAP_COMPRESSOR_DEFAULT_LZ4 \ - -e ZSWAP_COMPRESSOR_DEFAULT_ZSTD \ - --set-str ZSWAP_COMPRESSOR_DEFAULT zstd - scripts/config --set-val MODULE_COMPRESS_ZSTD_LEVEL 19 -e MODULE_COMPRESS_ZSTD_ULTRA --set-val MODULE_COMPRESS_ZSTD_LEVEL_ULTRA 22 --set-val ZSTD_COMP_VAL 22 scripts/config -e EFI_HANDOVER_PROTOCOL @@ -47,8 +48,11 @@ scripts/config -e EFI_HANDOVER_PROTOCOL scripts/config -e USER_NS scripts/config -d DEBUG_INFO \ + -d DEBUG_INFO_BTF \ -d DEBUG_INFO_DWARF4 \ -d DEBUG_INFO_DWARF5 \ + -d PAHOLE_HAS_SPLIT_BTF \ + -d DEBUG_INFO_BTF_MODULES \ -d SLUB_DEBUG \ -d PM_DEBUG \ -d PM_ADVANCED_DEBUG \ diff --git a/scripts/output.sh b/scripts/output.sh index 7659f11..10c2b93 100755 --- a/scripts/output.sh +++ b/scripts/output.sh @@ -7,5 +7,5 @@ rm ./linux-libc*.deb for f in *.deb; do - cp $f ./output/lunar_$f + cp $f ./output/mantic_$f done diff --git a/scripts/patch.sh b/scripts/patch.sh index 0cf2285..6ee8562 100755 --- a/scripts/patch.sh +++ b/scripts/patch.sh @@ -9,8 +9,6 @@ patch -Np1 < "../patches/0001-cachy-all.patch" patch -Np1 < "../patches/0002-eevdf.patch" # orig patch from cachy patch -Np1 < "../patches/0002-eevdfbore.patch" -# orig patch from cachy -#patch -Np1 < "../patches/0003-bcachefs.patch" # Nobara patches are here: https://github.com/sammilucia/nobara-kernel-fork # Allow setting 
custom pollrates for usb devices patch -Np1 < "../patches/0004-Allow-to-set-custom-USB-pollrate-for-specific-device.patch" diff --git a/scripts/source.sh b/scripts/source.sh index ec6b297..b82d5bc 100755 --- a/scripts/source.sh +++ b/scripts/source.sh @@ -2,7 +2,7 @@ echo "Pika Kernel - Getting source" -wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.5.tar.gz -tar -xf ./linux-6.5.tar.gz +wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.5.5.tar.gz +tar -xf ./linux-6.5.5.tar.gz -cd linux-6.5 +cd linux-6.5.5
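
A minimal usage sketch (not part of the changeset above) of how the scripts/config calls in scripts/config.sh land in the generated .config, assuming the unpacked ./linux-6.5.5 tree from source.sh and an already-present .config; the symbols are taken from this changeset, while the queried outputs are expectations based on the stock kernel scripts/config tool, not something this repo asserts:

cd linux-6.5.5                                   # tree produced by scripts/source.sh
# -e writes CONFIG_<SYM>=y, -d writes "# CONFIG_<SYM> is not set",
# -m writes CONFIG_<SYM>=m, --set-val/--set-str write literal values into .config
scripts/config -e NO_HZ_FULL --set-val HZ 500
scripts/config -m TCP_CONG_CUBIC -e TCP_CONG_BBR --set-str DEFAULT_TCP_CONG bbr
# -s/--state reads a symbol back (y, m, n, undef, or the stored value)
scripts/config -s HZ               # expected: 500
scripts/config -s TCP_CONG_BBR     # expected: y
scripts/config -s TCP_CONG_CUBIC   # expected: m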