From 3e7036b67d953ed23998cbf60b240da02f31e277 Mon Sep 17 00:00:00 2001 From: ferrreo Date: Fri, 28 Apr 2023 19:52:05 +0100 Subject: [PATCH] Update cachy patchset --- patches/0001-cachy-all.patch | 16604 ++++++++++++++++++++++++++++++++- 1 file changed, 16519 insertions(+), 85 deletions(-) diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index b307560..f7a6979 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,7 +1,7 @@ -From a2522409b71cfd3a4f7fc95effca4c322adaf7b0 Mon Sep 17 00:00:00 2001 +From 0ca55b20120a052c587868cb3199edaa41634a3b Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 6 Mar 2023 18:43:03 +0100 -Subject: [PATCH 1/8] bbr2 +Subject: [PATCH 01/10] bbr2 Signed-off-by: Peter Jung --- @@ -3283,10 +3283,10 @@ index cb79127f45c3..70e4de876a7f 100644 -- 2.40.1 -From 0d9e557b60746641c464bab65aae86fd78cb9024 Mon Sep 17 00:00:00 2001 +From 0927bc0b168ee599f356a757df60102be68472dc Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 17 Apr 2023 18:21:50 +0200 -Subject: [PATCH 2/8] bfq +Subject: [PATCH 02/10] bfq Signed-off-by: Peter Jung --- @@ -3329,79 +3329,81 @@ index d9ed3108c17a..66146bbcd4af 100644 -- 2.40.1 -From 7b6e9ae435973f69a18f51d226879b128fa6026f Mon Sep 17 00:00:00 2001 +From 978269efc945dfd3e330da87db88188fab9b92c1 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Wed, 26 Apr 2023 22:04:07 +0200 -Subject: [PATCH 3/8] cachy +Date: Fri, 28 Apr 2023 19:58:48 +0200 +Subject: [PATCH 03/10] cachy Signed-off-by: Peter Jung --- - .gitignore | 1 + - .../admin-guide/kernel-parameters.txt | 12 + - Documentation/dontdiff | 1 + - Makefile | 8 +- - arch/arc/configs/axs101_defconfig | 1 + - arch/arc/configs/axs103_defconfig | 1 + - arch/arc/configs/axs103_smp_defconfig | 1 + - arch/arc/configs/haps_hs_defconfig | 1 + - arch/arc/configs/haps_hs_smp_defconfig | 1 + - arch/arc/configs/hsdk_defconfig | 1 + - arch/arc/configs/nsim_700_defconfig | 1 + - arch/arc/configs/nsimosci_defconfig | 1 + - arch/arc/configs/nsimosci_hs_defconfig | 1 + - arch/arc/configs/nsimosci_hs_smp_defconfig | 1 + - arch/arc/configs/tb10x_defconfig | 1 + - arch/arc/configs/vdk_hs38_defconfig | 1 + - arch/arc/configs/vdk_hs38_smp_defconfig | 1 + - arch/x86/Kconfig.cpu | 416 ++++++++++- - arch/x86/Makefile | 45 +- - arch/x86/Makefile.postlink | 41 ++ - arch/x86/boot/compressed/.gitignore | 1 - - arch/x86/boot/compressed/Makefile | 10 +- - arch/x86/include/asm/pci.h | 6 + - arch/x86/include/asm/vermagic.h | 72 ++ - arch/x86/pci/common.c | 7 +- - drivers/Makefile | 15 +- - drivers/ata/ahci.c | 23 +- - drivers/cpufreq/Kconfig.x86 | 2 - - drivers/cpufreq/intel_pstate.c | 2 + - drivers/i2c/busses/Kconfig | 9 + - drivers/i2c/busses/Makefile | 1 + - drivers/i2c/busses/i2c-nct6775.c | 647 ++++++++++++++++++ - drivers/i2c/busses/i2c-piix4.c | 4 +- - drivers/md/dm-crypt.c | 5 + - drivers/pci/controller/Makefile | 6 + - drivers/pci/controller/intel-nvme-remap.c | 462 +++++++++++++ - drivers/pci/quirks.c | 101 +++ - drivers/platform/x86/Kconfig | 14 + - drivers/platform/x86/Makefile | 3 + - drivers/platform/x86/steamdeck.c | 523 ++++++++++++++ - include/linux/pagemap.h | 2 +- - include/linux/user_namespace.h | 4 + - include/net/netns/ipv4.h | 1 + - include/trace/events/tcp.h | 7 + - init/Kconfig | 39 ++ - kernel/Kconfig.hz | 24 + - kernel/fork.c | 14 + - kernel/module/Kconfig | 25 + - kernel/sched/fair.c | 20 +- - kernel/sysctl.c | 12 + - kernel/user_namespace.c | 7 + - mm/Kconfig | 2 +- - mm/compaction.c | 4 + - mm/page-writeback.c | 8 + - mm/swap.c | 5 + - 
mm/vmpressure.c | 4 + - mm/vmscan.c | 8 + - net/ipv4/sysctl_net_ipv4.c | 7 + - net/ipv4/tcp_input.c | 36 + - net/ipv4/tcp_ipv4.c | 2 + - scripts/Makefile.lib | 13 +- - scripts/Makefile.modinst | 7 +- - 62 files changed, 2637 insertions(+), 64 deletions(-) + .gitignore | 1 + + .../admin-guide/kernel-parameters.txt | 12 + + Documentation/dontdiff | 1 + + Makefile | 8 +- + arch/arc/configs/axs101_defconfig | 1 + + arch/arc/configs/axs103_defconfig | 1 + + arch/arc/configs/axs103_smp_defconfig | 1 + + arch/arc/configs/haps_hs_defconfig | 1 + + arch/arc/configs/haps_hs_smp_defconfig | 1 + + arch/arc/configs/hsdk_defconfig | 1 + + arch/arc/configs/nsim_700_defconfig | 1 + + arch/arc/configs/nsimosci_defconfig | 1 + + arch/arc/configs/nsimosci_hs_defconfig | 1 + + arch/arc/configs/nsimosci_hs_smp_defconfig | 1 + + arch/arc/configs/tb10x_defconfig | 1 + + arch/arc/configs/vdk_hs38_defconfig | 1 + + arch/arc/configs/vdk_hs38_smp_defconfig | 1 + + arch/x86/Kconfig.cpu | 416 ++- + arch/x86/Makefile | 45 +- + arch/x86/Makefile.postlink | 41 + + arch/x86/boot/compressed/.gitignore | 1 - + arch/x86/boot/compressed/Makefile | 10 +- + arch/x86/include/asm/pci.h | 6 + + arch/x86/include/asm/vermagic.h | 72 + + arch/x86/pci/common.c | 7 +- + drivers/Makefile | 15 +- + drivers/ata/ahci.c | 23 +- + drivers/cpufreq/Kconfig.x86 | 2 - + drivers/cpufreq/intel_pstate.c | 2 + + drivers/i2c/busses/Kconfig | 9 + + drivers/i2c/busses/Makefile | 1 + + drivers/i2c/busses/i2c-nct6775.c | 647 ++++ + drivers/i2c/busses/i2c-piix4.c | 4 +- + drivers/md/dm-crypt.c | 5 + + drivers/pci/controller/Makefile | 6 + + drivers/pci/controller/intel-nvme-remap.c | 462 +++ + drivers/pci/quirks.c | 101 + + drivers/platform/x86/Kconfig | 24 + + drivers/platform/x86/Makefile | 4 + + drivers/platform/x86/legion-laptop.c | 2783 +++++++++++++++++ + drivers/platform/x86/steamdeck.c | 523 ++++ + include/linux/pagemap.h | 2 +- + include/linux/user_namespace.h | 4 + + include/net/netns/ipv4.h | 1 + + include/trace/events/tcp.h | 7 + + init/Kconfig | 39 + + kernel/Kconfig.hz | 24 + + kernel/fork.c | 14 + + kernel/module/Kconfig | 25 + + kernel/sched/fair.c | 20 +- + kernel/sysctl.c | 12 + + kernel/user_namespace.c | 7 + + mm/Kconfig | 2 +- + mm/compaction.c | 4 + + mm/page-writeback.c | 8 + + mm/swap.c | 5 + + mm/vmpressure.c | 4 + + mm/vmscan.c | 8 + + net/ipv4/sysctl_net_ipv4.c | 7 + + net/ipv4/tcp_input.c | 36 + + net/ipv4/tcp_ipv4.c | 2 + + scripts/Makefile.lib | 13 +- + scripts/Makefile.modinst | 7 +- + 63 files changed, 5431 insertions(+), 64 deletions(-) create mode 100644 arch/x86/Makefile.postlink create mode 100644 drivers/i2c/busses/i2c-nct6775.c create mode 100644 drivers/pci/controller/intel-nvme-remap.c + create mode 100644 drivers/platform/x86/legion-laptop.c create mode 100644 drivers/platform/x86/steamdeck.c diff --git a/.gitignore b/.gitignore @@ -5889,10 +5891,27 @@ index 44cab813bf95..25edf55de985 100644 }; diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig -index 4a01b315e0a9..e9ddf76b8b57 100644 +index 4a01b315e0a9..e4a6c31a80df 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig -@@ -1099,6 +1099,20 @@ config WINMATE_FM07_KEYS +@@ -641,6 +641,16 @@ config THINKPAD_LMI + To compile this driver as a module, choose M here: the module will + be called think-lmi. 
+ ++config LEGION_LAPTOP ++ tristate "Lenovo Legion Laptop Extras" ++ depends on ACPI ++ depends on ACPI_WMI || ACPI_WMI = n ++ depends on HWMON || HWMON = n ++ select ACPI_PLATFORM_PROFILE ++ help ++ This is a driver for Lenovo Legion laptops and contains drivers for ++ hotkey, fan control, and power mode. ++ + source "drivers/platform/x86/intel/Kconfig" + + config MSI_LAPTOP +@@ -1099,6 +1109,20 @@ config WINMATE_FM07_KEYS buttons below the display. This module adds an input device that delivers key events when these buttons are pressed. @@ -5914,16 +5933,2813 @@ index 4a01b315e0a9..e9ddf76b8b57 100644 config P2SB diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile -index 1d3d1b02541b..75b30a3face9 100644 +index 1d3d1b02541b..fde9a683103e 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile -@@ -134,3 +134,6 @@ obj-$(CONFIG_SIEMENS_SIMATIC_IPC) += simatic-ipc.o +@@ -66,6 +66,7 @@ obj-$(CONFIG_IDEAPAD_LAPTOP) += ideapad-laptop.o + obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o + obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o + obj-$(CONFIG_THINKPAD_LMI) += think-lmi.o ++obj-$(CONFIG_LEGION_LAPTOP) += legion-laptop.o + + # Intel + obj-y += intel/ +@@ -134,3 +135,6 @@ obj-$(CONFIG_SIEMENS_SIMATIC_IPC) += simatic-ipc.o # Winmate obj-$(CONFIG_WINMATE_FM07_KEYS) += winmate-fm07-keys.o + +# Steam Deck +obj-$(CONFIG_STEAMDECK) += steamdeck.o +diff --git a/drivers/platform/x86/legion-laptop.c b/drivers/platform/x86/legion-laptop.c +new file mode 100644 +index 000000000000..d1268d239cc5 +--- /dev/null ++++ b/drivers/platform/x86/legion-laptop.c +@@ -0,0 +1,2783 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * legion-laptop.c - Extra Lenovo Legion laptop support, in ++ * particular for fan curve control and power mode. ++ * ++ * Copyright (C) 2022 johnfan ++ * ++ * ++ * This driver might work on other Lenovo Legion models. If you ++ * want to try it you can pass force=1 as argument ++ * to the module which will force it to load even when the DMI ++ * data doesn't match the model AND FIRMWARE. ++ * ++ * Support for other hardware of this model is already partially ++ * provided by the module ideapd-laptop. ++ * ++ * The development page for this driver is located at ++ * https://github.com/johnfanv2/LenovoLegionLinux ++ * ++ * This driver exports the files: ++ * - /sys/kernel/debug/legion/fancurve (ro) ++ * The fan curve in the form stored in the firmware in an ++ * human readable table. ++ * ++ * - /sys/module/legion_laptop/drivers/platform\:legion/PNP0C09\:00/powermode (rw) ++ * 0: balanced mode (white) ++ * 1: performance mode (red) ++ * 2: quiet mode (blue) ++ * ?: custom mode (pink) ++ * ++ * NOTE: Writing to this will load the default fan curve from ++ * the firmware for this mode, so the fan curve might ++ * have to be reconfigured if needed. ++ * ++ * It implements the usual hwmon interface to monitor fan speed and temmperature ++ * and allows to set the fan curve inside the firware. ++ * ++ * - /sys/class/hwmon/X/fan1_input or /sys/class/hwmon/X/fan2_input (ro) ++ * Current fan speed of fan1/fan2. ++ * - /sys/class/hwmon/X/temp1_input (ro) ++ * - /sys/class/hwmon/X/temp2_input (ro) ++ * - /sys/class/hwmon/X/temp3_input (ro) ++ * Temperature (Celsius) of CPU, GPU, and IC used for fan control. 
++ * - /sys/class/hwmon/X/pwmY_auto_pointZ_pwm (rw) ++ * PWM (0-255) of the fan at the Y-level in the fan curve ++ * - /sys/class/hwmon/X/pwmY_auto_pointZ_temp (rw) ++ * upper temperature of tempZ (CPU, GPU, or IC) at the Y-level in the fan curve ++ * - /sys/class/hwmon/X/pwmY_auto_pointZ_temp_hyst (rw) ++ * hysteris (CPU, GPU, or IC) at the Y-level in the fan curve. The lower ++ * temperatue of the level is the upper temperature minus the hysteris ++ * ++ * ++ * Credits for reverse engineering the firmware to: ++ * - David Woodhouse: heavily inspired by lenovo_laptop.c ++ * - Luke Cama: Windows version "LegionFanControl" ++ * - SmokelessCPU: reverse engineering of custom registers in EC ++ * and commincation method with EC via ports ++ * - 0x1F9F1: additional reverse engineering for complete fan curve ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("johnfan"); ++MODULE_DESCRIPTION("Lenovo Legion laptop extras"); ++ ++static bool force; ++module_param(force, bool, 0440); ++MODULE_PARM_DESC( ++ force, ++ "Force loading this module even if model or BIOS does not match."); ++ ++static bool ec_readonly; ++module_param(ec_readonly, bool, 0440); ++MODULE_PARM_DESC( ++ ec_readonly, ++ "Only read from embedded controller but do not write or change settings."); ++ ++#define LEGIONFEATURES \ ++ "fancurve powermode platformprofile platformprofilenotify minifancurve" ++ ++//Size of fancurve stored in embedded controller ++#define MAXFANCURVESIZE 10 ++ ++#define LEGION_DRVR_SHORTNAME "legion" ++#define LEGION_HWMON_NAME LEGION_DRVR_SHORTNAME "_hwmon" ++ ++/* =============================== */ ++/* Embedded Controller Description */ ++/* =============================== */ ++ ++/* The configuration and registers to access the embedded controller ++ * depending on different the version of the software on the ++ * embedded controller or and the BIOS/UEFI firmware. ++ * ++ * To control fan curve in the embedded controller (EC) one has to ++ * write to its "RAM". There are different possibilities: ++ * - EC RAM is memory mapped (write to it with ioremap) ++ * - access EC RAM via ported mapped IO (outb/inb) ++ * - access EC RAM via ACPI methods. It is only possible to write ++ * to part of it (first 0xFF bytes?) ++ * ++ * In later models the firmware directly exposes ACPI methods to ++ * set the fan curve direclty, without writing to EC RAM. This ++ * is done inside the ACPI method. ++ */ ++ ++/** ++ * Offsets for interseting values inside the EC RAM (0 = start of ++ * EC RAM. These might change depending on the software inside of ++ * the EC, which can be updated by a BIOS update from Lenovo. 
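++ *
++ * For example, with the v0 register layout defined below, the fan
++ * curve speed levels of fan 1 start at EXT_FAN1_BASE (0xC540), so
++ * the value for fan curve level i is accessed at EXT_FAN1_BASE + i;
++ * this is how read_fancurve() and write_fancurve() index into EC RAM.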
++ */ ++// TODO: same order as in initialization ++struct ec_register_offsets { ++ // Super I/O Configuration Registers ++ // 7.15 General Control (GCTRL) ++ // General Control (GCTRL) ++ // (see EC Interface Registers and 6.2 Plug and Play Configuration (PNPCFG)) in datasheet ++ // note: these are in two places saved ++ // in EC Interface Registers and in super io configuraion registers ++ // Chip ID ++ u16 ECHIPID1; ++ u16 ECHIPID2; ++ // Chip Version ++ u16 ECHIPVER; ++ u16 ECDEBUG; ++ ++ // Lenovo Custom OEM extension ++ // Firmware of ITE can be extended by ++ // custom program using its own "variables" ++ // These are the offsets to these "variables" ++ u16 EXT_FAN_CUR_POINT; ++ u16 EXT_FAN_POINTS_SIZE; ++ u16 EXT_FAN1_BASE; ++ u16 EXT_FAN2_BASE; ++ u16 EXT_FAN_ACC_BASE; ++ u16 EXT_FAN_DEC_BASE; ++ u16 EXT_CPU_TEMP; ++ u16 EXT_CPU_TEMP_HYST; ++ u16 EXT_GPU_TEMP; ++ u16 EXT_GPU_TEMP_HYST; ++ u16 EXT_VRM_TEMP; ++ u16 EXT_VRM_TEMP_HYST; ++ u16 EXT_FAN1_RPM_LSB; ++ u16 EXT_FAN1_RPM_MSB; ++ u16 EXT_FAN2_RPM_LSB; ++ u16 EXT_FAN2_RPM_MSB; ++ u16 EXT_FAN1_TARGET_RPM; ++ u16 EXT_FAN2_TARGET_RPM; ++ u16 EXT_POWERMODE; ++ u16 EXT_MINIFANCURVE_ON_COOL; ++ // values ++ // 0x04: enable mini fan curve if very long on cool level ++ // - this might be due to potential temp failure ++ // - or just because really so cool ++ // 0xA0: disable it ++ u16 EXT_LOCKFANCONTROLLER; ++ u16 EXT_MAXIMUMFANSPEED; ++ u16 EXT_WHITE_KEYBOARD_BACKLIGHT; ++ u16 EXT_IC_TEMP_INPUT; ++ u16 EXT_CPU_TEMP_INPUT; ++ u16 EXT_GPU_TEMP_INPUT; ++}; ++ ++struct model_config { ++ const struct ec_register_offsets *registers; ++ bool check_embedded_controller_id; ++ u16 embedded_controller_id; ++ ++ // first addr in EC we access/scan ++ phys_addr_t memoryio_physical_ec_start; ++ size_t memoryio_size; ++ ++ // TODO: maybe use bitfield ++ bool has_minifancurve; ++}; ++ ++/* =================================== */ ++/* Coinfiguration for different models */ ++/* =================================== */ ++ ++// Idea by SmokelesssCPU (modified) ++// - all default names and register addresses are supported by datasheet ++// - register addresses for custom firmware by SmokelesssCPU ++static const struct ec_register_offsets ec_register_offsets_v0 = { ++ .ECHIPID1 = 0x2000, ++ .ECHIPID2 = 0x2001, ++ .ECHIPVER = 0x2002, ++ .ECDEBUG = 0x2003, ++ .EXT_FAN_CUR_POINT = 0xC534, ++ .EXT_FAN_POINTS_SIZE = 0xC535, ++ .EXT_FAN1_BASE = 0xC540, ++ .EXT_FAN2_BASE = 0xC550, ++ .EXT_FAN_ACC_BASE = 0xC560, ++ .EXT_FAN_DEC_BASE = 0xC570, ++ .EXT_CPU_TEMP = 0xC580, ++ .EXT_CPU_TEMP_HYST = 0xC590, ++ .EXT_GPU_TEMP = 0xC5A0, ++ .EXT_GPU_TEMP_HYST = 0xC5B0, ++ .EXT_VRM_TEMP = 0xC5C0, ++ .EXT_VRM_TEMP_HYST = 0xC5D0, ++ .EXT_FAN1_RPM_LSB = 0xC5E0, ++ .EXT_FAN1_RPM_MSB = 0xC5E1, ++ .EXT_FAN2_RPM_LSB = 0xC5E2, ++ .EXT_FAN2_RPM_MSB = 0xC5E3, ++ .EXT_MINIFANCURVE_ON_COOL = 0xC536, ++ .EXT_LOCKFANCONTROLLER = 0xc4AB, ++ .EXT_CPU_TEMP_INPUT = 0xc538, ++ .EXT_GPU_TEMP_INPUT = 0xc539, ++ .EXT_IC_TEMP_INPUT = 0xC5E8, ++ .EXT_POWERMODE = 0xc420, ++ .EXT_FAN1_TARGET_RPM = 0xc600, ++ .EXT_FAN2_TARGET_RPM = 0xc601, ++ .EXT_MAXIMUMFANSPEED = 0xBD, ++ .EXT_WHITE_KEYBOARD_BACKLIGHT = (0x3B + 0xC400) ++}; ++ ++static const struct model_config model_v0 = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true ++}; ++ ++static const struct model_config model_kfcn = { ++ .registers = &ec_register_offsets_v0, ++ 
.check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = false ++}; ++ ++static const struct model_config model_hacn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = false, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = false ++}; ++ ++ ++static const struct model_config model_k9cn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = false, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, // or replace 0xC400 by 0x0400 ? ++ .memoryio_size = 0x300, ++ .has_minifancurve = false ++}; ++ ++ ++ ++static const struct dmi_system_id denylist[] = { {} }; ++ ++static const struct dmi_system_id optimistic_allowlist[] = { ++ { ++ // modelyear: 2021 ++ // generation: 6 ++ // name: Legion 5, Legion 5 pro, Legion 7 ++ // Family: Legion 5 15ACH6H, ... ++ .ident = "GKCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "GKCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2020 ++ .ident = "EUCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "EUCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2020 ++ .ident = "EFCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "EFCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2020 ++ .ident = "FSCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "FSCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2021 ++ .ident = "HHCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "HHCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2022 ++ .ident = "H1CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "H1CN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2022 ++ .ident = "J2CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "J2CN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2022 ++ .ident = "JUCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "JUCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2022 ++ .ident = "KFCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "KFCN"), ++ }, ++ .driver_data = (void *)&model_kfcn ++ }, ++ { ++ // modelyear: 2021 ++ .ident = "HACN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "HACN"), ++ }, ++ .driver_data = (void *)&model_hacn ++ }, ++ { ++ // modelyear: 2021 ++ .ident = "G9CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "G9CN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2022 ++ .ident = "K9CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "K9CN"), ++ }, ++ .driver_data = (void *)&model_k9cn ++ }, ++ {} ++}; ++ ++/* ================================= */ ++/* ACPI access */ ++/* ================================= */ ++ ++// function from ideapad-laptop.c ++static int eval_int(acpi_handle handle, const char *name, unsigned long *res) ++{ ++ unsigned long 
long result; ++ acpi_status status; ++ ++ status = acpi_evaluate_integer(handle, (char *)name, NULL, &result); ++ if (ACPI_FAILURE(status)) ++ return -EIO; ++ ++ *res = result; ++ ++ return 0; ++} ++ ++// function from ideapad-laptop.c ++static int exec_simple_method(acpi_handle handle, const char *name, ++ unsigned long arg) ++{ ++ acpi_status status = ++ acpi_execute_simple_method(handle, (char *)name, arg); ++ ++ return ACPI_FAILURE(status) ? -EIO : 0; ++} ++ ++// function from ideapad-laptop.c ++static int exec_sbmc(acpi_handle handle, unsigned long arg) ++{ ++ // \_SB.PCI0.LPC0.EC0.VPC0.SBMC ++ return exec_simple_method(handle, "SBMC", arg); ++} ++ ++static int eval_qcho(acpi_handle handle, unsigned long *res) ++{ ++ // \_SB.PCI0.LPC0.EC0.QCHO ++ return eval_int(handle, "QCHO", res); ++} ++ ++/* ================================= */ ++/* EC RAM Access with port-mapped IO */ ++/* ================================= */ ++ ++/* ++ * See datasheet of e.g. IT8502E/F/G, e.g. ++ * 6.2 Plug and Play Configuration (PNPCFG) ++ * ++ * Depending on configured BARDSEL register ++ * the ports ++ * ECRAM_PORTIO_ADDR_PORT and ++ * ECRAM_PORTIO_DATA_PORT ++ * are configured. ++ * ++ * By performing IO on these ports one can ++ * read/write to registers in the EC. ++ * ++ * "To access a register of PNPCFG, write target index to ++ * address port and access this PNPCFG register via ++ * data port" [datasheet, 6.2 Plug and Play Configuration] ++ */ ++ ++// IO ports used to write to communicate with embedded controller ++// Start of used ports ++#define ECRAM_PORTIO_START_PORT 0x4E ++// Number of used ports ++#define ECRAM_PORTIO_PORTS_SIZE 2 ++// Port used to specify address in EC RAM to read/write ++// 0x4E/0x4F is the usual port for IO super controler ++// 0x2E/0x2F also common (ITE can also be configure to use these) ++#define ECRAM_PORTIO_ADDR_PORT 0x4E ++// Port to send/receive the value to write/read ++#define ECRAM_PORTIO_DATA_PORT 0x4F ++// Name used to request ports ++#define ECRAM_PORTIO_NAME "legion" ++ ++struct ecram_portio { ++ /* protects read/write to EC RAM performed ++ * as a certain sequence of outb, inb ++ * commands on the IO ports. There can ++ * be at most one. ++ */ ++ struct mutex io_port_mutex; ++}; ++ ++ssize_t ecram_portio_init(struct ecram_portio *ec_portio) ++{ ++ if (!request_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE, ++ ECRAM_PORTIO_NAME)) { ++ pr_info("Cannot init ecram_portio the %x ports starting at %x\n", ++ ECRAM_PORTIO_PORTS_SIZE, ECRAM_PORTIO_START_PORT); ++ return -ENODEV; ++ } ++ //pr_info("Reserved %x ports starting at %x\n", ECRAM_PORTIO_PORTS_SIZE, ECRAM_PORTIO_START_PORT); ++ mutex_init(&ec_portio->io_port_mutex); ++ return 0; ++} ++ ++void ecram_portio_exit(struct ecram_portio *ec_portio) ++{ ++ release_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE); ++} ++ ++/* Read a byte from the EC RAM. ++ * ++ * Return status because of commong signature for alle ++ * methods to access EC RAM. 
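++ *
++ * Summary of the sequence implemented below: the 16-bit EC RAM
++ * offset is passed through PNPCFG registers 0x11 (high byte) and
++ * 0x10 (low byte), and the data byte through register 0x12; each of
++ * these registers is selected by writing 0x2E, then 0x2F, to
++ * ECRAM_PORTIO_ADDR_PORT while the payload goes to
++ * ECRAM_PORTIO_DATA_PORT.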
++ */ ++ssize_t ecram_portio_read(struct ecram_portio *ec_portio, u16 offset, u8 *value) ++{ ++ mutex_lock(&ec_portio->io_port_mutex); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x11, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ // TODO: no explicit cast between types seems to be sometimes ++ // done and sometimes not ++ outb((u8)((offset >> 8) & 0xFF), ECRAM_PORTIO_DATA_PORT); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x10, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ outb((u8)(offset & 0xFF), ECRAM_PORTIO_DATA_PORT); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x12, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ *value = inb(ECRAM_PORTIO_DATA_PORT); ++ ++ mutex_unlock(&ec_portio->io_port_mutex); ++ return 0; ++} ++ ++/* Write a byte to the EC RAM. ++ * ++ * Return status because of commong signature for alle ++ * methods to access EC RAM. ++ */ ++ssize_t ecram_portio_write(struct ecram_portio *ec_portio, u16 offset, u8 value) ++{ ++ mutex_lock(&ec_portio->io_port_mutex); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x11, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ // TODO: no explicit cast between types seems to be sometimes ++ // done and sometimes not ++ outb((u8)((offset >> 8) & 0xFF), ECRAM_PORTIO_DATA_PORT); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x10, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ outb((u8)(offset & 0xFF), ECRAM_PORTIO_DATA_PORT); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x12, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ outb(value, ECRAM_PORTIO_DATA_PORT); ++ ++ mutex_unlock(&ec_portio->io_port_mutex); ++ return 0; ++} ++ ++/* =================================== */ ++/* EC RAM Access */ ++/* =================================== */ ++ ++struct ecram { ++ struct ecram_portio portio; ++}; ++ ++ssize_t ecram_init(struct ecram *ecram, phys_addr_t memoryio_ec_physical_start, ++ size_t region_size) ++{ ++ ssize_t err; ++ ++ err = ecram_portio_init(&ecram->portio); ++ if (err) { ++ pr_info("Failed ecram_portio_init\n"); ++ goto err_ecram_portio_init; ++ } ++ ++ return 0; ++ ++err_ecram_portio_init: ++ return err; ++} ++ ++void ecram_exit(struct ecram *ecram) ++{ ++ pr_info("Unloading legion ecram\n"); ++ ecram_portio_exit(&ecram->portio); ++ pr_info("Unloading legion ecram done\n"); ++} ++ ++/** ++ * ecram_offset address on the EC ++ */ ++static u8 ecram_read(struct ecram *ecram, u16 ecram_offset) ++{ ++ u8 value; ++ int err; ++ ++ err = ecram_portio_read(&ecram->portio, ecram_offset, &value); ++ if (err) ++ pr_info("Error reading EC RAM at 0x%x\n", ecram_offset); ++ return value; ++} ++ ++static void ecram_write(struct ecram *ecram, u16 ecram_offset, u8 value) ++{ ++ int err; ++ ++ if (ec_readonly) { ++ pr_info("Skipping writing EC RAM at 0x%x because readonly.\n", ++ ecram_offset); ++ return; ++ } ++ err = ecram_portio_write(&ecram->portio, ecram_offset, value); ++ if (err) ++ pr_info("Error writing EC RAM at 0x%x\n", ecram_offset); ++} ++ ++/* =============================== */ ++/* Reads from EC */ ++/* =============================== */ ++ ++u16 read_ec_id(struct ecram *ecram, const struct model_config *model) ++{ ++ u8 id1 = ecram_read(ecram, model->registers->ECHIPID1); ++ u8 id2 = ecram_read(ecram, model->registers->ECHIPID2); ++ ++ return (id1 << 8) + id2; ++} ++ ++u16 read_ec_version(struct ecram *ecram, const struct model_config *model) ++{ ++ u8 vers = ecram_read(ecram, 
model->registers->ECHIPVER); ++ u8 debug = ecram_read(ecram, model->registers->ECDEBUG); ++ ++ return (vers << 8) + debug; ++} ++ ++/* ============================= */ ++/* Data model for sensor values */ ++/* ============================ */ ++ ++struct sensor_values { ++ u16 fan1_rpm; // current speed in rpm of fan 1 ++ u16 fan2_rpm; // current speed in rpm of fan2 ++ u16 fan1_target_rpm; // target speed in rpm of fan 1 ++ u16 fan2_target_rpm; // target speed in rpm of fan 2 ++ u8 cpu_temp_celsius; // cpu temperature in celcius ++ u8 gpu_temp_celsius; // gpu temperature in celcius ++ u8 ic_temp_celsius; // ic temperature in celcius ++}; ++ ++enum SENSOR_ATTR { ++ SENSOR_CPU_TEMP_ID = 1, ++ SENSOR_GPU_TEMP_ID = 2, ++ SENSOR_IC_TEMP_ID = 3, ++ SENSOR_FAN1_RPM_ID = 4, ++ SENSOR_FAN2_RPM_ID = 5, ++ SENSOR_FAN1_TARGET_RPM_ID = 6, ++ SENSOR_FAN2_TARGET_RPM_ID = 7 ++}; ++ ++static int read_sensor_values(struct ecram *ecram, ++ const struct model_config *model, ++ struct sensor_values *values) ++{ ++ values->fan1_target_rpm = ++ 100 * ecram_read(ecram, model->registers->EXT_FAN1_TARGET_RPM); ++ values->fan2_target_rpm = ++ 100 * ecram_read(ecram, model->registers->EXT_FAN2_TARGET_RPM); ++ ++ values->fan1_rpm = ++ ecram_read(ecram, model->registers->EXT_FAN1_RPM_LSB) + ++ (((int)ecram_read(ecram, model->registers->EXT_FAN1_RPM_MSB)) ++ << 8); ++ values->fan2_rpm = ++ ecram_read(ecram, model->registers->EXT_FAN2_RPM_LSB) + ++ (((int)ecram_read(ecram, model->registers->EXT_FAN2_RPM_MSB)) ++ << 8); ++ ++ values->cpu_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_CPU_TEMP_INPUT); ++ values->gpu_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_GPU_TEMP_INPUT); ++ values->ic_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_IC_TEMP_INPUT); ++ ++ values->cpu_temp_celsius = ecram_read(ecram, 0xC5E6); ++ values->gpu_temp_celsius = ecram_read(ecram, 0xC5E7); ++ values->ic_temp_celsius = ecram_read(ecram, 0xC5E8); ++ ++ return 0; ++} ++ ++/* =============================== */ ++/* Behaviour changing functions */ ++/* =============================== */ ++ ++int read_powermode(struct ecram *ecram, const struct model_config *model) ++{ ++ return ecram_read(ecram, model->registers->EXT_POWERMODE); ++} ++ ++ssize_t write_powermode(struct ecram *ecram, const struct model_config *model, ++ u8 value) ++{ ++ if (!(value >= 0 && value <= 2)) { ++ pr_info("Unexpected power mode value ignored: %d\n", value); ++ return -ENOMEM; ++ } ++ ecram_write(ecram, model->registers->EXT_POWERMODE, value); ++ return 0; ++} ++ ++/** ++ * Shortly toggle powermode to a different mode ++ * and switch back, e.g. to reset fan curve. ++ */ ++void toggle_powermode(struct ecram *ecram, const struct model_config *model) ++{ ++ int old_powermode = read_powermode(ecram, model); ++ int next_powermode = old_powermode == 0 ? 1 : 0; ++ ++ write_powermode(ecram, model, next_powermode); ++ mdelay(1500); ++ write_powermode(ecram, model, old_powermode); ++} ++ ++#define lockfancontroller_ON 8 ++#define lockfancontroller_OFF 0 ++ ++ssize_t write_lockfancontroller(struct ecram *ecram, ++ const struct model_config *model, bool state) ++{ ++ u8 val = state ? 
lockfancontroller_ON : lockfancontroller_OFF; ++ ++ ecram_write(ecram, model->registers->EXT_LOCKFANCONTROLLER, val); ++ return 0; ++} ++ ++int read_lockfancontroller(struct ecram *ecram, ++ const struct model_config *model, bool *state) ++{ ++ int value = ecram_read(ecram, model->registers->EXT_LOCKFANCONTROLLER); ++ ++ switch (value) { ++ case lockfancontroller_ON: ++ *state = true; ++ break; ++ case lockfancontroller_OFF: ++ *state = false; ++ break; ++ default: ++ pr_info("Unexpected value in lockfanspeed register:%d\n", ++ value); ++ return -1; ++ } ++ return 0; ++} ++ ++#define MAXIMUMFANSPEED_ON 0x40 ++#define MAXIMUMFANSPEED_OFF 0x00 ++ ++int read_maximumfanspeed(struct ecram *ecram, const struct model_config *model, ++ bool *state) ++{ ++ int value = ecram_read(ecram, model->registers->EXT_MAXIMUMFANSPEED); ++ ++ switch (value) { ++ case MAXIMUMFANSPEED_ON: ++ *state = true; ++ break; ++ case MAXIMUMFANSPEED_OFF: ++ *state = false; ++ break; ++ default: ++ pr_info("Unexpected value in maximumfanspeed register:%d\n", ++ value); ++ return -1; ++ } ++ return 0; ++} ++ ++ssize_t write_maximumfanspeed(struct ecram *ecram, ++ const struct model_config *model, bool state) ++{ ++ u8 val = state ? MAXIMUMFANSPEED_ON : MAXIMUMFANSPEED_OFF; ++ ++ ecram_write(ecram, model->registers->EXT_MAXIMUMFANSPEED, val); ++ return 0; ++} ++ ++#define MINIFANCUVE_ON_COOL_ON 0x04 ++#define MINIFANCUVE_ON_COOL_OFF 0xA0 ++ ++int read_minifancurve(struct ecram *ecram, const struct model_config *model, ++ bool *state) ++{ ++ int value = ++ ecram_read(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL); ++ ++ switch (value) { ++ case MINIFANCUVE_ON_COOL_ON: ++ *state = true; ++ break; ++ case MINIFANCUVE_ON_COOL_OFF: ++ *state = false; ++ break; ++ default: ++ pr_info("Unexpected value in MINIFANCURVE register:%d\n", ++ value); ++ return -1; ++ } ++ return 0; ++} ++ ++ssize_t write_minifancurve(struct ecram *ecram, ++ const struct model_config *model, bool state) ++{ ++ u8 val = state ? MINIFANCUVE_ON_COOL_ON : MINIFANCUVE_ON_COOL_OFF; ++ ++ ecram_write(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL, val); ++ return 0; ++} ++ ++#define KEYBOARD_BACKLIGHT_OFF 18 ++#define KEYBOARD_BACKLIGHT_ON1 21 ++#define KEYBOARD_BACKLIGHT_ON2 23 ++ ++int read_keyboard_backlight(struct ecram *ecram, ++ const struct model_config *model, int *state) ++{ ++ int value = ecram_read(ecram, ++ model->registers->EXT_WHITE_KEYBOARD_BACKLIGHT); ++ ++ //switch (value) { ++ //case MINIFANCUVE_ON_COOL_ON: ++ // *state = true; ++ // break; ++ //case MINIFANCUVE_ON_COOL_OFF: ++ // *state = false; ++ // break; ++ //default: ++ // pr_info("Unexpected value in MINIFANCURVE register:%d\n", ++ // value); ++ // return -1; ++ //} ++ *state = value; ++ return 0; ++} ++ ++int write_keyboard_backlight(struct ecram *ecram, ++ const struct model_config *model, int state) ++{ ++ u8 val = state > 0 ? KEYBOARD_BACKLIGHT_ON1 : KEYBOARD_BACKLIGHT_OFF; ++ ++ ecram_write(ecram, model->registers->EXT_WHITE_KEYBOARD_BACKLIGHT, val); ++ return 0; ++} ++ ++#define FCT_RAPID_CHARGE_ON 0x07 ++#define FCT_RAPID_CHARGE_OFF 0x08 ++#define RAPID_CHARGE_ON 0x0 ++#define RAPID_CHARGE_OFF 0x1 ++ ++int read_rapidcharge(acpi_handle acpihandle, int *state) ++{ ++ unsigned long result; ++ int err; ++ ++ err = eval_qcho(acpihandle, &result); ++ if (err) ++ return err; ++ ++ *state = result; ++ return 0; ++} ++ ++int write_rapidcharge(acpi_handle acpihandle, bool state) ++{ ++ unsigned long fct_nr = state > 0 ? 
FCT_RAPID_CHARGE_ON : ++ FCT_RAPID_CHARGE_OFF; ++ return exec_sbmc(acpihandle, fct_nr); ++} ++ ++/* ============================= */ ++/* Data model for fan curve */ ++/* ============================ */ ++ ++struct fancurve_point { ++ // rpm1 devided by 100 ++ u8 rpm1_raw; ++ // rpm2 devided by 100 ++ u8 rpm2_raw; ++ // >=2 , <=5 (lower is faster); must be increasing by level ++ u8 accel; ++ // >=2 , <=5 (lower is faster); must be increasing by level ++ u8 decel; ++ ++ // min must be lower or equal than max ++ // last level max must be 127 ++ // <=127 cpu max temp for this level; must be increasing by level ++ u8 cpu_max_temp_celsius; ++ // <=127 cpu min temp for this level; must be increasing by level ++ u8 cpu_min_temp_celsius; ++ // <=127 gpu min temp for this level; must be increasing by level ++ u8 gpu_max_temp_celsius; ++ // <=127 gpu max temp for this level; must be increasing by level ++ u8 gpu_min_temp_celsius; ++ // <=127 ic max temp for this level; must be increasing by level ++ u8 ic_max_temp_celsius; ++ // <=127 ic max temp for this level; must be increasing by level ++ u8 ic_min_temp_celsius; ++}; ++ ++enum FANCURVE_ATTR { ++ FANCURVE_ATTR_PWM1 = 1, ++ FANCURVE_ATTR_PWM2 = 2, ++ FANCURVE_ATTR_CPU_TEMP = 3, ++ FANCURVE_ATTR_CPU_HYST = 4, ++ FANCURVE_ATTR_GPU_TEMP = 5, ++ FANCURVE_ATTR_GPU_HYST = 6, ++ FANCURVE_ATTR_IC_TEMP = 7, ++ FANCURVE_ATTR_IC_HYST = 8, ++ FANCURVE_ATTR_ACCEL = 9, ++ FANCURVE_ATTR_DECEL = 10, ++ FANCURVE_SIZE = 11, ++ FANCURVE_MINIFANCURVE_ON_COOL = 12 ++}; ++ ++// used for clearing table entries ++static const struct fancurve_point fancurve_point_zero = { 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0 }; ++ ++struct fancurve { ++ struct fancurve_point points[MAXFANCURVESIZE]; ++ // number of points used; must be <= MAXFANCURVESIZE ++ size_t size; ++ // the point that at which fans are run currently ++ size_t current_point_i; ++}; ++ ++// calculate derived values ++ ++int fancurve_get_cpu_deltahyst(struct fancurve_point *point) ++{ ++ return ((int)point->cpu_max_temp_celsius) - ++ ((int)point->cpu_min_temp_celsius); ++} ++ ++int fancurve_get_gpu_deltahyst(struct fancurve_point *point) ++{ ++ return ((int)point->gpu_max_temp_celsius) - ++ ((int)point->gpu_min_temp_celsius); ++} ++ ++int fancurve_get_ic_deltahyst(struct fancurve_point *point) ++{ ++ return ((int)point->ic_max_temp_celsius) - ++ ((int)point->ic_min_temp_celsius); ++} ++ ++// validation functions ++ ++bool fancurve_is_valid_min_temp(int min_temp) ++{ ++ return min_temp >= 0 && min_temp <= 127; ++} ++ ++bool fancurve_is_valid_max_temp(int max_temp) ++{ ++ return max_temp >= 0 && max_temp <= 127; ++} ++ ++// setters with validation ++// - make hwmon implementation easier ++// - keep fancurve valid, otherwise EC will not properly control fan ++ ++bool fancurve_set_rpm1(struct fancurve *fancurve, int point_id, int rpm) ++{ ++ bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500); ++ ++ if (valid) ++ fancurve->points[point_id].rpm1_raw = rpm / 100; ++ return valid; ++} ++ ++bool fancurve_set_rpm2(struct fancurve *fancurve, int point_id, int rpm) ++{ ++ bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500); ++ ++ if (valid) ++ fancurve->points[point_id].rpm2_raw = rpm / 100; ++ return valid; ++} ++ ++// TODO: remove { ... 
} from single line if body ++ ++bool fancurve_set_accel(struct fancurve *fancurve, int point_id, int accel) ++{ ++ bool valid = accel >= 2 && accel <= 5; ++ ++ if (valid) ++ fancurve->points[point_id].accel = accel; ++ return valid; ++} ++ ++bool fancurve_set_decel(struct fancurve *fancurve, int point_id, int decel) ++{ ++ bool valid = decel >= 2 && decel <= 5; ++ ++ if (valid) ++ fancurve->points[point_id].decel = decel; ++ return valid; ++} ++ ++bool fancurve_set_cpu_temp_max(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].cpu_max_temp_celsius = value; ++ ++ return valid; ++} ++ ++bool fancurve_set_gpu_temp_max(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].gpu_max_temp_celsius = value; ++ return valid; ++} ++ ++bool fancurve_set_ic_temp_max(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].ic_max_temp_celsius = value; ++ return valid; ++} ++ ++bool fancurve_set_cpu_temp_min(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].cpu_min_temp_celsius = value; ++ return valid; ++} ++ ++bool fancurve_set_gpu_temp_min(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].gpu_min_temp_celsius = value; ++ return valid; ++} ++ ++bool fancurve_set_ic_temp_min(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].ic_min_temp_celsius = value; ++ return valid; ++} ++ ++bool fancurve_set_size(struct fancurve *fancurve, int size, bool init_values) ++{ ++ bool valid = size >= 1 && size <= MAXFANCURVESIZE; ++ ++ if (!valid) ++ return false; ++ if (init_values && size < fancurve->size) { ++ // fancurve size is decreased, but last etnry alwasy needs 127 temperatures ++ // Note: size >=1 ++ fancurve->points[size - 1].cpu_max_temp_celsius = 127; ++ fancurve->points[size - 1].ic_max_temp_celsius = 127; ++ fancurve->points[size - 1].gpu_max_temp_celsius = 127; ++ } ++ if (init_values && size > fancurve->size) { ++ // fancurve increased, so new entries need valid values ++ int i; ++ int last = fancurve->size > 0 ? fancurve->size - 1 : 0; ++ ++ for (i = fancurve->size; i < size; ++i) ++ fancurve->points[i] = fancurve->points[last]; ++ } ++ return true; ++} ++ ++/* Read the fan curve from the EC. ++ * ++ * In newer models (>=2022) there is an ACPI/WMI to read fan curve as ++ * a whole. So read/write fan table as a whole to use ++ * same interface for both cases. ++ * ++ * It reads all points from EC memory, even if stored fancurve is smaller, so ++ * it can contain 0 entries. 
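++ *
++ * The size reported by the EC is clamped to MAXFANCURVESIZE, and the
++ * current point index to that size, so a bogus value from the
++ * hardware cannot be used to index past the points array.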
++ */ ++static int read_fancurve(struct ecram *ecram, const struct model_config *model, ++ struct fancurve *fancurve) ++{ ++ size_t i = 0; ++ ++ for (i = 0; i < MAXFANCURVESIZE; ++i) { ++ struct fancurve_point *point = &fancurve->points[i]; ++ ++ point->rpm1_raw = ++ ecram_read(ecram, model->registers->EXT_FAN1_BASE + i); ++ point->rpm2_raw = ++ ecram_read(ecram, model->registers->EXT_FAN2_BASE + i); ++ ++ point->accel = ecram_read( ++ ecram, model->registers->EXT_FAN_ACC_BASE + i); ++ point->decel = ecram_read( ++ ecram, model->registers->EXT_FAN_DEC_BASE + i); ++ point->cpu_max_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_CPU_TEMP + i); ++ point->cpu_min_temp_celsius = ecram_read( ++ ecram, model->registers->EXT_CPU_TEMP_HYST + i); ++ point->gpu_max_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_GPU_TEMP + i); ++ point->gpu_min_temp_celsius = ecram_read( ++ ecram, model->registers->EXT_GPU_TEMP_HYST + i); ++ point->ic_max_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_VRM_TEMP + i); ++ point->ic_min_temp_celsius = ecram_read( ++ ecram, model->registers->EXT_VRM_TEMP_HYST + i); ++ } ++ ++ // Do not trust that hardware; It might suddendly report ++ // a larger size, so clamp it. ++ fancurve->size = ++ ecram_read(ecram, model->registers->EXT_FAN_POINTS_SIZE); ++ fancurve->size = ++ min(fancurve->size, (typeof(fancurve->size))(MAXFANCURVESIZE)); ++ fancurve->current_point_i = ++ ecram_read(ecram, model->registers->EXT_FAN_CUR_POINT); ++ fancurve->current_point_i = ++ min(fancurve->current_point_i, fancurve->size); ++ return 0; ++} ++ ++static int write_fancurve(struct ecram *ecram, const struct model_config *model, ++ const struct fancurve *fancurve, bool write_size) ++{ ++ size_t i; ++ // Reset fan update counters (try to avoid any race conditions) ++ ecram_write(ecram, 0xC5FE, 0); ++ ecram_write(ecram, 0xC5FF, 0); ++ for (i = 0; i < MAXFANCURVESIZE; ++i) { ++ // Entries for points larger than fancurve size should be cleared ++ // to 0 ++ const struct fancurve_point *point = ++ i < fancurve->size ? 
&fancurve->points[i] : ++ &fancurve_point_zero; ++ ++ ecram_write(ecram, model->registers->EXT_FAN1_BASE + i, ++ point->rpm1_raw); ++ ecram_write(ecram, model->registers->EXT_FAN2_BASE + i, ++ point->rpm2_raw); ++ ++ ecram_write(ecram, model->registers->EXT_FAN_ACC_BASE + i, ++ point->accel); ++ ecram_write(ecram, model->registers->EXT_FAN_DEC_BASE + i, ++ point->decel); ++ ++ ecram_write(ecram, model->registers->EXT_CPU_TEMP + i, ++ point->cpu_max_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_CPU_TEMP_HYST + i, ++ point->cpu_min_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_GPU_TEMP + i, ++ point->gpu_max_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_GPU_TEMP_HYST + i, ++ point->gpu_min_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_VRM_TEMP + i, ++ point->ic_max_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_VRM_TEMP_HYST + i, ++ point->ic_min_temp_celsius); ++ } ++ ++ if (write_size) { ++ ecram_write(ecram, model->registers->EXT_FAN_POINTS_SIZE, ++ fancurve->size); ++ } ++ ++ // Reset current fan level to 0, so algorithm in EC ++ // selects fan curve point again and resetting hysterisis ++ // effects ++ ecram_write(ecram, model->registers->EXT_FAN_CUR_POINT, 0); ++ ++ // Reset internal fan levels ++ ecram_write(ecram, 0xC634, 0); // CPU ++ ecram_write(ecram, 0xC635, 0); // GPU ++ ecram_write(ecram, 0xC636, 0); // SENSOR ++ ++ return 0; ++} ++ ++static ssize_t fancurve_print_seqfile(const struct fancurve *fancurve, ++ struct seq_file *s) ++{ ++ int i; ++ ++ seq_printf( ++ s, ++ "rpm1|rpm2|acceleration|deceleration|cpu_min_temp|cpu_max_temp|gpu_min_temp|gpu_max_temp|ic_min_temp|ic_max_temp\n"); ++ for (i = 0; i < fancurve->size; ++i) { ++ const struct fancurve_point *point = &fancurve->points[i]; ++ ++ seq_printf( ++ s, "%d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\n", ++ point->rpm1_raw * 100, point->rpm2_raw * 100, ++ point->accel, point->decel, point->cpu_min_temp_celsius, ++ point->cpu_max_temp_celsius, ++ point->gpu_min_temp_celsius, ++ point->gpu_max_temp_celsius, point->ic_min_temp_celsius, ++ point->ic_max_temp_celsius); ++ } ++ return 0; ++} ++ ++/* ============================= */ ++/* Global and shared data between */ ++/* all calls to this module */ ++/* ============================ */ ++// Implemented like ideapad-laptop.c but currenlty still ++// wihtout dynamic memory allocation (instaed global _priv) ++ ++struct legion_private { ++ struct platform_device *platform_device; ++ // TODO: remove or keep? init? ++ // struct acpi_device *adev; ++ ++ // Method to access ECRAM ++ struct ecram ecram; ++ // Configuration with registers an ECRAM access method ++ const struct model_config *conf; ++ ++ // TODO: maybe refactor an keep only local to each function ++ // last known fan curve ++ struct fancurve fancurve; ++ // configured fan curve from user space ++ struct fancurve fancurve_configured; ++ ++ // update lock, when partial values of fancurve are changed ++ struct mutex fancurve_mutex; ++ ++ //interfaces ++ struct dentry *debugfs_dir; ++ struct device *hwmon_dev; ++ struct platform_profile_handler platform_profile_handler; ++ ++ // TODO: remove? 
++ bool loaded; ++}; ++ ++// shared between different drivers: WMI, platform and proteced by mutex ++static struct legion_private *legion_shared; ++static struct legion_private _priv; ++static DEFINE_MUTEX(legion_shared_mutex); ++ ++static int legion_shared_init(struct legion_private *priv) ++{ ++ int ret; ++ ++ mutex_lock(&legion_shared_mutex); ++ ++ if (!legion_shared) { ++ legion_shared = priv; ++ mutex_init(&legion_shared->fancurve_mutex); ++ ret = 0; ++ } else { ++ pr_warn("Found multiple platform devices\n"); ++ ret = -EINVAL; ++ } ++ ++ priv->loaded = true; ++ mutex_unlock(&legion_shared_mutex); ++ ++ return ret; ++} ++ ++static void legion_shared_exit(struct legion_private *priv) ++{ ++ pr_info("Unloading legion shared\n"); ++ mutex_lock(&legion_shared_mutex); ++ ++ if (legion_shared == priv) ++ legion_shared = NULL; ++ ++ mutex_unlock(&legion_shared_mutex); ++ pr_info("Unloading legion shared done\n"); ++} ++ ++/* ============================= */ ++/* debugfs interface */ ++/* ============================ */ ++ ++static int debugfs_ecmemory_show(struct seq_file *s, void *unused) ++{ ++ struct legion_private *priv = s->private; ++ size_t offset; ++ ++ for (offset = 0; offset < priv->conf->memoryio_size; ++offset) { ++ char value = ecram_read(&priv->ecram, ++ priv->conf->memoryio_physical_ec_start + ++ offset); ++ ++ seq_write(s, &value, 1); ++ } ++ return 0; ++} ++ ++DEFINE_SHOW_ATTRIBUTE(debugfs_ecmemory); ++ ++static int debugfs_fancurve_show(struct seq_file *s, void *unused) ++{ ++ struct legion_private *priv = s->private; ++ bool is_minifancurve; ++ bool is_lockfancontroller; ++ bool is_maximumfanspeed; ++ int err; ++ ++ seq_printf(s, "EC Chip ID: %x\n", read_ec_id(&priv->ecram, priv->conf)); ++ seq_printf(s, "EC Chip Version: %x\n", ++ read_ec_version(&priv->ecram, priv->conf)); ++ seq_printf(s, "legion_laptop features: %s\n", LEGIONFEATURES); ++ seq_printf(s, "legion_laptop ec_readonly: %d\n", ec_readonly); ++ read_fancurve(&priv->ecram, priv->conf, &priv->fancurve); ++ ++ seq_printf(s, "minifancurve feature enabled: %d\n", ++ priv->conf->has_minifancurve); ++ err = read_minifancurve(&priv->ecram, priv->conf, &is_minifancurve); ++ seq_printf(s, "minifancurve on cool: %s\n", ++ err ? "error" : (is_minifancurve ? "true" : "false")); ++ err = read_lockfancontroller(&priv->ecram, priv->conf, ++ &is_lockfancontroller); ++ seq_printf(s, "lock fan controller: %s\n", ++ err ? "error" : (is_lockfancontroller ? "true" : "false")); ++ err = read_maximumfanspeed(&priv->ecram, priv->conf, ++ &is_maximumfanspeed); ++ seq_printf(s, "enable maximumfanspeed: %s\n", ++ err ? "error" : (is_maximumfanspeed ? 
"true" : "false")); ++ seq_printf(s, "enable maximumfanspeed status: %d\n", err); ++ ++ seq_printf(s, "fan curve current point id: %ld\n", ++ priv->fancurve.current_point_i); ++ seq_printf(s, "fan curve points size: %ld\n", priv->fancurve.size); ++ ++ seq_puts(s, "Current fan curve in hardware (embedded controller):\n"); ++ fancurve_print_seqfile(&priv->fancurve, s); ++ seq_puts(s, "=====================\n"); ++ return 0; ++} ++ ++DEFINE_SHOW_ATTRIBUTE(debugfs_fancurve); ++ ++static void legion_debugfs_init(struct legion_private *priv) ++{ ++ struct dentry *dir; ++ ++ // TODO: remove this note ++ // Note: as other kernel modules, do not catch errors here ++ // because if kernel is build without debugfs this ++ // will return an error but module still has to ++ // work, just without debugfs ++ // TODO: what permissions; some modules do 400 ++ // other do 444 ++ dir = debugfs_create_dir(LEGION_DRVR_SHORTNAME, NULL); ++ debugfs_create_file("fancurve", 0444, dir, priv, ++ &debugfs_fancurve_fops); ++ debugfs_create_file("ecmemory", 0444, dir, priv, ++ &debugfs_ecmemory_fops); ++ ++ priv->debugfs_dir = dir; ++} ++ ++static void legion_debugfs_exit(struct legion_private *priv) ++{ ++ pr_info("Unloading legion dubugfs\n"); ++ // The following is does nothing if pointer is NULL ++ debugfs_remove_recursive(priv->debugfs_dir); ++ priv->debugfs_dir = NULL; ++ pr_info("Unloading legion dubugfs done\n"); ++} ++ ++/* ============================= */ ++/* sysfs interface */ ++/* ============================ */ ++ ++static ssize_t powermode_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int power_mode = read_powermode(&priv->ecram, priv->conf); ++ ++ return sysfs_emit(buf, "%d\n", power_mode); ++} ++ ++static ssize_t powermode_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, ++ size_t count) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int powermode; ++ int err; ++ ++ err = kstrtouint(buf, 0, &powermode); ++ if (err) ++ return err; ++ ++ err = write_powermode(&priv->ecram, priv->conf, powermode); ++ if (err) ++ return -EINVAL; ++ ++ // TODO: better? 
++ // we have to wait a bit before change is done in hardware and ++ // readback done after notifying returns correct value, otherwise ++ // the notified reader will read old value ++ msleep(500); ++ platform_profile_notify(); ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(powermode); ++ ++static ssize_t lockfancontroller_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ bool is_lockfancontroller; ++ int err; ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = read_lockfancontroller(&priv->ecram, priv->conf, ++ &is_lockfancontroller); ++ mutex_unlock(&priv->fancurve_mutex); ++ if (err) ++ return -EINVAL; ++ ++ return sysfs_emit(buf, "%d\n", is_lockfancontroller); ++} ++ ++static ssize_t lockfancontroller_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ bool is_lockfancontroller; ++ int err; ++ ++ err = kstrtobool(buf, &is_lockfancontroller); ++ if (err) ++ return err; ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = write_lockfancontroller(&priv->ecram, priv->conf, ++ is_lockfancontroller); ++ mutex_unlock(&priv->fancurve_mutex); ++ if (err) ++ return -EINVAL; ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(lockfancontroller); ++ ++static ssize_t keyboard_backlight_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int state; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ read_keyboard_backlight(&priv->ecram, priv->conf, &state); ++ return sysfs_emit(buf, "%d\n", state); ++} ++ ++static ssize_t keyboard_backlight_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int state; ++ int err; ++ ++ err = kstrtouint(buf, 0, &state); ++ if (err) ++ return err; ++ ++ err = write_keyboard_backlight(&priv->ecram, priv->conf, state); ++ if (err) ++ return -EINVAL; ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(keyboard_backlight); ++ ++static struct attribute *legion_sysfs_attributes[] = { ++ &dev_attr_powermode.attr, &dev_attr_lockfancontroller.attr, ++ &dev_attr_keyboard_backlight.attr, NULL ++}; ++ ++static const struct attribute_group legion_attribute_group = { ++ .attrs = legion_sysfs_attributes ++}; ++ ++static int legion_sysfs_init(struct legion_private *priv) ++{ ++ return device_add_group(&priv->platform_device->dev, ++ &legion_attribute_group); ++} ++ ++static void legion_sysfs_exit(struct legion_private *priv) ++{ ++ pr_info("Unloading legion sysfs\n"); ++ device_remove_group(&priv->platform_device->dev, ++ &legion_attribute_group); ++ pr_info("Unloading legion sysfs done\n"); ++} ++ ++/* ============================= */ ++/* WMI + ACPI */ ++/* ============================ */ ++// heavily based on ideapad_laptop.c ++ ++// TODO: proper names if meaning of all events is clear ++enum LEGION_WMI_EVENT { ++ LEGION_WMI_EVENT_GAMEZONE = 1, ++ LEGION_EVENT_A, ++ LEGION_EVENT_B, ++ LEGION_EVENT_C, ++ LEGION_EVENT_D, ++ LEGION_EVENT_E, ++ LEGION_EVENT_F, ++ LEGION_EVENT_G ++}; ++ ++struct legion_wmi_private { ++ enum LEGION_WMI_EVENT event; ++}; ++ ++//static void legion_wmi_notify2(u32 value, void *context) ++// { ++// pr_info("WMI notify\n" ); ++// } ++ ++static void legion_wmi_notify(struct wmi_device *wdev, union acpi_object *data) ++{ ++ struct legion_wmi_private *wpriv; ++ struct legion_private *priv; ++ ++ mutex_lock(&legion_shared_mutex); ++ priv = 
legion_shared; ++ if ((!priv) && (priv->loaded)) { ++ pr_info("Received WMI event while not initialized!\n"); ++ goto unlock; ++ } ++ ++ wpriv = dev_get_drvdata(&wdev->dev); ++ switch (wpriv->event) { ++ case LEGION_EVENT_A: ++ pr_info("Fan event: legion type: %d; acpi type: %d (%d=integer)", ++ wpriv->event, data->type, ACPI_TYPE_INTEGER); ++ // TODO: here it is too early (first unlock mutext, then wait a bit) ++ //platform_profile_notify(); ++ break; ++ default: ++ pr_info("Event: legion type: %d; acpi type: %d (%d=integer)", ++ wpriv->event, data->type, ACPI_TYPE_INTEGER); ++ break; ++ } ++ ++unlock: ++ mutex_unlock(&legion_shared_mutex); ++ // todo; fix that! ++ // problem: we get a event just before the powermode change (from the key?), ++ // so if we notify to early, it will read the old power mode/platform profile ++ msleep(500); ++ platform_profile_notify(); ++} ++ ++static int legion_wmi_probe(struct wmi_device *wdev, const void *context) ++{ ++ struct legion_wmi_private *wpriv; ++ ++ wpriv = devm_kzalloc(&wdev->dev, sizeof(*wpriv), GFP_KERNEL); ++ if (!wpriv) ++ return -ENOMEM; ++ ++ *wpriv = *(const struct legion_wmi_private *)context; ++ ++ dev_set_drvdata(&wdev->dev, wpriv); ++ dev_info(&wdev->dev, "Register after probing for WMI.\n"); ++ return 0; ++} ++ ++static const struct legion_wmi_private legion_wmi_context_gamezone = { ++ .event = LEGION_WMI_EVENT_GAMEZONE ++}; ++static const struct legion_wmi_private legion_wmi_context_a = { ++ .event = LEGION_EVENT_A ++}; ++static const struct legion_wmi_private legion_wmi_context_b = { ++ .event = LEGION_EVENT_B ++}; ++static const struct legion_wmi_private legion_wmi_context_c = { ++ .event = LEGION_EVENT_C ++}; ++static const struct legion_wmi_private legion_wmi_context_d = { ++ .event = LEGION_EVENT_D ++}; ++static const struct legion_wmi_private legion_wmi_context_e = { ++ .event = LEGION_EVENT_E ++}; ++static const struct legion_wmi_private legion_wmi_context_f = { ++ .event = LEGION_EVENT_F ++}; ++ ++// check if really a method ++#define LEGION_WMI_GAMEZONE_GUID "887B54E3-DDDC-4B2C-8B88-68A26A8835D0" ++ ++#define LEGION_WMI_GUID_FAN_EVENT "D320289E-8FEA-41E0-86F9-611D83151B5F" ++#define LEGION_WMI_GUID_FAN2_EVENT "bc72a435-e8c1-4275-b3e2-d8b8074aba59" ++#define LEGION_WMI_GUID_GAMEZONE_KEY_EVENT \ ++ "10afc6d9-ea8b-4590-a2e7-1cd3c84bb4b1" ++#define LEGION_WMI_GUID_GAMEZONE_GPU_EVENT \ ++ "bfd42481-aee3-4502-a107-afb68425c5f8" ++#define LEGION_WMI_GUID_GAMEZONE_OC_EVENT "d062906b-12d4-4510-999d-4831ee80e985" ++#define LEGION_WMI_GUID_GAMEZONE_TEMP_EVENT \ ++ "bfd42481-aee3-4501-a107-afb68425c5f8" ++//#define LEGION_WMI_GUID_GAMEZONE_DATA_EVENT "887b54e3-dddc-4b2c-8b88-68a26a8835d0" ++ ++static const struct wmi_device_id legion_wmi_ids[] = { ++ { LEGION_WMI_GAMEZONE_GUID, &legion_wmi_context_gamezone }, ++ { LEGION_WMI_GUID_FAN_EVENT, &legion_wmi_context_a }, ++ { LEGION_WMI_GUID_FAN2_EVENT, &legion_wmi_context_b }, ++ { LEGION_WMI_GUID_GAMEZONE_KEY_EVENT, &legion_wmi_context_c }, ++ { LEGION_WMI_GUID_GAMEZONE_GPU_EVENT, &legion_wmi_context_d }, ++ { LEGION_WMI_GUID_GAMEZONE_OC_EVENT, &legion_wmi_context_e }, ++ { LEGION_WMI_GUID_GAMEZONE_TEMP_EVENT, &legion_wmi_context_f }, ++ { "8FC0DE0C-B4E4-43FD-B0F3-8871711C1294", ++ &legion_wmi_context_gamezone }, /* Legion 5 */ ++ {}, ++}; ++MODULE_DEVICE_TABLE(wmi, legion_wmi_ids); ++ ++static struct wmi_driver legion_wmi_driver = { ++ .driver = { ++ .name = "legion_wmi", ++ }, ++ .id_table = legion_wmi_ids, ++ .probe = legion_wmi_probe, ++ .notify = legion_wmi_notify, ++}; ++ 
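++/*
++ * Each GUID in legion_wmi_ids above is bound to a legion_wmi_private
++ * context so that legion_wmi_notify() can tell the event sources
++ * apart; the handler currently only logs the event and then, after a
++ * short delay, notifies the platform profile subsystem.
++ */
++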
++//acpi_status status = wmi_install_notify_handler(LEGION_WMI_GAMEZONE_GUID, ++// legion_wmi_notify2, NULL); ++//if (ACPI_FAILURE(status)) { ++// return -ENODEV; ++//} ++//return 0; ++ ++static int legion_wmi_init(void) ++{ ++ return wmi_driver_register(&legion_wmi_driver); ++} ++ ++static void legion_wmi_exit(void) ++{ ++ // TODO: remove this ++ pr_info("Unloading legion WMI\n"); ++ ++ //wmi_remove_notify_handler(LEGION_WMI_GAMEZONE_GUID); ++ wmi_driver_unregister(&legion_wmi_driver); ++ pr_info("Unloading legion WMI done\n"); ++} ++ ++/* ============================= */ ++/* Platform profile */ ++/* ============================ */ ++ ++enum LEGION_POWERMODE { ++ LEGION_POWERMODE_BALANCED = 0, ++ LEGION_POWERMODE_PERFORMANCE = 1, ++ LEGION_POWERMODE_QUIET = 2, ++}; ++ ++static int legion_platform_profile_get(struct platform_profile_handler *pprof, ++ enum platform_profile_option *profile) ++{ ++ int powermode; ++ struct legion_private *priv; ++ ++ priv = container_of(pprof, struct legion_private, ++ platform_profile_handler); ++ powermode = read_powermode(&priv->ecram, priv->conf); ++ ++ switch (powermode) { ++ case LEGION_POWERMODE_BALANCED: ++ *profile = PLATFORM_PROFILE_BALANCED; ++ break; ++ case LEGION_POWERMODE_PERFORMANCE: ++ *profile = PLATFORM_PROFILE_PERFORMANCE; ++ break; ++ case LEGION_POWERMODE_QUIET: ++ *profile = PLATFORM_PROFILE_QUIET; ++ break; ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++static int legion_platform_profile_set(struct platform_profile_handler *pprof, ++ enum platform_profile_option profile) ++{ ++ int powermode; ++ struct legion_private *priv; ++ ++ priv = container_of(pprof, struct legion_private, ++ platform_profile_handler); ++ ++ switch (profile) { ++ case PLATFORM_PROFILE_BALANCED: ++ powermode = LEGION_POWERMODE_BALANCED; ++ break; ++ case PLATFORM_PROFILE_PERFORMANCE: ++ powermode = LEGION_POWERMODE_PERFORMANCE; ++ break; ++ case PLATFORM_PROFILE_QUIET: ++ powermode = LEGION_POWERMODE_QUIET; ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return write_powermode(&priv->ecram, priv->conf, powermode); ++} ++ ++static int legion_platform_profile_init(struct legion_private *priv) ++{ ++ int err; ++ ++ priv->platform_profile_handler.profile_get = ++ legion_platform_profile_get; ++ priv->platform_profile_handler.profile_set = ++ legion_platform_profile_set; ++ ++ set_bit(PLATFORM_PROFILE_QUIET, priv->platform_profile_handler.choices); ++ set_bit(PLATFORM_PROFILE_BALANCED, ++ priv->platform_profile_handler.choices); ++ set_bit(PLATFORM_PROFILE_PERFORMANCE, ++ priv->platform_profile_handler.choices); ++ ++ err = platform_profile_register(&priv->platform_profile_handler); ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++static void legion_platform_profile_exit(struct legion_private *priv) ++{ ++ pr_info("Unloading legion platform profile\n"); ++ platform_profile_remove(); ++ pr_info("Unloading legion platform profile done\n"); ++} ++ ++/* ============================= */ ++/* hwom interface */ ++/* ============================ */ ++ ++// hw-mon interface ++ ++// todo: register_group or register_info? ++ ++// TODO: use one common function (like here) or one function per attribute? 
++static ssize_t sensor_label_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int sensor_id = (to_sensor_dev_attr(attr))->index; ++ const char *label; ++ ++ switch (sensor_id) { ++ case SENSOR_CPU_TEMP_ID: ++ label = "CPU Temperature\n"; ++ break; ++ case SENSOR_GPU_TEMP_ID: ++ label = "GPU Temperature\n"; ++ break; ++ case SENSOR_IC_TEMP_ID: ++ label = "IC Temperature\n"; ++ break; ++ case SENSOR_FAN1_RPM_ID: ++ label = "Fan 1\n"; ++ break; ++ case SENSOR_FAN2_RPM_ID: ++ label = "Fan 2\n"; ++ break; ++ case SENSOR_FAN1_TARGET_RPM_ID: ++ label = "Fan 1 Target\n"; ++ break; ++ case SENSOR_FAN2_TARGET_RPM_ID: ++ label = "Fan 2 Target\n"; ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return sprintf(buf, label); ++} ++ ++// TODO: use one common function (like here) or one function per attribute? ++static ssize_t sensor_show(struct device *dev, struct device_attribute *devattr, ++ char *buf) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int sensor_id = (to_sensor_dev_attr(devattr))->index; ++ struct sensor_values values; ++ int outval; ++ ++ read_sensor_values(&priv->ecram, priv->conf, &values); ++ ++ switch (sensor_id) { ++ case SENSOR_CPU_TEMP_ID: ++ outval = 1000 * values.cpu_temp_celsius; ++ break; ++ case SENSOR_GPU_TEMP_ID: ++ outval = 1000 * values.gpu_temp_celsius; ++ break; ++ case SENSOR_IC_TEMP_ID: ++ outval = 1000 * values.ic_temp_celsius; ++ break; ++ case SENSOR_FAN1_RPM_ID: ++ outval = values.fan1_rpm; ++ break; ++ case SENSOR_FAN2_RPM_ID: ++ outval = values.fan2_rpm; ++ break; ++ case SENSOR_FAN1_TARGET_RPM_ID: ++ outval = values.fan1_target_rpm; ++ break; ++ case SENSOR_FAN2_TARGET_RPM_ID: ++ outval = values.fan2_target_rpm; ++ break; ++ default: ++ pr_info("Error reading sensor value with id %d\n", sensor_id); ++ return -EOPNOTSUPP; ++ } ++ ++ return sprintf(buf, "%d\n", outval); ++} ++ ++static SENSOR_DEVICE_ATTR_RO(temp1_input, sensor, SENSOR_CPU_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(temp1_label, sensor_label, SENSOR_CPU_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(temp2_input, sensor, SENSOR_GPU_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(temp2_label, sensor_label, SENSOR_GPU_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(temp3_input, sensor, SENSOR_IC_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(temp3_label, sensor_label, SENSOR_IC_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(fan1_input, sensor, SENSOR_FAN1_RPM_ID); ++static SENSOR_DEVICE_ATTR_RO(fan1_label, sensor_label, SENSOR_FAN1_RPM_ID); ++static SENSOR_DEVICE_ATTR_RO(fan2_input, sensor, SENSOR_FAN2_RPM_ID); ++static SENSOR_DEVICE_ATTR_RO(fan2_label, sensor_label, SENSOR_FAN2_RPM_ID); ++static SENSOR_DEVICE_ATTR_RO(fan1_target, sensor, SENSOR_FAN1_TARGET_RPM_ID); ++static SENSOR_DEVICE_ATTR_RO(fan2_target, sensor, SENSOR_FAN2_TARGET_RPM_ID); ++ ++static struct attribute *sensor_hwmon_attributes[] = { ++ &sensor_dev_attr_temp1_input.dev_attr.attr, ++ &sensor_dev_attr_temp1_label.dev_attr.attr, ++ &sensor_dev_attr_temp2_input.dev_attr.attr, ++ &sensor_dev_attr_temp2_label.dev_attr.attr, ++ &sensor_dev_attr_temp3_input.dev_attr.attr, ++ &sensor_dev_attr_temp3_label.dev_attr.attr, ++ &sensor_dev_attr_fan1_input.dev_attr.attr, ++ &sensor_dev_attr_fan1_label.dev_attr.attr, ++ &sensor_dev_attr_fan2_input.dev_attr.attr, ++ &sensor_dev_attr_fan2_label.dev_attr.attr, ++ &sensor_dev_attr_fan1_target.dev_attr.attr, ++ &sensor_dev_attr_fan2_target.dev_attr.attr, ++ NULL ++}; ++ ++static ssize_t autopoint_show(struct device *dev, ++ struct device_attribute *devattr, char *buf) ++{ ++ struct 
fancurve fancurve; ++ int err; ++ int value; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int fancurve_attr_id = to_sensor_dev_attr_2(devattr)->nr; ++ int point_id = to_sensor_dev_attr_2(devattr)->index; ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = read_fancurve(&priv->ecram, priv->conf, &fancurve); ++ mutex_unlock(&priv->fancurve_mutex); ++ ++ if (err) { ++ pr_info("Reading fancurve failed\n"); ++ return -EOPNOTSUPP; ++ } ++ if (!(point_id >= 0 && point_id < MAXFANCURVESIZE)) { ++ pr_info("Reading fancurve failed due to wrong point id: %d\n", ++ point_id); ++ return -EOPNOTSUPP; ++ } ++ ++ switch (fancurve_attr_id) { ++ case FANCURVE_ATTR_PWM1: ++ value = fancurve.points[point_id].rpm1_raw * 100; ++ break; ++ case FANCURVE_ATTR_PWM2: ++ value = fancurve.points[point_id].rpm2_raw * 100; ++ break; ++ case FANCURVE_ATTR_CPU_TEMP: ++ value = fancurve.points[point_id].cpu_max_temp_celsius; ++ break; ++ case FANCURVE_ATTR_CPU_HYST: ++ value = fancurve.points[point_id].cpu_min_temp_celsius; ++ break; ++ case FANCURVE_ATTR_GPU_TEMP: ++ value = fancurve.points[point_id].gpu_max_temp_celsius; ++ break; ++ case FANCURVE_ATTR_GPU_HYST: ++ value = fancurve.points[point_id].gpu_min_temp_celsius; ++ break; ++ case FANCURVE_ATTR_IC_TEMP: ++ value = fancurve.points[point_id].ic_max_temp_celsius; ++ break; ++ case FANCURVE_ATTR_IC_HYST: ++ value = fancurve.points[point_id].ic_min_temp_celsius; ++ break; ++ case FANCURVE_ATTR_ACCEL: ++ value = fancurve.points[point_id].accel; ++ break; ++ case FANCURVE_ATTR_DECEL: ++ value = fancurve.points[point_id].decel; ++ break; ++ case FANCURVE_SIZE: ++ value = fancurve.size; ++ break; ++ default: ++ pr_info("Reading fancurve failed due to wrong attribute id: %d\n", ++ fancurve_attr_id); ++ return -EOPNOTSUPP; ++ } ++ ++ return sprintf(buf, "%d\n", value); ++} ++ ++static ssize_t autopoint_store(struct device *dev, ++ struct device_attribute *devattr, ++ const char *buf, size_t count) ++{ ++ struct fancurve fancurve; ++ int err; ++ int value; ++ bool valid; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int fancurve_attr_id = to_sensor_dev_attr_2(devattr)->nr; ++ int point_id = to_sensor_dev_attr_2(devattr)->index; ++ ++ if (!(point_id >= 0 && point_id < MAXFANCURVESIZE)) { ++ pr_info("Reading fancurve failed due to wrong point id: %d\n", ++ point_id); ++ err = -EOPNOTSUPP; ++ goto error; ++ } ++ ++ err = kstrtoint(buf, 0, &value); ++ if (err) { ++ pr_info("Parse for hwmon store is not succesful: error:%d; point_id: %d; fancurve_attr_id: %d\\n", ++ err, point_id, fancurve_attr_id); ++ goto error; ++ } ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = read_fancurve(&priv->ecram, priv->conf, &fancurve); ++ ++ if (err) { ++ pr_info("Reading fancurve failed\n"); ++ err = -EOPNOTSUPP; ++ goto error_mutex; ++ } ++ ++ switch (fancurve_attr_id) { ++ case FANCURVE_ATTR_PWM1: ++ valid = fancurve_set_rpm1(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_PWM2: ++ valid = fancurve_set_rpm2(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_CPU_TEMP: ++ valid = fancurve_set_cpu_temp_max(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_CPU_HYST: ++ valid = fancurve_set_cpu_temp_min(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_GPU_TEMP: ++ valid = fancurve_set_gpu_temp_max(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_GPU_HYST: ++ valid = fancurve_set_gpu_temp_min(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_IC_TEMP: ++ valid = fancurve_set_ic_temp_max(&fancurve, point_id, 
value); ++ break; ++ case FANCURVE_ATTR_IC_HYST: ++ valid = fancurve_set_ic_temp_min(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_ACCEL: ++ valid = fancurve_set_accel(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_DECEL: ++ valid = fancurve_set_decel(&fancurve, point_id, value); ++ break; ++ case FANCURVE_SIZE: ++ valid = fancurve_set_size(&fancurve, value, true); ++ break; ++ default: ++ pr_info("Writing fancurve failed due to wrong attribute id: %d\n", ++ fancurve_attr_id); ++ err = -EOPNOTSUPP; ++ goto error_mutex; ++ } ++ ++ if (!valid) { ++ pr_info("Ignoring invalid fancurve value %d for attribute %d at point %d\n", ++ value, fancurve_attr_id, point_id); ++ err = -EOPNOTSUPP; ++ goto error_mutex; ++ } ++ ++ err = write_fancurve(&priv->ecram, priv->conf, &fancurve, false); ++ if (err) { ++ pr_info("Writing fancurve failed for accessing hwmon at point_id: %d\n", ++ point_id); ++ err = -EOPNOTSUPP; ++ goto error_mutex; ++ } ++ ++ mutex_unlock(&priv->fancurve_mutex); ++ return count; ++ ++error_mutex: ++ mutex_unlock(&priv->fancurve_mutex); ++error: ++ return count; ++} ++ ++// rpm1 ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 9); ++// rpm2 ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 9); ++// CPU temp ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 4); ++static 
SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 9); ++// CPU temp hyst ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 9); ++// GPU temp ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 9); ++// GPU temp hyst ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_temp_hyst, autopoint, ++ 
FANCURVE_ATTR_GPU_HYST, 9); ++// IC temp ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point1_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point2_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point3_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point4_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point5_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point6_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point7_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point8_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point9_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point10_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 9); ++// IC temp hyst ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point1_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point2_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point3_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point4_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point5_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point6_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point7_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point8_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point9_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point10_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 9); ++// accel ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 9); ++// decel ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 4); ++static 
SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 9); ++//size ++static SENSOR_DEVICE_ATTR_2_RW(auto_points_size, autopoint, FANCURVE_SIZE, 0); ++ ++static ssize_t minifancurve_show(struct device *dev, ++ struct device_attribute *devattr, char *buf) ++{ ++ bool value; ++ int err; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = read_minifancurve(&priv->ecram, priv->conf, &value); ++ if (err) { ++ err = -1; ++ pr_info("Reading minifancurve not succesful\n"); ++ goto error_unlock; ++ } ++ mutex_unlock(&priv->fancurve_mutex); ++ return sprintf(buf, "%d\n", value); ++ ++error_unlock: ++ mutex_unlock(&priv->fancurve_mutex); ++ return -1; ++} ++ ++static ssize_t minifancurve_store(struct device *dev, ++ struct device_attribute *devattr, ++ const char *buf, size_t count) ++{ ++ int value; ++ int err; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ err = kstrtoint(buf, 0, &value); ++ if (err) { ++ err = -1; ++ pr_info("Parse for hwmon store is not succesful: error:%d\n", ++ err); ++ goto error; ++ } ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = write_minifancurve(&priv->ecram, priv->conf, value); ++ if (err) { ++ err = -1; ++ pr_info("Writing minifancurve not succesful\n"); ++ goto error_unlock; ++ } ++ mutex_unlock(&priv->fancurve_mutex); ++ return count; ++ ++error_unlock: ++ mutex_unlock(&priv->fancurve_mutex); ++error: ++ return err; ++} ++ ++static SENSOR_DEVICE_ATTR_RW(minifancurve, minifancurve, 0); ++ ++static ssize_t pwm1_mode_show(struct device *dev, ++ struct device_attribute *devattr, char *buf) ++{ ++ bool value; ++ int err; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = read_maximumfanspeed(&priv->ecram, priv->conf, &value); ++ if (err) { ++ err = -1; ++ pr_info("Reading pwm1_mode/maximumfanspeed not succesful\n"); ++ goto error_unlock; ++ } ++ mutex_unlock(&priv->fancurve_mutex); ++ return sprintf(buf, "%d\n", value ? 
0 : 2); ++ ++error_unlock: ++ mutex_unlock(&priv->fancurve_mutex); ++ return -1; ++} ++ ++static ssize_t pwm1_mode_store(struct device *dev, ++ struct device_attribute *devattr, ++ const char *buf, size_t count) ++{ ++ int value; ++ int is_maximumfanspeed; ++ int err; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ err = kstrtoint(buf, 0, &value); ++ if (err) { ++ err = -1; ++ pr_info("Parse for hwmon store is not succesful: error:%d\n", ++ err); ++ goto error; ++ } ++ is_maximumfanspeed = value == 0; ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = write_maximumfanspeed(&priv->ecram, priv->conf, ++ is_maximumfanspeed); ++ if (err) { ++ err = -1; ++ pr_info("Writing pwm1_mode/maximumfanspeed not succesful\n"); ++ goto error_unlock; ++ } ++ mutex_unlock(&priv->fancurve_mutex); ++ return count; ++ ++error_unlock: ++ mutex_unlock(&priv->fancurve_mutex); ++error: ++ return err; ++} ++ ++static SENSOR_DEVICE_ATTR_RW(pwm1_mode, pwm1_mode, 0); ++ ++static struct attribute *fancurve_hwmon_attributes[] = { ++ &sensor_dev_attr_pwm1_auto_point1_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point2_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point3_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point4_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point5_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point6_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point7_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point8_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point9_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point10_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point1_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point2_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point3_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point4_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point5_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point6_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point7_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point8_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point9_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point10_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point1_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point2_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point3_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point4_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point5_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point6_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point7_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point8_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point9_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point10_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point1_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point2_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point3_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point4_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point5_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point6_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point7_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point8_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point9_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point10_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point1_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point2_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point3_temp.dev_attr.attr, ++ 
&sensor_dev_attr_pwm2_auto_point4_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point5_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point6_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point7_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point8_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point9_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point10_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point1_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point2_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point3_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point4_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point5_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point6_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point7_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point8_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point9_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point10_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point1_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point2_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point3_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point4_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point5_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point6_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point7_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point8_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point9_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point10_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point1_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point2_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point3_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point4_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point5_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point6_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point7_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point8_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point9_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point10_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point1_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point2_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point3_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point4_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point5_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point6_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point7_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point8_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point9_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point10_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point1_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point2_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point3_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point4_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point5_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point6_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point7_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point8_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point9_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point10_decel.dev_attr.attr, ++ // ++ &sensor_dev_attr_auto_points_size.dev_attr.attr, ++ &sensor_dev_attr_minifancurve.dev_attr.attr, ++ &sensor_dev_attr_pwm1_mode.dev_attr.attr, NULL ++}; ++ ++static umode_t 
legion_is_visible(struct kobject *kobj, struct attribute *attr, ++ int idx) ++{ ++ bool supported = true; ++ struct device *dev = kobj_to_dev(kobj); ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ if (attr == &sensor_dev_attr_minifancurve.dev_attr.attr) ++ supported = priv->conf->has_minifancurve; ++ ++ return supported ? attr->mode : 0; ++} ++ ++static const struct attribute_group legion_hwmon_sensor_group = { ++ .attrs = sensor_hwmon_attributes, ++ .is_visible = NULL ++}; ++ ++static const struct attribute_group legion_hwmon_fancurve_group = { ++ .attrs = fancurve_hwmon_attributes, ++ .is_visible = legion_is_visible, ++}; ++ ++static const struct attribute_group *legion_hwmon_groups[] = { ++ &legion_hwmon_sensor_group, &legion_hwmon_fancurve_group, NULL ++}; ++ ++ssize_t legion_hwmon_init(struct legion_private *priv) ++{ ++ //TODO: use hwmon_device_register_with_groups or ++ // hwmon_device_register_with_info (latter means all hwmon functions have to be ++ // changed) ++ // some laptop driver do it in one way, some in the other ++ // TODO: Use devm_hwmon_device_register_with_groups ? ++ // some laptop drivers use this, some ++ struct device *hwmon_dev = hwmon_device_register_with_groups( ++ &priv->platform_device->dev, "legion_hwmon", priv, ++ legion_hwmon_groups); ++ if (IS_ERR_OR_NULL(hwmon_dev)) { ++ pr_err("hwmon_device_register failed!\n"); ++ return PTR_ERR(hwmon_dev); ++ } ++ dev_set_drvdata(hwmon_dev, priv); ++ priv->hwmon_dev = hwmon_dev; ++ return 0; ++} ++ ++void legion_hwmon_exit(struct legion_private *priv) ++{ ++ pr_info("Unloading legion hwon\n"); ++ if (priv->hwmon_dev) { ++ hwmon_device_unregister(priv->hwmon_dev); ++ priv->hwmon_dev = NULL; ++ } ++ pr_info("Unloading legion hwon done\n"); ++} ++ ++/* ============================= */ ++/* Platform driver */ ++/* ============================ */ ++ ++int legion_add(struct platform_device *pdev) ++{ ++ struct legion_private *priv; ++ const struct dmi_system_id *dmi_sys; ++ int err; ++ u16 ec_read_id; ++ bool is_denied = true; ++ bool is_allowed = false; ++ bool do_load_by_list = false; ++ bool do_load = false; ++ //struct legion_private *priv = dev_get_drvdata(&pdev->dev); ++ dev_info(&pdev->dev, "legion_laptop platform driver probing\n"); ++ ++ dev_info(&pdev->dev, "Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n", ++ dmi_get_system_info(DMI_SYS_VENDOR), ++ dmi_get_system_info(DMI_PRODUCT_NAME), ++ dmi_get_system_info(DMI_BIOS_VERSION)); ++ ++ // TODO: allocate? ++ priv = &_priv; ++ priv->platform_device = pdev; ++ err = legion_shared_init(priv); ++ if (err) { ++ dev_info(&pdev->dev, "legion_laptop is forced to load.\n"); ++ goto err_legion_shared_init; ++ } ++ dev_set_drvdata(&pdev->dev, priv); ++ ++ // TODO: remove ++ pr_info("Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n", ++ dmi_get_system_info(DMI_SYS_VENDOR), ++ dmi_get_system_info(DMI_PRODUCT_NAME), ++ dmi_get_system_info(DMI_BIOS_VERSION)); ++ ++ dmi_sys = dmi_first_match(optimistic_allowlist); ++ is_allowed = dmi_sys != NULL; ++ is_denied = dmi_check_system(denylist); ++ do_load_by_list = is_allowed && !is_denied; ++ do_load = do_load_by_list || force; ++ ++ dev_info( ++ &pdev->dev, ++ "is_denied: %d; is_allowed: %d; do_load_by_list: %d; do_load: %d\n", ++ is_denied, is_allowed, do_load_by_list, do_load); ++ ++ if (!(do_load)) { ++ dev_info( ++ &pdev->dev, ++ "Module not useable for this laptop because it is not in allowlist. 
Notify maintainer if you want to add your device or force load with param force.\n"); ++ err = -ENOMEM; ++ goto err_model_mismtach; ++ } ++ ++ if (force) ++ dev_info(&pdev->dev, "legion_laptop is forced to load.\n"); ++ ++ if (!do_load_by_list && do_load) { ++ dev_info( ++ &pdev->dev, ++ "legion_laptop is forced to load and would otherwise be not loaded\n"); ++ } ++ ++ // if forced and no module found, use config for first model ++ if (dmi_sys == NULL) ++ dmi_sys = &optimistic_allowlist[0]; ++ dev_info(&pdev->dev, "Using configuration for system: %s\n", ++ dmi_sys->ident); ++ ++ priv->conf = dmi_sys->driver_data; ++ ++ err = ecram_init(&priv->ecram, priv->conf->memoryio_physical_ec_start, ++ priv->conf->memoryio_size); ++ if (err) { ++ dev_info(&pdev->dev, ++ "Could not init access to embedded controller\n"); ++ goto err_ecram_init; ++ } ++ ++ ec_read_id = read_ec_id(&priv->ecram, priv->conf); ++ dev_info(&pdev->dev, "Read embedded controller ID 0x%x\n", ec_read_id); ++ if (priv->conf->check_embedded_controller_id && ++ !(ec_read_id == priv->conf->embedded_controller_id)) { ++ err = -ENOMEM; ++ dev_info(&pdev->dev, "Expected EC chip id 0x%x but read 0x%x\n", ++ priv->conf->embedded_controller_id, ec_read_id); ++ goto err_ecram_id; ++ } ++ if (!priv->conf->check_embedded_controller_id) { ++ dev_info(&pdev->dev, ++ "Skipped checking embedded controller id\n"); ++ } ++ ++ dev_info(&pdev->dev, "Creating debugfs inteface\n"); ++ legion_debugfs_init(priv); ++ ++ pr_info("Creating sysfs inteface\n"); ++ err = legion_sysfs_init(priv); ++ if (err) { ++ dev_info(&pdev->dev, "Creating sysfs interface failed\n"); ++ goto err_sysfs_init; ++ } ++ ++ pr_info("Creating hwmon interface"); ++ err = legion_hwmon_init(priv); ++ if (err) ++ goto err_hwmon_init; ++ ++ pr_info("Creating platform profile support\n"); ++ err = legion_platform_profile_init(priv); ++ if (err) { ++ dev_info(&pdev->dev, "Creating platform profile failed\n"); ++ goto err_platform_profile; ++ } ++ ++ pr_info("Init WMI driver support\n"); ++ err = legion_wmi_init(); ++ if (err) { ++ dev_info(&pdev->dev, "Init WMI driver failed\n"); ++ goto err_wmi; ++ } ++ ++ dev_info(&pdev->dev, "legion_laptop loaded for this device\n"); ++ return 0; ++ ++ // TODO: remove eventually ++ legion_wmi_exit(); ++err_wmi: ++ legion_platform_profile_exit(priv); ++err_platform_profile: ++ legion_hwmon_exit(priv); ++err_hwmon_init: ++ legion_sysfs_exit(priv); ++err_sysfs_init: ++ legion_debugfs_exit(priv); ++err_ecram_id: ++ ecram_exit(&priv->ecram); ++err_ecram_init: ++ legion_shared_exit(priv); ++err_legion_shared_init: ++err_model_mismtach: ++ dev_info(&pdev->dev, "legion_laptop not loaded for this device\n"); ++ return err; ++} ++ ++int legion_remove(struct platform_device *pdev) ++{ ++ struct legion_private *priv = dev_get_drvdata(&pdev->dev); ++ ++ mutex_lock(&legion_shared_mutex); ++ priv->loaded = false; ++ mutex_unlock(&legion_shared_mutex); ++ ++ // first unregister wmi, so toggling powermode does not ++ // generate events anymore that even might be delayed ++ legion_wmi_exit(); ++ legion_platform_profile_exit(priv); ++ ++ // toggle power mode to load default setting from embedded controller ++ // again ++ toggle_powermode(&priv->ecram, priv->conf); ++ ++ legion_hwmon_exit(priv); ++ legion_sysfs_exit(priv); ++ legion_debugfs_exit(priv); ++ ecram_exit(&priv->ecram); ++ legion_shared_exit(priv); ++ ++ pr_info("Legion platform unloaded\n"); ++ return 0; ++} ++ ++int legion_resume(struct platform_device *pdev) ++{ ++ //struct legion_private *priv = 
dev_get_drvdata(&pdev->dev); ++ dev_info(&pdev->dev, "Resumed in legion-laptop\n"); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_PM_SLEEP ++static int legion_pm_resume(struct device *dev) ++{ ++ //struct legion_private *priv = dev_get_drvdata(dev); ++ dev_info(dev, "Resumed PM in legion-laptop\n"); ++ ++ return 0; ++} ++#endif ++static SIMPLE_DEV_PM_OPS(legion_pm, NULL, legion_pm_resume); ++ ++// same as ideapad ++static const struct acpi_device_id legion_device_ids[] = { ++ { "PNP0C09", 0 }, // todo: change to "VPC2004" ++ { "", 0 }, ++}; ++MODULE_DEVICE_TABLE(acpi, legion_device_ids); ++ ++static struct platform_driver legion_driver = { ++ .probe = legion_add, ++ .remove = legion_remove, ++ .resume = legion_resume, ++ .driver = { ++ .name = "legion", ++ .pm = &legion_pm, ++ .acpi_match_table = ACPI_PTR(legion_device_ids), ++ }, ++}; ++ ++int __init legion_init(void) ++{ ++ int err; ++ ++ pr_info("legion_laptop starts loading\n"); ++ err = platform_driver_register(&legion_driver); ++ if (err) { ++ pr_info("legion_laptop: platform_driver_register failed\n"); ++ return err; ++ } ++ ++ return 0; ++} ++ ++module_init(legion_init); ++ ++void __exit legion_exit(void) ++{ ++ platform_driver_unregister(&legion_driver); ++ pr_info("legion_laptop exit\n"); ++} ++ ++module_exit(legion_exit); diff --git a/drivers/platform/x86/steamdeck.c b/drivers/platform/x86/steamdeck.c new file mode 100644 index 000000000000..77a6677ec19e @@ -7079,10 +9895,10 @@ index ab0c5bd1a60f..f4989f706d7f 100644 -- 2.40.1 -From a6fac309dae53f34208de29f5b82d053ca55eed6 Mon Sep 17 00:00:00 2001 +From 9e165ac849652399c952c5e1764ca9a7630a28c7 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Wed, 26 Apr 2023 22:04:18 +0200 -Subject: [PATCH 4/8] fixes +Date: Tue, 25 Apr 2023 17:17:39 +0200 +Subject: [PATCH 04/10] fixes Signed-off-by: Peter Jung --- @@ -9206,10 +12022,12146 @@ index 75020edd39e7..e4455220e9fd 100644 -- 2.40.1 -From 75780f643d87d4f249b25a14bcc99b767209fa2b Mon Sep 17 00:00:00 2001 +From d3a7d6477e59e6015a1e50ac35a341c4aa4c7324 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 28 Apr 2023 19:59:05 +0200 +Subject: [PATCH 05/10] fs-patches + +Signed-off-by: Peter Jung +--- + block/Kconfig | 3 + + block/blk-cgroup.c | 78 +- + block/blk-cgroup.h | 15 +- + block/blk-core.c | 3 - + fs/btrfs/Kconfig | 1 + + fs/btrfs/bio.c | 211 +- + fs/btrfs/bio.h | 22 +- + fs/btrfs/block-group.c | 40 +- + fs/btrfs/block-group.h | 13 +- + fs/btrfs/block-rsv.c | 21 +- + fs/btrfs/block-rsv.h | 2 +- + fs/btrfs/btrfs_inode.h | 35 +- + fs/btrfs/compression.c | 299 +-- + fs/btrfs/compression.h | 20 +- + fs/btrfs/ctree.c | 91 +- + fs/btrfs/ctree.h | 17 +- + fs/btrfs/delalloc-space.c | 2 +- + fs/btrfs/delayed-ref.c | 49 +- + fs/btrfs/delayed-ref.h | 22 +- + fs/btrfs/disk-io.c | 147 +- + fs/btrfs/extent-tree.c | 37 +- + fs/btrfs/extent_io.c | 550 ++-- + fs/btrfs/file-item.c | 93 +- + fs/btrfs/file-item.h | 3 +- + fs/btrfs/fs.h | 53 +- + fs/btrfs/inode-item.c | 15 +- + fs/btrfs/inode.c | 375 ++- + fs/btrfs/ioctl.c | 5 + + fs/btrfs/locking.c | 25 +- + fs/btrfs/locking.h | 5 +- + fs/btrfs/lru_cache.h | 5 - + fs/btrfs/lzo.c | 17 +- + fs/btrfs/messages.c | 2 +- + fs/btrfs/messages.h | 2 +- + fs/btrfs/ordered-data.c | 120 +- + fs/btrfs/ordered-data.h | 10 +- + fs/btrfs/raid56.c | 162 +- + fs/btrfs/raid56.h | 12 +- + fs/btrfs/relocation.c | 6 +- + fs/btrfs/scrub.c | 4142 +++++++++-------------------- + fs/btrfs/send.c | 2 +- + fs/btrfs/space-info.c | 32 +- + fs/btrfs/space-info.h | 1 + + fs/btrfs/super.c | 3 +- + fs/btrfs/sysfs.c | 5 + + 
fs/btrfs/tests/extent-map-tests.c | 1 - + fs/btrfs/transaction.c | 28 +- + fs/btrfs/tree-checker.c | 14 + + fs/btrfs/tree-log.c | 171 +- + fs/btrfs/volumes.c | 593 ++--- + fs/btrfs/volumes.h | 85 +- + fs/btrfs/zlib.c | 2 - + fs/btrfs/zoned.c | 4 +- + fs/btrfs/zstd.c | 1 - + include/linux/bio.h | 5 + + include/linux/blk_types.h | 18 +- + include/linux/crc32c.h | 1 - + include/linux/writeback.h | 5 - + include/trace/events/btrfs.h | 2 +- + include/uapi/linux/btrfs.h | 1 + + lib/libcrc32c.c | 6 - + tools/objtool/check.c | 1 + + 62 files changed, 2867 insertions(+), 4844 deletions(-) + +diff --git a/block/Kconfig b/block/Kconfig +index 941b2dca70db..69ccf7457ae1 100644 +--- a/block/Kconfig ++++ b/block/Kconfig +@@ -41,6 +41,9 @@ config BLK_RQ_ALLOC_TIME + config BLK_CGROUP_RWSTAT + bool + ++config BLK_CGROUP_PUNT_BIO ++ bool ++ + config BLK_DEV_BSG_COMMON + tristate + +diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c +index bd50b55bdb61..18c922579719 100644 +--- a/block/blk-cgroup.c ++++ b/block/blk-cgroup.c +@@ -56,7 +56,6 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; + static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ + + bool blkcg_debug_stats = false; +-static struct workqueue_struct *blkcg_punt_bio_wq; + + #define BLKG_DESTROY_BATCH_SIZE 64 + +@@ -166,7 +165,9 @@ static void __blkg_release(struct rcu_head *rcu) + { + struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + ++#ifdef CONFIG_BLK_CGROUP_PUNT_BIO + WARN_ON(!bio_list_empty(&blkg->async_bios)); ++#endif + + /* release the blkcg and parent blkg refs this blkg has been holding */ + css_put(&blkg->blkcg->css); +@@ -188,6 +189,9 @@ static void blkg_release(struct percpu_ref *ref) + call_rcu(&blkg->rcu_head, __blkg_release); + } + ++#ifdef CONFIG_BLK_CGROUP_PUNT_BIO ++static struct workqueue_struct *blkcg_punt_bio_wq; ++ + static void blkg_async_bio_workfn(struct work_struct *work) + { + struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, +@@ -198,10 +202,10 @@ static void blkg_async_bio_workfn(struct work_struct *work) + bool need_plug = false; + + /* as long as there are pending bios, @blkg can't go away */ +- spin_lock_bh(&blkg->async_bio_lock); ++ spin_lock(&blkg->async_bio_lock); + bio_list_merge(&bios, &blkg->async_bios); + bio_list_init(&blkg->async_bios); +- spin_unlock_bh(&blkg->async_bio_lock); ++ spin_unlock(&blkg->async_bio_lock); + + /* start plug only when bio_list contains at least 2 bios */ + if (bios.head && bios.head->bi_next) { +@@ -214,6 +218,40 @@ static void blkg_async_bio_workfn(struct work_struct *work) + blk_finish_plug(&plug); + } + ++/* ++ * When a shared kthread issues a bio for a cgroup, doing so synchronously can ++ * lead to priority inversions as the kthread can be trapped waiting for that ++ * cgroup. Use this helper instead of submit_bio to punt the actual issuing to ++ * a dedicated per-blkcg work item to avoid such priority inversions. 
++ */ ++void blkcg_punt_bio_submit(struct bio *bio) ++{ ++ struct blkcg_gq *blkg = bio->bi_blkg; ++ ++ if (blkg->parent) { ++ spin_lock(&blkg->async_bio_lock); ++ bio_list_add(&blkg->async_bios, bio); ++ spin_unlock(&blkg->async_bio_lock); ++ queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); ++ } else { ++ /* never bounce for the root cgroup */ ++ submit_bio(bio); ++ } ++} ++EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit); ++ ++static int __init blkcg_punt_bio_init(void) ++{ ++ blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", ++ WQ_MEM_RECLAIM | WQ_FREEZABLE | ++ WQ_UNBOUND | WQ_SYSFS, 0); ++ if (!blkcg_punt_bio_wq) ++ return -ENOMEM; ++ return 0; ++} ++subsys_initcall(blkcg_punt_bio_init); ++#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */ ++ + /** + * bio_blkcg_css - return the blkcg CSS associated with a bio + * @bio: target bio +@@ -269,10 +307,12 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, + + blkg->q = disk->queue; + INIT_LIST_HEAD(&blkg->q_node); ++ blkg->blkcg = blkcg; ++#ifdef CONFIG_BLK_CGROUP_PUNT_BIO + spin_lock_init(&blkg->async_bio_lock); + bio_list_init(&blkg->async_bios); + INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); +- blkg->blkcg = blkcg; ++#endif + + u64_stats_init(&blkg->iostat.sync); + for_each_possible_cpu(cpu) { +@@ -1688,25 +1728,6 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) + } + EXPORT_SYMBOL_GPL(blkcg_policy_unregister); + +-bool __blkcg_punt_bio_submit(struct bio *bio) +-{ +- struct blkcg_gq *blkg = bio->bi_blkg; +- +- /* consume the flag first */ +- bio->bi_opf &= ~REQ_CGROUP_PUNT; +- +- /* never bounce for the root cgroup */ +- if (!blkg->parent) +- return false; +- +- spin_lock_bh(&blkg->async_bio_lock); +- bio_list_add(&blkg->async_bios, bio); +- spin_unlock_bh(&blkg->async_bio_lock); +- +- queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); +- return true; +-} +- + /* + * Scale the accumulated delay based on how long it has been since we updated + * the delay. 
We only call this when we are adding delay, in case it's been a +@@ -2085,16 +2106,5 @@ bool blk_cgroup_congested(void) + return ret; + } + +-static int __init blkcg_init(void) +-{ +- blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", +- WQ_MEM_RECLAIM | WQ_FREEZABLE | +- WQ_UNBOUND | WQ_SYSFS, 0); +- if (!blkcg_punt_bio_wq) +- return -ENOMEM; +- return 0; +-} +-subsys_initcall(blkcg_init); +- + module_param(blkcg_debug_stats, bool, 0644); + MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); +diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h +index 9c5078755e5e..e98d2c1be354 100644 +--- a/block/blk-cgroup.h ++++ b/block/blk-cgroup.h +@@ -72,9 +72,10 @@ struct blkcg_gq { + struct blkg_iostat_set iostat; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; +- ++#ifdef CONFIG_BLK_CGROUP_PUNT_BIO + spinlock_t async_bio_lock; + struct bio_list async_bios; ++#endif + union { + struct work_struct async_bio_work; + struct work_struct free_work; +@@ -375,16 +376,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q))) + +-bool __blkcg_punt_bio_submit(struct bio *bio); +- +-static inline bool blkcg_punt_bio_submit(struct bio *bio) +-{ +- if (bio->bi_opf & REQ_CGROUP_PUNT) +- return __blkcg_punt_bio_submit(bio); +- else +- return false; +-} +- + static inline void blkcg_bio_issue_init(struct bio *bio) + { + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +@@ -506,8 +497,6 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return + static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } + static inline void blkg_get(struct blkcg_gq *blkg) { } + static inline void blkg_put(struct blkcg_gq *blkg) { } +- +-static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } + static inline void blkcg_bio_issue_init(struct bio *bio) { } + static inline void blk_cgroup_bio_start(struct bio *bio) { } + static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } +diff --git a/block/blk-core.c b/block/blk-core.c +index 42926e6cb83c..478978dcb2bd 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -830,9 +830,6 @@ EXPORT_SYMBOL(submit_bio_noacct); + */ + void submit_bio(struct bio *bio) + { +- if (blkcg_punt_bio_submit(bio)) +- return; +- + if (bio_op(bio) == REQ_OP_READ) { + task_io_account_read(bio->bi_iter.bi_size); + count_vm_events(PGPGIN, bio_sectors(bio)); +diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig +index 37b6bab90c83..66fa9ab2c046 100644 +--- a/fs/btrfs/Kconfig ++++ b/fs/btrfs/Kconfig +@@ -2,6 +2,7 @@ + + config BTRFS_FS + tristate "Btrfs filesystem support" ++ select BLK_CGROUP_PUNT_BIO + select CRYPTO + select CRYPTO_CRC32C + select LIBCRC32C +diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c +index 726592868e9c..5379c4714905 100644 +--- a/fs/btrfs/bio.c ++++ b/fs/btrfs/bio.c +@@ -31,11 +31,11 @@ struct btrfs_failed_bio { + * Initialize a btrfs_bio structure. This skips the embedded bio itself as it + * is already initialized by the block layer. 
+ */ +-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, ++void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private) + { + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); +- bbio->inode = inode; ++ bbio->fs_info = fs_info; + bbio->end_io = end_io; + bbio->private = private; + atomic_set(&bbio->pending_ios, 1); +@@ -48,41 +48,58 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + * Just like the underlying bio_alloc_bioset it will not fail as it is backed by + * a mempool. + */ +-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, +- struct btrfs_inode *inode, +- btrfs_bio_end_io_t end_io, void *private) ++struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, ++ struct btrfs_fs_info *fs_info, ++ btrfs_bio_end_io_t end_io, void *private) + { ++ struct btrfs_bio *bbio; + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); +- btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); +- return bio; ++ bbio = btrfs_bio(bio); ++ btrfs_bio_init(bbio, fs_info, end_io, private); ++ return bbio; + } + +-static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, +- struct bio *orig, u64 map_length, +- bool use_append) ++static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio) + { +- struct btrfs_bio *orig_bbio = btrfs_bio(orig); ++ struct btrfs_ordered_extent *ordered; ++ int ret; ++ ++ ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); ++ if (WARN_ON_ONCE(!ordered)) ++ return BLK_STS_IOERR; ++ ret = btrfs_extract_ordered_extent(bbio, ordered); ++ btrfs_put_ordered_extent(ordered); ++ ++ return errno_to_blk_status(ret); ++} ++ ++static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, ++ struct btrfs_bio *orig_bbio, ++ u64 map_length, bool use_append) ++{ ++ struct btrfs_bio *bbio; + struct bio *bio; + + if (use_append) { + unsigned int nr_segs; + +- bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, ++ bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, + &btrfs_clone_bioset, map_length); + } else { +- bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, +- &btrfs_clone_bioset); ++ bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, ++ GFP_NOFS, &btrfs_clone_bioset); + } +- btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); +- +- btrfs_bio(bio)->file_offset = orig_bbio->file_offset; +- if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) ++ bbio = btrfs_bio(bio); ++ btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); ++ bbio->inode = orig_bbio->inode; ++ bbio->file_offset = orig_bbio->file_offset; ++ if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED)) + orig_bbio->file_offset += map_length; + + atomic_inc(&orig_bbio->pending_ios); +- return bio; ++ return bbio; + } + + static void btrfs_orig_write_end_io(struct bio *bio); +@@ -164,7 +181,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, + goto done; + } + +- btrfs_submit_bio(&repair_bbio->bio, mirror); ++ btrfs_submit_bio(repair_bbio, mirror); + return; + } + +@@ -224,15 +241,16 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, + repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + &btrfs_repair_bioset); + repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; +- bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); ++ __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + + 
repair_bbio = btrfs_bio(repair_bio); +- btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); ++ btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); ++ repair_bbio->inode = failed_bbio->inode; + repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + + mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); + btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); +- btrfs_submit_bio(repair_bio, mirror); ++ btrfs_submit_bio(repair_bbio, mirror); + return fbio; + } + +@@ -246,6 +264,9 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de + struct btrfs_failed_bio *fbio = NULL; + u32 offset = 0; + ++ /* Read-repair requires the inode field to be set by the submitter. */ ++ ASSERT(inode); ++ + /* + * Hand off repair bios to the repair code as there is no upper level + * submitter for them. +@@ -306,17 +327,17 @@ static void btrfs_end_bio_work(struct work_struct *work) + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + + /* Metadata reads are checked and repaired by the submitter. */ +- if (bbio->bio.bi_opf & REQ_META) +- bbio->end_io(bbio); +- else ++ if (bbio->inode && !(bbio->bio.bi_opf & REQ_META)) + btrfs_check_read_bio(bbio, bbio->bio.bi_private); ++ else ++ bbio->end_io(bbio); + } + + static void btrfs_simple_end_io(struct bio *bio) + { + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_device *dev = bio->bi_private; +- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ struct btrfs_fs_info *fs_info = bbio->fs_info; + + btrfs_bio_counter_dec(fs_info); + +@@ -340,7 +361,8 @@ static void btrfs_raid56_end_io(struct bio *bio) + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; +- if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) ++ if (bio_op(bio) == REQ_OP_READ && bbio->inode && ++ !(bbio->bio.bi_opf & REQ_META)) + btrfs_check_read_bio(bbio, NULL); + else + btrfs_orig_bbio_end_io(bbio); +@@ -418,7 +440,11 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) + dev->devid, bio->bi_iter.bi_size); + + btrfsic_check_bio(bio); +- submit_bio(bio); ++ ++ if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) ++ blkcg_punt_bio_submit(bio); ++ else ++ submit_bio(bio); + } + + static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) +@@ -534,10 +560,10 @@ static void run_one_async_done(struct btrfs_work *work) + + /* + * All of the bios that pass through here are from async helpers. +- * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. +- * This changes nothing when cgroups aren't in use. ++ * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's ++ * context. This changes nothing when cgroups aren't in use. + */ +- bio->bi_opf |= REQ_CGROUP_PUNT; ++ bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; + __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); + } + +@@ -562,7 +588,7 @@ static bool should_async_write(struct btrfs_bio *bbio) + * in order. 
+ */ + if (bbio->bio.bi_opf & REQ_META) { +- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ struct btrfs_fs_info *fs_info = bbio->fs_info; + + if (btrfs_is_zoned(fs_info)) + return false; +@@ -582,7 +608,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) + { +- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ struct btrfs_fs_info *fs_info = bbio->fs_info; + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); +@@ -603,12 +629,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, + return true; + } + +-static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) ++static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) + { +- struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_inode *inode = bbio->inode; +- struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_bio *orig_bbio = bbio; ++ struct bio *bio = &bbio->bio; + u64 logical = bio->bi_iter.bi_sector << 9; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; +@@ -631,15 +657,15 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) + map_length = min(map_length, fs_info->max_zone_append_size); + + if (map_length < length) { +- bio = btrfs_split_bio(fs_info, bio, map_length, use_append); +- bbio = btrfs_bio(bio); ++ bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); ++ bio = &bbio->bio; + } + + /* + * Save the iter for the end_io handler and preload the checksums for + * data reads. + */ +- if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { ++ if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) { + bbio->saved_iter = bio->bi_iter; + ret = btrfs_lookup_bio_sums(bbio); + if (ret) +@@ -650,7 +676,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) + if (use_append) { + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; +- ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); ++ ret = btrfs_bio_extract_ordered_extent(bbio); + if (ret) + goto fail_put_bio; + } +@@ -659,7 +685,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) + * Csum items for reloc roots have already been cloned at this + * point, so they are handled as part of the no-checksum case. + */ +- if (!(inode->flags & BTRFS_INODE_NODATASUM) && ++ if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(inode->root)) { + if (should_async_write(bbio) && +@@ -686,9 +712,12 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) + return true; + } + +-void btrfs_submit_bio(struct bio *bio, int mirror_num) ++void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) + { +- while (!btrfs_submit_chunk(bio, mirror_num)) ++ /* If bbio->inode is not populated, its file_offset must be 0. 
*/ ++ ASSERT(bbio->inode || bbio->file_offset == 0); ++ ++ while (!btrfs_submit_chunk(bbio, mirror_num)) + ; + } + +@@ -706,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) + { +- struct btrfs_device *dev; ++ struct btrfs_io_stripe smap = { 0 }; + struct bio_vec bvec; + struct bio bio; +- u64 map_length = 0; +- u64 sector; +- struct btrfs_io_context *bioc = NULL; + int ret = 0; + + ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); +@@ -720,68 +746,38 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + if (btrfs_repair_one_zone(fs_info, logical)) + return 0; + +- map_length = length; +- + /* + * Avoid races with device replace and make sure our bioc has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); +- if (btrfs_is_parity_mirror(fs_info, logical, length)) { +- /* +- * Note that we don't use BTRFS_MAP_WRITE because it's supposed +- * to update all raid stripes, but here we just want to correct +- * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad +- * stripe's dev and sector. +- */ +- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, +- &map_length, &bioc, 0); +- if (ret) +- goto out_counter_dec; +- ASSERT(bioc->mirror_num == 1); +- } else { +- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, +- &map_length, &bioc, mirror_num); +- if (ret) +- goto out_counter_dec; +- /* +- * This happens when dev-replace is also running, and the +- * mirror_num indicates the dev-replace target. +- * +- * In this case, we don't need to do anything, as the read +- * error just means the replace progress hasn't reached our +- * read range, and later replace routine would handle it well. +- */ +- if (mirror_num != bioc->mirror_num) +- goto out_counter_dec; +- } +- +- sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; +- dev = bioc->stripes[bioc->mirror_num - 1].dev; +- btrfs_put_bioc(bioc); ++ ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); ++ if (ret < 0) ++ goto out_counter_dec; + +- if (!dev || !dev->bdev || +- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { ++ if (!smap.dev->bdev || ++ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) { + ret = -EIO; + goto out_counter_dec; + } + +- bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); +- bio.bi_iter.bi_sector = sector; ++ bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); ++ bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { + /* try to remap that extent elsewhere? */ +- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); ++ btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS); + goto out_bio_uninit; + } + + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", +- ino, start, btrfs_dev_name(dev), sector); ++ ino, start, btrfs_dev_name(smap.dev), ++ smap.physical >> SECTOR_SHIFT); + ret = 0; + + out_bio_uninit: +@@ -791,6 +787,45 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + return ret; + } + ++/* ++ * Submit a btrfs_bio based repair write. ++ * ++ * If @dev_replace is true, the write would be submitted to dev-replace target. 
++ */ ++void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace) ++{ ++ struct btrfs_fs_info *fs_info = bbio->fs_info; ++ u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; ++ u64 length = bbio->bio.bi_iter.bi_size; ++ struct btrfs_io_stripe smap = { 0 }; ++ int ret; ++ ++ ASSERT(fs_info); ++ ASSERT(mirror_num > 0); ++ ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); ++ ASSERT(!bbio->inode); ++ ++ btrfs_bio_counter_inc_blocked(fs_info); ++ ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); ++ if (ret < 0) ++ goto fail; ++ ++ if (dev_replace) { ++ if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) { ++ bbio->bio.bi_opf &= ~REQ_OP_WRITE; ++ bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND; ++ } ++ ASSERT(smap.dev == fs_info->dev_replace.srcdev); ++ smap.dev = fs_info->dev_replace.tgtdev; ++ } ++ __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); ++ return; ++ ++fail: ++ btrfs_bio_counter_dec(fs_info); ++ btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); ++} ++ + int __init btrfs_bioset_init(void) + { + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, +diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h +index 873ff85817f0..a8eca3a65673 100644 +--- a/fs/btrfs/bio.h ++++ b/fs/btrfs/bio.h +@@ -30,7 +30,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + * passed to btrfs_submit_bio for mapping to the physical devices. + */ + struct btrfs_bio { +- /* Inode and offset into it that this I/O operates on. */ ++ /* ++ * Inode and offset into it that this I/O operates on. ++ * Only set for data I/O. ++ */ + struct btrfs_inode *inode; + u64 file_offset; + +@@ -58,6 +61,9 @@ struct btrfs_bio { + atomic_t pending_ios; + struct work_struct end_io_work; + ++ /* File system that this I/O operates on. */ ++ struct btrfs_fs_info *fs_info; ++ + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_bio but relies on bio being last. +@@ -73,11 +79,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) + int __init btrfs_bioset_init(void); + void __cold btrfs_bioset_exit(void); + +-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, ++void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private); +-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, +- struct btrfs_inode *inode, +- btrfs_bio_end_io_t end_io, void *private); ++struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, ++ struct btrfs_fs_info *fs_info, ++ btrfs_bio_end_io_t end_io, void *private); + + static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) + { +@@ -88,7 +94,11 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) + /* Bio only refers to one ordered extent. */ + #define REQ_BTRFS_ONE_ORDERED REQ_DRV + +-void btrfs_submit_bio(struct bio *bio, int mirror_num); ++/* Submit using blkcg_punt_bio_submit. 
*/ ++#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE ++ ++void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); ++void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); + int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); +diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c +index 5fc670c27f86..957ad1c31c4f 100644 +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -160,15 +160,6 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) + btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, + cache); + +- /* +- * If not empty, someone is still holding mutex of +- * full_stripe_lock, which can only be released by caller. +- * And it will definitely cause use-after-free when caller +- * tries to release full stripe lock. +- * +- * No better way to resolve, but only to warn. +- */ +- WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); + kfree(cache->free_space_ctl); + kfree(cache->physical_map); + kfree(cache); +@@ -1977,12 +1968,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + + map = em->map_lookup; + data_stripe_length = em->orig_block_len; +- io_stripe_size = map->stripe_len; ++ io_stripe_size = BTRFS_STRIPE_LEN; + chunk_start = em->start; + + /* For RAID5/6 adjust to a full IO stripe length */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) +- io_stripe_size = map->stripe_len * nr_data_stripes(map); ++ io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + + buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); + if (!buf) { +@@ -1992,28 +1983,28 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + + for (i = 0; i < map->num_stripes; i++) { + bool already_inserted = false; +- u64 stripe_nr; +- u64 offset; ++ u32 stripe_nr; ++ u32 offset; + int j; + + if (!in_range(physical, map->stripes[i].physical, + data_stripe_length)) + continue; + +- stripe_nr = physical - map->stripes[i].physical; +- stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); ++ stripe_nr = (physical - map->stripes[i].physical) >> ++ BTRFS_STRIPE_LEN_SHIFT; ++ offset = (physical - map->stripes[i].physical) & ++ BTRFS_STRIPE_LEN_MASK; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | +- BTRFS_BLOCK_GROUP_RAID10)) { +- stripe_nr = stripe_nr * map->num_stripes + i; +- stripe_nr = div_u64(stripe_nr, map->sub_stripes); +- } ++ BTRFS_BLOCK_GROUP_RAID10)) ++ stripe_nr = div_u64(stripe_nr * map->num_stripes + i, ++ map->sub_stripes); + /* + * The remaining case would be for RAID56, multiply by + * nr_data_stripes(). 
Alternatively, just use rmap_len below + * instead of map->stripe_len + */ +- + bytenr = chunk_start + stripe_nr * io_stripe_size + offset; + + /* Ensure we don't add duplicate addresses */ +@@ -2124,8 +2115,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( + btrfs_init_free_space_ctl(cache, cache->free_space_ctl); + atomic_set(&cache->frozen, 0); + mutex_init(&cache->free_space_lock); +- cache->full_stripe_locks_root.root = RB_ROOT; +- mutex_init(&cache->full_stripe_locks_root.lock); + + return cache; + } +@@ -2672,7 +2661,7 @@ static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) + } + + struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, +- u64 bytes_used, u64 type, ++ u64 type, + u64 chunk_offset, u64 size) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +@@ -2687,7 +2676,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran + + cache->length = size; + set_free_space_tree_thresholds(cache); +- cache->used = bytes_used; + cache->flags = type; + cache->cached = BTRFS_CACHE_FINISHED; + cache->global_root_id = calculate_global_root_id(fs_info, cache->start); +@@ -2738,9 +2726,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran + + #ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(cache)) { +- u64 new_bytes_used = size - bytes_used; +- +- cache->space_info->bytes_used += new_bytes_used >> 1; ++ cache->space_info->bytes_used += size >> 1; + fragment_free_space(cache); + } + #endif +diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h +index 6e4a0b429ac3..cc0e4b37db2d 100644 +--- a/fs/btrfs/block-group.h ++++ b/fs/btrfs/block-group.h +@@ -91,14 +91,6 @@ struct btrfs_caching_control { + /* Once caching_thread() finds this much free space, it will wake up waiters. */ + #define CACHING_CTL_WAKE_UP SZ_2M + +-/* +- * Tree to record all locked full stripes of a RAID5/6 block group +- */ +-struct btrfs_full_stripe_locks_tree { +- struct rb_root root; +- struct mutex lock; +-}; +- + struct btrfs_block_group { + struct btrfs_fs_info *fs_info; + struct inode *inode; +@@ -229,9 +221,6 @@ struct btrfs_block_group { + */ + int swap_extents; + +- /* Record locked full stripes for RAID5/6 block group */ +- struct btrfs_full_stripe_locks_tree full_stripe_locks_root; +- + /* + * Allocation offset for the block group to implement sequential + * allocation. This is used only on a zoned filesystem. 
+@@ -302,7 +291,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); + void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); + int btrfs_read_block_groups(struct btrfs_fs_info *info); + struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, +- u64 bytes_used, u64 type, ++ u64 type, + u64 chunk_offset, u64 size); + void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); + int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, +diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c +index 5367a14d44d2..3ab707e26fa2 100644 +--- a/fs/btrfs/block-rsv.c ++++ b/fs/btrfs/block-rsv.c +@@ -232,9 +232,6 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) + u64 num_bytes = 0; + int ret = -ENOSPC; + +- if (!block_rsv) +- return 0; +- + spin_lock(&block_rsv->lock); + num_bytes = mult_perc(block_rsv->size, min_percent); + if (block_rsv->reserved >= num_bytes) +@@ -245,17 +242,15 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) + } + + int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, +- struct btrfs_block_rsv *block_rsv, u64 min_reserved, ++ struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) + { +- u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); +- num_bytes = min_reserved; + if (block_rsv->reserved >= num_bytes) + ret = 0; + else +@@ -355,17 +350,19 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) + + /* + * But we also want to reserve enough space so we can do the fallback +- * global reserve for an unlink, which is an additional 5 items (see the +- * comment in __unlink_start_trans for what we're modifying.) ++ * global reserve for an unlink, which is an additional ++ * BTRFS_UNLINK_METADATA_UNITS items. + * + * But we also need space for the delayed ref updates from the unlink, +- * so its 10, 5 for the actual operation, and 5 for the delayed ref +- * updates. ++ * so add BTRFS_UNLINK_METADATA_UNITS units for delayed refs, one for ++ * each unlink metadata item. 
+ */ +- min_items += 10; ++ min_items += BTRFS_UNLINK_METADATA_UNITS; + + num_bytes = max_t(u64, num_bytes, +- btrfs_calc_insert_metadata_size(fs_info, min_items)); ++ btrfs_calc_insert_metadata_size(fs_info, min_items) + ++ btrfs_calc_delayed_ref_bytes(fs_info, ++ BTRFS_UNLINK_METADATA_UNITS)); + + spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); +diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h +index 4cc41c9aaa82..6dc781709aca 100644 +--- a/fs/btrfs/block-rsv.h ++++ b/fs/btrfs/block-rsv.h +@@ -65,7 +65,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); + int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent); + int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, +- struct btrfs_block_rsv *block_rsv, u64 min_reserved, ++ struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); + int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, u64 num_bytes, +diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h +index 9dc21622806e..ec2ae4406c16 100644 +--- a/fs/btrfs/btrfs_inode.h ++++ b/fs/btrfs/btrfs_inode.h +@@ -142,11 +142,22 @@ struct btrfs_inode { + /* a local copy of root's last_log_commit */ + int last_log_commit; + +- /* +- * Total number of bytes pending delalloc, used by stat to calculate the +- * real block usage of the file. This is used only for files. +- */ +- u64 delalloc_bytes; ++ union { ++ /* ++ * Total number of bytes pending delalloc, used by stat to ++ * calculate the real block usage of the file. This is used ++ * only for files. ++ */ ++ u64 delalloc_bytes; ++ /* ++ * The lowest possible index of the next dir index key which ++ * points to an inode that needs to be logged. ++ * This is used only for directories. ++ * Use the helpers btrfs_get_first_dir_index_to_log() and ++ * btrfs_set_first_dir_index_to_log() to access this field. 
++ */ ++ u64 first_dir_index_to_log; ++ }; + + union { + /* +@@ -247,6 +258,17 @@ struct btrfs_inode { + struct inode vfs_inode; + }; + ++static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode) ++{ ++ return READ_ONCE(inode->first_dir_index_to_log); ++} ++ ++static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, ++ u64 index) ++{ ++ WRITE_ONCE(inode->first_dir_index_to_log, index); ++} ++ + static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) + { + return container_of(inode, struct btrfs_inode, vfs_inode); +@@ -407,7 +429,8 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, + + int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); ++int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, ++ struct btrfs_ordered_extent *ordered); + bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv); + noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c +index f42f31f22d13..2d0493f0a184 100644 +--- a/fs/btrfs/compression.c ++++ b/fs/btrfs/compression.c +@@ -37,6 +37,8 @@ + #include "file-item.h" + #include "super.h" + ++struct bio_set btrfs_compressed_bioset; ++ + static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; + + const char* btrfs_compress_type2str(enum btrfs_compression_type type) +@@ -54,6 +56,25 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type) + return NULL; + } + ++static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio) ++{ ++ return container_of(bbio, struct compressed_bio, bbio); ++} ++ ++static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode, ++ u64 start, blk_opf_t op, ++ btrfs_bio_end_io_t end_io) ++{ ++ struct btrfs_bio *bbio; ++ ++ bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op, ++ GFP_NOFS, &btrfs_compressed_bioset)); ++ btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL); ++ bbio->inode = inode; ++ bbio->file_offset = start; ++ return to_compressed_bio(bbio); ++} ++ + bool btrfs_compress_is_valid_type(const char *str, size_t len) + { + int i; +@@ -139,32 +160,25 @@ static int compression_decompress(int type, struct list_head *ws, + } + } + ++static void btrfs_free_compressed_pages(struct compressed_bio *cb) ++{ ++ for (unsigned int i = 0; i < cb->nr_pages; i++) ++ put_page(cb->compressed_pages[i]); ++ kfree(cb->compressed_pages); ++} ++ + static int btrfs_decompress_bio(struct compressed_bio *cb); + + static void end_compressed_bio_read(struct btrfs_bio *bbio) + { +- struct compressed_bio *cb = bbio->private; +- unsigned int index; +- struct page *page; ++ struct compressed_bio *cb = to_compressed_bio(bbio); ++ blk_status_t status = bbio->bio.bi_status; + +- if (bbio->bio.bi_status) +- cb->status = bbio->bio.bi_status; +- else +- cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); +- +- /* Release the compressed pages */ +- for (index = 0; index < cb->nr_pages; index++) { +- page = cb->compressed_pages[index]; +- page->mapping = NULL; +- put_page(page); +- } +- +- /* Do io completion on the original bio */ +- btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status); ++ if (!status) ++ status = errno_to_blk_status(btrfs_decompress_bio(cb)); + +- /* Finally free the cb struct */ +- 
kfree(cb->compressed_pages); +- kfree(cb); ++ btrfs_free_compressed_pages(cb); ++ btrfs_bio_end_io(cb->orig_bbio, status); + bio_put(&bbio->bio); + } + +@@ -172,14 +186,14 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +-static noinline void end_compressed_writeback(struct inode *inode, +- const struct compressed_bio *cb) ++static noinline void end_compressed_writeback(const struct compressed_bio *cb) + { ++ struct inode *inode = &cb->bbio.inode->vfs_inode; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + unsigned long index = cb->start >> PAGE_SHIFT; + unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; + struct folio_batch fbatch; +- const int errno = blk_status_to_errno(cb->status); ++ const int errno = blk_status_to_errno(cb->bbio.bio.bi_status); + int i; + int ret; + +@@ -207,45 +221,25 @@ static noinline void end_compressed_writeback(struct inode *inode, + /* the inode may be gone now */ + } + +-static void finish_compressed_bio_write(struct compressed_bio *cb) ++static void btrfs_finish_compressed_write_work(struct work_struct *work) + { +- struct inode *inode = cb->inode; +- unsigned int index; ++ struct compressed_bio *cb = ++ container_of(work, struct compressed_bio, write_end_work); + + /* + * Ok, we're the last bio for this extent, step one is to call back + * into the FS and do all the end_io operations. + */ +- btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, ++ btrfs_writepage_endio_finish_ordered(cb->bbio.inode, NULL, + cb->start, cb->start + cb->len - 1, +- cb->status == BLK_STS_OK); ++ cb->bbio.bio.bi_status == BLK_STS_OK); + + if (cb->writeback) +- end_compressed_writeback(inode, cb); ++ end_compressed_writeback(cb); + /* Note, our inode could be gone now */ + +- /* +- * Release the compressed pages, these came from alloc_page and +- * are not attached to the inode at all +- */ +- for (index = 0; index < cb->nr_pages; index++) { +- struct page *page = cb->compressed_pages[index]; +- +- page->mapping = NULL; +- put_page(page); +- } +- +- /* Finally free the cb struct */ +- kfree(cb->compressed_pages); +- kfree(cb); +-} +- +-static void btrfs_finish_compressed_write_work(struct work_struct *work) +-{ +- struct compressed_bio *cb = +- container_of(work, struct compressed_bio, write_end_work); +- +- finish_compressed_bio_write(cb); ++ btrfs_free_compressed_pages(cb); ++ bio_put(&cb->bbio.bio); + } + + /* +@@ -257,13 +251,25 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) + */ + static void end_compressed_bio_write(struct btrfs_bio *bbio) + { +- struct compressed_bio *cb = bbio->private; +- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); ++ struct compressed_bio *cb = to_compressed_bio(bbio); ++ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + +- cb->status = bbio->bio.bi_status; + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); ++} + +- bio_put(&bbio->bio); ++static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) ++{ ++ struct bio *bio = &cb->bbio.bio; ++ u32 offset = 0; ++ ++ while (offset < cb->compressed_len) { ++ u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); ++ ++ /* Maximum compressed extent is smaller than bio size limit. 
*/ ++ __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT], ++ len, 0); ++ offset += len; ++ } + } + + /* +@@ -275,28 +281,24 @@ static void end_compressed_bio_write(struct btrfs_bio *bbio) + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +-blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, ++void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + unsigned int len, u64 disk_start, + unsigned int compressed_len, + struct page **compressed_pages, + unsigned int nr_pages, + blk_opf_t write_flags, +- struct cgroup_subsys_state *blkcg_css, + bool writeback) + { + struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct bio *bio = NULL; + struct compressed_bio *cb; +- u64 cur_disk_bytenr = disk_start; +- blk_status_t ret = BLK_STS_OK; + + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); +- cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); +- if (!cb) +- return BLK_STS_RESOURCE; +- cb->status = BLK_STS_OK; +- cb->inode = &inode->vfs_inode; ++ ++ write_flags |= REQ_BTRFS_ONE_ORDERED; ++ ++ cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags, ++ end_compressed_bio_write); + cb->start = start; + cb->len = len; + cb->compressed_pages = compressed_pages; +@@ -304,56 +306,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + cb->writeback = writeback; + INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); + cb->nr_pages = nr_pages; ++ cb->bbio.bio.bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; ++ btrfs_add_compressed_bio_pages(cb); + +- if (blkcg_css) { +- kthread_associate_blkcg(blkcg_css); +- write_flags |= REQ_CGROUP_PUNT; +- } +- +- write_flags |= REQ_BTRFS_ONE_ORDERED; +- bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, +- BTRFS_I(cb->inode), end_compressed_bio_write, cb); +- bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; +- btrfs_bio(bio)->file_offset = start; +- +- while (cur_disk_bytenr < disk_start + compressed_len) { +- u64 offset = cur_disk_bytenr - disk_start; +- unsigned int index = offset >> PAGE_SHIFT; +- unsigned int real_size; +- unsigned int added; +- struct page *page = compressed_pages[index]; +- +- /* +- * We have various limits on the real read size: +- * - page boundary +- * - compressed length boundary +- */ +- real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); +- real_size = min_t(u64, real_size, compressed_len - offset); +- ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); +- +- added = bio_add_page(bio, page, real_size, offset_in_page(offset)); +- /* +- * Maximum compressed extent is smaller than bio size limit, +- * thus bio_add_page() should always success. +- */ +- ASSERT(added == real_size); +- cur_disk_bytenr += added; +- } +- +- /* Finished the range. 
*/ +- ASSERT(bio->bi_iter.bi_size); +- btrfs_submit_bio(bio, 0); +- if (blkcg_css) +- kthread_associate_blkcg(NULL); +- return ret; +-} +- +-static u64 bio_end_offset(struct bio *bio) +-{ +- struct bio_vec *last = bio_last_bvec_all(bio); +- +- return page_offset(last->bv_page) + last->bv_len + last->bv_offset; ++ btrfs_submit_bio(&cb->bbio, 0); + } + + /* +@@ -374,7 +330,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, + { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + unsigned long end_index; +- u64 cur = bio_end_offset(cb->orig_bio); ++ struct bio *orig_bio = &cb->orig_bbio->bio; ++ u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; + u64 isize = i_size_read(inode); + int ret; + struct page *page; +@@ -464,7 +421,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, + */ + if (!em || cur < em->start || + (cur + fs_info->sectorsize > extent_map_end(em)) || +- (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { ++ (em->block_start >> 9) != orig_bio->bi_iter.bi_sector) { + free_extent_map(em); + unlock_extent(tree, cur, page_end, NULL); + unlock_page(page); +@@ -484,7 +441,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, + } + + add_size = min(em->start + em->len, page_end + 1) - cur; +- ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur)); ++ ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); + if (ret != add_size) { + unlock_extent(tree, cur, page_end, NULL); + unlock_page(page); +@@ -515,17 +472,14 @@ static noinline int add_ra_bio_pages(struct inode *inode, + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +-void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +- int mirror_num) ++void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num) + { +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- struct extent_map_tree *em_tree; ++ struct btrfs_inode *inode = bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct extent_map_tree *em_tree = &inode->extent_tree; + struct compressed_bio *cb; + unsigned int compressed_len; +- struct bio *comp_bio; +- const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; +- u64 cur_disk_byte = disk_bytenr; +- u64 file_offset; ++ u64 file_offset = bbio->file_offset; + u64 em_len; + u64 em_start; + struct extent_map *em; +@@ -533,12 +487,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int memstall = 0; + blk_status_t ret; + int ret2; +- int i; +- +- em_tree = &BTRFS_I(inode)->extent_tree; +- +- file_offset = bio_first_bvec_all(bio)->bv_offset + +- page_offset(bio_first_page_all(bio)); + + /* we need the actual starting offset of this extent in the file */ + read_lock(&em_tree->lock); +@@ -551,102 +499,54 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + + ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); + compressed_len = em->block_len; +- cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); +- if (!cb) { +- ret = BLK_STS_RESOURCE; +- goto out; +- } + +- cb->status = BLK_STS_OK; +- cb->inode = inode; ++ cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, ++ end_compressed_bio_read); + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + +- cb->len = bio->bi_iter.bi_size; ++ cb->len = bbio->bio.bi_iter.bi_size; + cb->compressed_len = compressed_len; + cb->compress_type = em->compress_type; +- cb->orig_bio = bio; ++ 
cb->orig_bbio = bbio; + + free_extent_map(em); +- em = NULL; + + cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); + if (!cb->compressed_pages) { + ret = BLK_STS_RESOURCE; +- goto fail; ++ goto out_free_bio; + } + + ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); + if (ret2) { + ret = BLK_STS_RESOURCE; +- goto fail; ++ goto out_free_compressed_pages; + } + +- add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags); ++ add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, ++ &pflags); + + /* include any pages we added in add_ra-bio_pages */ +- cb->len = bio->bi_iter.bi_size; +- +- comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), +- end_compressed_bio_read, cb); +- comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); +- +- while (cur_disk_byte < disk_bytenr + compressed_len) { +- u64 offset = cur_disk_byte - disk_bytenr; +- unsigned int index = offset >> PAGE_SHIFT; +- unsigned int real_size; +- unsigned int added; +- struct page *page = cb->compressed_pages[index]; +- +- /* +- * We have various limit on the real read size: +- * - page boundary +- * - compressed length boundary +- */ +- real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); +- real_size = min_t(u64, real_size, compressed_len - offset); +- ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); +- +- added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset)); +- /* +- * Maximum compressed extent is smaller than bio size limit, +- * thus bio_add_page() should always success. +- */ +- ASSERT(added == real_size); +- cur_disk_byte += added; +- } ++ cb->len = bbio->bio.bi_iter.bi_size; ++ cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; ++ btrfs_add_compressed_bio_pages(cb); + + if (memstall) + psi_memstall_leave(&pflags); + +- /* +- * Stash the initial offset of this chunk, as there is no direct +- * correlation between compressed pages and the original file offset. +- * The field is only used for printing error messages anyway. 
+- */ +- btrfs_bio(comp_bio)->file_offset = file_offset; +- +- ASSERT(comp_bio->bi_iter.bi_size); +- btrfs_submit_bio(comp_bio, mirror_num); ++ btrfs_submit_bio(&cb->bbio, mirror_num); + return; + +-fail: +- if (cb->compressed_pages) { +- for (i = 0; i < cb->nr_pages; i++) { +- if (cb->compressed_pages[i]) +- __free_page(cb->compressed_pages[i]); +- } +- } +- ++out_free_compressed_pages: + kfree(cb->compressed_pages); +- kfree(cb); ++out_free_bio: ++ bio_put(&cb->bbio.bio); + out: +- free_extent_map(em); +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- return; ++ btrfs_bio_end_io(bbio, ret); + } + + /* +@@ -1038,6 +938,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) + ret = compression_decompress_bio(workspace, cb); + put_workspace(type, workspace); + ++ if (!ret) ++ zero_fill_bio(&cb->orig_bbio->bio); + return ret; + } + +@@ -1062,6 +964,10 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, + + int __init btrfs_init_compress(void) + { ++ if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE, ++ offsetof(struct compressed_bio, bbio.bio), ++ BIOSET_NEED_BVECS)) ++ return -ENOMEM; + btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); + btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); + btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); +@@ -1075,6 +981,7 @@ void __cold btrfs_exit_compress(void) + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); + zstd_cleanup_workspace_manager(); ++ bioset_exit(&btrfs_compressed_bioset); + } + + /* +@@ -1110,7 +1017,7 @@ void __cold btrfs_exit_compress(void) + int btrfs_decompress_buf2page(const char *buf, u32 buf_len, + struct compressed_bio *cb, u32 decompressed) + { +- struct bio *orig_bio = cb->orig_bio; ++ struct bio *orig_bio = &cb->orig_bbio->bio; + /* Offset inside the full decompressed extent */ + u32 cur_offset; + +diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h +index a5e3377db9ad..19ab2abeddc0 100644 +--- a/fs/btrfs/compression.h ++++ b/fs/btrfs/compression.h +@@ -6,8 +6,8 @@ + #ifndef BTRFS_COMPRESSION_H + #define BTRFS_COMPRESSION_H + +-#include + #include ++#include "bio.h" + + struct btrfs_inode; + +@@ -23,6 +23,7 @@ struct btrfs_inode; + + /* Maximum length of compressed data stored on disk */ + #define BTRFS_MAX_COMPRESSED (SZ_128K) ++#define BTRFS_MAX_COMPRESSED_PAGES (BTRFS_MAX_COMPRESSED / PAGE_SIZE) + static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); + + /* Maximum size of data before compression */ +@@ -37,9 +38,6 @@ struct compressed_bio { + /* the pages with the compressed data on them */ + struct page **compressed_pages; + +- /* inode that owns this data */ +- struct inode *inode; +- + /* starting offset in the inode for our pages */ + u64 start; + +@@ -55,14 +53,14 @@ struct compressed_bio { + /* Whether this is a write for writeback. */ + bool writeback; + +- /* IO errors */ +- blk_status_t status; +- + union { + /* For reads, this is the bio we are copying the data into */ +- struct bio *orig_bio; ++ struct btrfs_bio *orig_bbio; + struct work_struct write_end_work; + }; ++ ++ /* Must be last. 
*/ ++ struct btrfs_bio bbio; + }; + + static inline unsigned int btrfs_compress_type(unsigned int type_level) +@@ -88,16 +86,14 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, + int btrfs_decompress_buf2page(const char *buf, u32 buf_len, + struct compressed_bio *cb, u32 decompressed); + +-blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, ++void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + unsigned int len, u64 disk_start, + unsigned int compressed_len, + struct page **compressed_pages, + unsigned int nr_pages, + blk_opf_t write_flags, +- struct cgroup_subsys_state *blkcg_css, + bool writeback); +-void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +- int mirror_num); ++void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num); + + unsigned int btrfs_compress_str2level(unsigned int type, const char *str); + +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index a5b6bb54545f..3c983c70028a 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -854,7 +854,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, + * Search for a key in the given extent_buffer. + * + * The lower boundary for the search is specified by the slot number @first_slot. +- * Use a value of 0 to search over the whole extent buffer. ++ * Use a value of 0 to search over the whole extent buffer. Works for both ++ * leaves and nodes. + * + * The slot in the extent buffer is returned via @slot. If the key exists in the + * extent buffer, then @slot will point to the slot where the key is, otherwise +@@ -863,8 +864,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, + * Slot may point to the total number of items (i.e. one position beyond the last + * key) if the key is bigger than the last key in the extent buffer. 
+ */ +-int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, +- const struct btrfs_key *key, int *slot) ++int btrfs_bin_search(struct extent_buffer *eb, int first_slot, ++ const struct btrfs_key *key, int *slot) + { + unsigned long p; + int item_size; +@@ -959,7 +960,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, + if (slot < 0 || slot >= btrfs_header_nritems(parent)) + return ERR_PTR(-ENOENT); + +- BUG_ON(level == 0); ++ ASSERT(level); + + check.level = level - 1; + check.transid = btrfs_node_ptr_generation(parent, slot); +@@ -1064,11 +1065,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, + BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) + return 0; + +- left = btrfs_read_node_slot(parent, pslot - 1); +- if (IS_ERR(left)) +- left = NULL; ++ if (pslot) { ++ left = btrfs_read_node_slot(parent, pslot - 1); ++ if (IS_ERR(left)) { ++ ret = PTR_ERR(left); ++ left = NULL; ++ goto enospc; ++ } + +- if (left) { + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left, +@@ -1079,11 +1083,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, + } + } + +- right = btrfs_read_node_slot(parent, pslot + 1); +- if (IS_ERR(right)) +- right = NULL; ++ if (pslot + 1 < btrfs_header_nritems(parent)) { ++ right = btrfs_read_node_slot(parent, pslot + 1); ++ if (IS_ERR(right)) { ++ ret = PTR_ERR(right); ++ right = NULL; ++ goto enospc; ++ } + +- if (right) { + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right, +@@ -1240,14 +1247,14 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + if (!parent) + return 1; + +- left = btrfs_read_node_slot(parent, pslot - 1); +- if (IS_ERR(left)) +- left = NULL; +- + /* first, try to make some room in the middle buffer */ +- if (left) { ++ if (pslot) { + u32 left_nr; + ++ left = btrfs_read_node_slot(parent, pslot - 1); ++ if (IS_ERR(left)) ++ return PTR_ERR(left); ++ + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + + left_nr = btrfs_header_nritems(left); +@@ -1292,16 +1299,17 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + btrfs_tree_unlock(left); + free_extent_buffer(left); + } +- right = btrfs_read_node_slot(parent, pslot + 1); +- if (IS_ERR(right)) +- right = NULL; + + /* + * then try to empty the right most buffer into the middle + */ +- if (right) { ++ if (pslot + 1 < btrfs_header_nritems(parent)) { + u32 right_nr; + ++ right = btrfs_read_node_slot(parent, pslot + 1); ++ if (IS_ERR(right)) ++ return PTR_ERR(right); ++ + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + + right_nr = btrfs_header_nritems(right); +@@ -1864,7 +1872,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, + return 0; + } + +- return btrfs_generic_bin_search(eb, search_low_slot, key, slot); ++ return btrfs_bin_search(eb, search_low_slot, key, slot); + } + + static int search_leaf(struct btrfs_trans_handle *trans, +@@ -2321,7 +2329,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, + */ + btrfs_unlock_up_safe(p, level + 1); + +- ret = btrfs_bin_search(b, key, &slot); ++ ret = btrfs_bin_search(b, 0, key, &slot); + if (ret < 0) + goto done; + +@@ -2482,26 +2490,15 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, + int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) + { +- while (1) { ++ if 
(path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + int ret; +- const int slot = path->slots[0]; +- const struct extent_buffer *leaf = path->nodes[0]; + +- /* This is where we start walking the path. */ +- if (slot >= btrfs_header_nritems(leaf)) { +- /* +- * If we've reached the last slot in this leaf we need +- * to go to the next leaf and reset the path. +- */ +- ret = btrfs_next_leaf(root, path); +- if (ret) +- return ret; +- continue; +- } +- /* Store the found, valid item in @key. */ +- btrfs_item_key_to_cpu(leaf, key, slot); +- break; ++ ret = btrfs_next_leaf(root, path); ++ if (ret) ++ return ret; + } ++ ++ btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); + return 0; + } + +@@ -3198,12 +3195,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + btrfs_assert_tree_write_locked(path->nodes[1]); + + right = btrfs_read_node_slot(upper, slot + 1); +- /* +- * slot + 1 is not valid or we fail to read the right node, +- * no big deal, just return. +- */ + if (IS_ERR(right)) +- return 1; ++ return PTR_ERR(right); + + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + +@@ -3417,12 +3410,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + btrfs_assert_tree_write_locked(path->nodes[1]); + + left = btrfs_read_node_slot(path->nodes[1], slot - 1); +- /* +- * slot - 1 is not valid or we fail to read the left node, +- * no big deal, just return. +- */ + if (IS_ERR(left)) +- return 1; ++ return PTR_ERR(left); + + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + +@@ -4576,7 +4565,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + while (1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); +- sret = btrfs_bin_search(cur, min_key, &slot); ++ sret = btrfs_bin_search(cur, 0, min_key, &slot); + if (sret < 0) { + ret = sret; + goto out; +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 97897107fab5..4c1986cd5bed 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -508,22 +508,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); + int __init btrfs_ctree_init(void); + void __cold btrfs_ctree_exit(void); + +-int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, +- const struct btrfs_key *key, int *slot); ++int btrfs_bin_search(struct extent_buffer *eb, int first_slot, ++ const struct btrfs_key *key, int *slot); + +-/* +- * Simple binary search on an extent buffer. Works for both leaves and nodes, and +- * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). +- */ +-static inline int btrfs_bin_search(struct extent_buffer *eb, +- const struct btrfs_key *key, +- int *slot) +-{ +- return btrfs_generic_bin_search(eb, 0, key, slot); +-} +- +-int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, +- int *slot); + int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); + int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, +diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c +index 7ddb1d104e8e..427abaf608b8 100644 +--- a/fs/btrfs/delalloc-space.c ++++ b/fs/btrfs/delalloc-space.c +@@ -358,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + * racing with an ordered completion or some such that would think it + * needs to free the reservation we just made. 
+ */ +- spin_lock(&inode->lock); + nr_extents = count_max_extents(fs_info, num_bytes); ++ spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, nr_extents); + inode->csum_bytes += disk_num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); +diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c +index 886ffb232eac..0b32432d7d56 100644 +--- a/fs/btrfs/delayed-ref.c ++++ b/fs/btrfs/delayed-ref.c +@@ -53,24 +53,6 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) + return ret; + } + +-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) +-{ +- u64 num_entries = +- atomic_read(&trans->transaction->delayed_refs.num_entries); +- u64 avg_runtime; +- u64 val; +- +- smp_mb(); +- avg_runtime = trans->fs_info->avg_delayed_ref_runtime; +- val = num_entries * avg_runtime; +- if (val >= NSEC_PER_SEC) +- return 1; +- if (val >= NSEC_PER_SEC / 2) +- return 2; +- +- return btrfs_check_space_for_delayed_refs(trans->fs_info); +-} +- + /* + * Release a ref head's reservation. + * +@@ -83,20 +65,9 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) + void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) + { + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; +- u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); ++ const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr); + u64 released = 0; + +- /* +- * We have to check the mount option here because we could be enabling +- * the free space tree for the first time and don't have the compat_ro +- * option set yet. +- * +- * We need extra reservations if we have the free space tree because +- * we'll have to modify that tree as well. +- */ +- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) +- num_bytes *= 2; +- + released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", +@@ -118,18 +89,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) + if (!trans->delayed_ref_updates) + return; + +- num_bytes = btrfs_calc_insert_metadata_size(fs_info, +- trans->delayed_ref_updates); +- /* +- * We have to check the mount option here because we could be enabling +- * the free space tree for the first time and don't have the compat_ro +- * option set yet. +- * +- * We need extra reservations if we have the free space tree because +- * we'll have to modify that tree as well. 
+- */ +- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) +- num_bytes *= 2; ++ num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, ++ trans->delayed_ref_updates); + + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; +@@ -200,7 +161,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) + { + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; +- u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1); ++ u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); + u64 num_bytes = 0; + int ret = -ENOSPC; + +@@ -217,7 +178,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + if (ret) + return ret; +- btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); ++ btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + return 0; +diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h +index 2eb34abf700f..b54261fe509b 100644 +--- a/fs/btrfs/delayed-ref.h ++++ b/fs/btrfs/delayed-ref.h +@@ -253,6 +253,27 @@ extern struct kmem_cache *btrfs_delayed_extent_op_cachep; + int __init btrfs_delayed_ref_init(void); + void __cold btrfs_delayed_ref_exit(void); + ++static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info, ++ int num_delayed_refs) ++{ ++ u64 num_bytes; ++ ++ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_delayed_refs); ++ ++ /* ++ * We have to check the mount option here because we could be enabling ++ * the free space tree for the first time and don't have the compat_ro ++ * option set yet. ++ * ++ * We need extra reservations if we have the free space tree because ++ * we'll have to modify that tree as well. ++ */ ++ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) ++ num_bytes *= 2; ++ ++ return num_bytes; ++} ++ + static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, + int action, u64 bytenr, u64 len, u64 parent) + { +@@ -385,7 +406,6 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes); +-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); + bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); + + /* +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 9e1596bb208d..59ea049fe7ee 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1341,17 +1341,8 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) + { + int ret; +- unsigned int nofs_flag; + +- /* +- * We might be called under a transaction (e.g. 
indirect backref +- * resolution) which could deadlock if it triggers memory reclaim +- */ +- nofs_flag = memalloc_nofs_save(); +- ret = btrfs_drew_lock_init(&root->snapshot_lock); +- memalloc_nofs_restore(nofs_flag); +- if (ret) +- goto fail; ++ btrfs_drew_lock_init(&root->snapshot_lock); + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + !btrfs_is_data_reloc_root(root)) { +@@ -2065,7 +2056,6 @@ void btrfs_put_root(struct btrfs_root *root) + WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); + if (root->anon_dev) + free_anon_bdev(root->anon_dev); +- btrfs_drew_lock_destroy(&root->snapshot_lock); + free_root_extent_buffers(root); + #ifdef CONFIG_BTRFS_DEBUG + spin_lock(&root->fs_info->fs_roots_radix_lock); +@@ -2125,11 +2115,16 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) + atomic_set(&fs_info->reloc_cancel_req, 0); + } + +-static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) ++static int btrfs_init_btree_inode(struct super_block *sb) + { +- struct inode *inode = fs_info->btree_inode; ++ struct btrfs_fs_info *fs_info = btrfs_sb(sb); + unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID, + fs_info->tree_root); ++ struct inode *inode; ++ ++ inode = new_inode(sb); ++ if (!inode) ++ return -ENOMEM; + + inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + set_nlink(inode, 1); +@@ -2140,6 +2135,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) + */ + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &btree_aops; ++ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, +@@ -2152,6 +2148,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) + BTRFS_I(inode)->location.offset = 0; + set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + __insert_inode_hash(inode, hash); ++ fs_info->btree_inode = inode; ++ ++ return 0; + } + + static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) +@@ -2966,7 +2965,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) + atomic64_set(&fs_info->free_chunk_space, 0); + fs_info->tree_mod_log = RB_ROOT; + fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; +- fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ + btrfs_init_ref_verify(fs_info); + + fs_info->thread_pool_size = min_t(unsigned long, +@@ -3344,14 +3342,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + int ret; +- int err = -EINVAL; + int level; + + ret = init_mount_fs_info(fs_info, sb); +- if (ret) { +- err = ret; ++ if (ret) + goto fail; +- } + + /* These need to be init'ed before we start creating inodes and such. 
*/ + tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, +@@ -3361,17 +3356,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + GFP_KERNEL); + fs_info->chunk_root = chunk_root; + if (!tree_root || !chunk_root) { +- err = -ENOMEM; ++ ret = -ENOMEM; + goto fail; + } + +- fs_info->btree_inode = new_inode(sb); +- if (!fs_info->btree_inode) { +- err = -ENOMEM; ++ ret = btrfs_init_btree_inode(sb); ++ if (ret) + goto fail; +- } +- mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); +- btrfs_init_btree_inode(fs_info); + + invalidate_bdev(fs_devices->latest_dev->bdev); + +@@ -3380,7 +3371,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + */ + disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); + if (IS_ERR(disk_super)) { +- err = PTR_ERR(disk_super); ++ ret = PTR_ERR(disk_super); + goto fail_alloc; + } + +@@ -3392,7 +3383,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + if (!btrfs_supported_super_csum(csum_type)) { + btrfs_err(fs_info, "unsupported checksum algorithm: %u", + csum_type); +- err = -EINVAL; ++ ret = -EINVAL; + btrfs_release_disk_super(disk_super); + goto fail_alloc; + } +@@ -3401,7 +3392,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + + ret = btrfs_init_csum_hash(fs_info, csum_type); + if (ret) { +- err = ret; + btrfs_release_disk_super(disk_super); + goto fail_alloc; + } +@@ -3412,7 +3402,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + */ + if (btrfs_check_super_csum(fs_info, disk_super)) { + btrfs_err(fs_info, "superblock checksum mismatch"); +- err = -EINVAL; ++ ret = -EINVAL; + btrfs_release_disk_super(disk_super); + goto fail_alloc; + } +@@ -3442,12 +3432,15 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + ret = btrfs_validate_mount_super(fs_info); + if (ret) { + btrfs_err(fs_info, "superblock contains fatal errors"); +- err = -EINVAL; ++ ret = -EINVAL; + goto fail_alloc; + } + +- if (!btrfs_super_root(disk_super)) ++ if (!btrfs_super_root(disk_super)) { ++ btrfs_err(fs_info, "invalid superblock tree root bytenr"); ++ ret = -EINVAL; + goto fail_alloc; ++ } + + /* check FS state, whether FS is broken. 
*/ + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) +@@ -3474,16 +3467,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + fs_info->stripesize = stripesize; + + ret = btrfs_parse_options(fs_info, options, sb->s_flags); +- if (ret) { +- err = ret; ++ if (ret) + goto fail_alloc; +- } + + ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); +- if (ret < 0) { +- err = ret; ++ if (ret < 0) + goto fail_alloc; +- } + + if (sectorsize < PAGE_SIZE) { + struct btrfs_subpage_info *subpage_info; +@@ -3503,17 +3492,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + "read-write for sector size %u with page size %lu is experimental", + sectorsize, PAGE_SIZE); + subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); +- if (!subpage_info) ++ if (!subpage_info) { ++ ret = -ENOMEM; + goto fail_alloc; ++ } + btrfs_init_subpage_info(subpage_info, sectorsize); + fs_info->subpage_info = subpage_info; + } + + ret = btrfs_init_workqueues(fs_info); +- if (ret) { +- err = ret; ++ if (ret) + goto fail_sb_buffer; +- } + + sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); + sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); +@@ -3559,6 +3548,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + btrfs_free_extra_devids(fs_devices); + if (!fs_devices->latest_dev->bdev) { + btrfs_err(fs_info, "failed to read devices"); ++ ret = -EIO; + goto fail_tree_roots; + } + +@@ -3574,8 +3564,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + ret = btrfs_get_dev_zone_info_all_devices(fs_info); + if (ret) { + btrfs_err(fs_info, +- "zoned: failed to read device zone info: %d", +- ret); ++ "zoned: failed to read device zone info: %d", ret); + goto fail_block_groups; + } + +@@ -3654,19 +3643,24 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + !btrfs_check_rw_degradable(fs_info, NULL)) { + btrfs_warn(fs_info, + "writable mount is not allowed due to too many missing devices"); ++ ret = -EINVAL; + goto fail_sysfs; + } + + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, + "btrfs-cleaner"); +- if (IS_ERR(fs_info->cleaner_kthread)) ++ if (IS_ERR(fs_info->cleaner_kthread)) { ++ ret = PTR_ERR(fs_info->cleaner_kthread); + goto fail_sysfs; ++ } + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); +- if (IS_ERR(fs_info->transaction_kthread)) ++ if (IS_ERR(fs_info->transaction_kthread)) { ++ ret = PTR_ERR(fs_info->transaction_kthread); + goto fail_cleaner; ++ } + + if (!btrfs_test_opt(fs_info, NOSSD) && + !fs_info->fs_devices->rotating) { +@@ -3684,7 +3678,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + fs_info->fs_devices->discardable) { + btrfs_set_and_info(fs_info, DISCARD_ASYNC, + "auto enabling async discard"); +- btrfs_clear_opt(fs_info->mount_opt, NODISCARD); + } + + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +@@ -3711,16 +3704,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_info(fs_info, "start tree-log replay"); + ret = btrfs_replay_log(fs_info, fs_devices); +- if (ret) { +- err = ret; ++ if (ret) + goto fail_qgroup; +- } + } + + fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); + if (IS_ERR(fs_info->fs_root)) { +- err = PTR_ERR(fs_info->fs_root); +- btrfs_warn(fs_info, "failed to read fs tree: %d", 
err); ++ ret = PTR_ERR(fs_info->fs_root); ++ btrfs_warn(fs_info, "failed to read fs tree: %d", ret); + fs_info->fs_root = NULL; + goto fail_qgroup; + } +@@ -3797,7 +3788,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + iput(fs_info->btree_inode); + fail: + btrfs_close_devices(fs_info->fs_devices); +- return err; ++ ASSERT(ret < 0); ++ return ret; + } + ALLOW_ERROR_INJECTION(open_ctree, ERRNO); + +@@ -4094,6 +4086,8 @@ static void write_dev_flush(struct btrfs_device *device) + { + struct bio *bio = &device->flush_bio; + ++ device->last_flush_error = BLK_STS_OK; ++ + #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY + /* + * When a disk has write caching disabled, we skip submission of a bio +@@ -4122,25 +4116,24 @@ static void write_dev_flush(struct btrfs_device *device) + + /* + * If the flush bio has been submitted by write_dev_flush, wait for it. ++ * Return true for any error, and false otherwise. + */ +-static blk_status_t wait_dev_flush(struct btrfs_device *device) ++static bool wait_dev_flush(struct btrfs_device *device) + { + struct bio *bio = &device->flush_bio; + +- if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) +- return BLK_STS_OK; ++ if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) ++ return false; + +- clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); + wait_for_completion_io(&device->flush_wait); + +- return bio->bi_status; +-} ++ if (bio->bi_status) { ++ device->last_flush_error = bio->bi_status; ++ btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); ++ return true; ++ } + +-static int check_barrier_error(struct btrfs_fs_info *fs_info) +-{ +- if (!btrfs_check_rw_degradable(fs_info, NULL)) +- return -EIO; +- return 0; ++ return false; + } + + /* +@@ -4152,7 +4145,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info) + struct list_head *head; + struct btrfs_device *dev; + int errors_wait = 0; +- blk_status_t ret; + + lockdep_assert_held(&info->fs_devices->device_list_mutex); + /* send down all the barriers */ +@@ -4167,7 +4159,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info) + continue; + + write_dev_flush(dev); +- dev->last_flush_error = BLK_STS_OK; + } + + /* wait for all the barriers */ +@@ -4182,23 +4173,17 @@ static int barrier_all_devices(struct btrfs_fs_info *info) + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) + continue; + +- ret = wait_dev_flush(dev); +- if (ret) { +- dev->last_flush_error = ret; +- btrfs_dev_stat_inc_and_print(dev, +- BTRFS_DEV_STAT_FLUSH_ERRS); ++ if (wait_dev_flush(dev)) + errors_wait++; +- } + } + +- if (errors_wait) { +- /* +- * At some point we need the status of all disks +- * to arrive at the volume status. So error checking +- * is being pushed to a separate loop. +- */ +- return check_barrier_error(info); +- } ++ /* ++ * Checks last_flush_error of disks in order to determine the device ++ * state. 
++ */ ++ if (errors_wait && !btrfs_check_rw_degradable(info, NULL)) ++ return -EIO; ++ + return 0; + } + +@@ -4404,12 +4389,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); + if (err) +- break; ++ goto out; + btrfs_put_root(gang[i]); + } + root_objectid++; + } +- ++out: + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 824c657f59e8..5cd289de4e92 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -1894,8 +1894,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( + } + + static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, +- struct btrfs_delayed_ref_head *locked_ref, +- unsigned long *run_refs) ++ struct btrfs_delayed_ref_head *locked_ref) + { + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs; +@@ -1917,7 +1916,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, + return -EAGAIN; + } + +- (*run_refs)++; + ref->in_tree = 0; + rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); +@@ -1981,10 +1979,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *locked_ref = NULL; +- ktime_t start = ktime_get(); + int ret; + unsigned long count = 0; +- unsigned long actual_count = 0; + + delayed_refs = &trans->transaction->delayed_refs; + do { +@@ -2014,8 +2010,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); + +- ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, +- &actual_count); ++ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); + if (ret < 0 && ret != -EAGAIN) { + /* + * Error, btrfs_run_delayed_refs_for_head already +@@ -2046,24 +2041,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + cond_resched(); + } while ((nr != -1 && count < nr) || locked_ref); + +- /* +- * We don't want to include ref heads since we can have empty ref heads +- * and those will drastically skew our runtime down since we just do +- * accounting, no actual extent tree updates. +- */ +- if (actual_count > 0) { +- u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); +- u64 avg; +- +- /* +- * We weigh the current average higher than our current runtime +- * to avoid large swings in the average. +- */ +- spin_lock(&delayed_refs->lock); +- avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; +- fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ +- spin_unlock(&delayed_refs->lock); +- } + return 0; + } + +@@ -5509,11 +5486,11 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + { + int level = wc->level; + int lookup_info = 1; +- int ret; ++ int ret = 0; + + while (level >= 0) { + ret = walk_down_proc(trans, root, path, wc, lookup_info); +- if (ret > 0) ++ if (ret) + break; + + if (level == 0) +@@ -5528,10 +5505,10 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + path->slots[level]++; + continue; + } else if (ret < 0) +- return ret; ++ break; + level = wc->level; + } +- return 0; ++ return (ret == 1) ? 
0 : ret; + } + + static noinline int walk_up_tree(struct btrfs_trans_handle *trans, +@@ -5708,12 +5685,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) + + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) { ++ btrfs_abort_transaction(trans, ret); + err = ret; + break; + } + + ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); + if (ret < 0) { ++ btrfs_abort_transaction(trans, ret); + err = ret; + break; + } +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index 40300e8e5f99..a1adadd5d25d 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -97,11 +97,13 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) + * how many bytes are there before stripe/ordered extent boundary. + */ + struct btrfs_bio_ctrl { +- struct bio *bio; ++ struct btrfs_bio *bbio; + int mirror_num; + enum btrfs_compression_type compress_type; + u32 len_to_oe_boundary; ++ blk_opf_t opf; + btrfs_bio_end_io_t end_io_func; ++ struct writeback_control *wbc; + + /* + * This is for metadata read, to provide the extra needed verification +@@ -117,51 +119,41 @@ struct btrfs_bio_ctrl { + * does the unlocking. + */ + bool extent_locked; +- +- /* Tell the submit_bio code to use REQ_SYNC */ +- bool sync_io; + }; + + static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) + { +- struct bio *bio; +- struct bio_vec *bv; +- struct inode *inode; +- int mirror_num; ++ struct btrfs_bio *bbio = bio_ctrl->bbio; ++ int mirror_num = bio_ctrl->mirror_num; + +- if (!bio_ctrl->bio) ++ if (!bbio) + return; + +- bio = bio_ctrl->bio; +- bv = bio_first_bvec_all(bio); +- inode = bv->bv_page->mapping->host; +- mirror_num = bio_ctrl->mirror_num; +- + /* Caller should ensure the bio has at least some range added */ +- ASSERT(bio->bi_iter.bi_size); ++ ASSERT(bbio->bio.bi_iter.bi_size); + +- if (!is_data_inode(inode)) { +- if (btrfs_op(bio) != BTRFS_MAP_WRITE) { ++ if (!is_data_inode(&bbio->inode->vfs_inode)) { ++ if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) { + /* + * For metadata read, we should have the parent_check, + * and copy it to bbio for metadata verification. 
+ */ + ASSERT(bio_ctrl->parent_check); +- memcpy(&btrfs_bio(bio)->parent_check, ++ memcpy(&bbio->parent_check, + bio_ctrl->parent_check, + sizeof(struct btrfs_tree_parent_check)); + } +- bio->bi_opf |= REQ_META; ++ bbio->bio.bi_opf |= REQ_META; + } + +- if (btrfs_op(bio) == BTRFS_MAP_READ && ++ if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && + bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) +- btrfs_submit_compressed_read(inode, bio, mirror_num); ++ btrfs_submit_compressed_read(bbio, mirror_num); + else +- btrfs_submit_bio(bio, mirror_num); ++ btrfs_submit_bio(bbio, mirror_num); + +- /* The bio is owned by the end_io handler now */ +- bio_ctrl->bio = NULL; ++ /* The bbio is owned by the end_io handler now */ ++ bio_ctrl->bbio = NULL; + } + + /* +@@ -169,16 +161,16 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) + */ + static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) + { +- struct bio *bio = bio_ctrl->bio; ++ struct btrfs_bio *bbio = bio_ctrl->bbio; + +- if (!bio) ++ if (!bbio) + return; + + if (ret) { + ASSERT(ret < 0); +- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); ++ btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); + /* The bio is owned by the end_io handler now */ +- bio_ctrl->bio = NULL; ++ bio_ctrl->bbio = NULL; + } else { + submit_one_bio(bio_ctrl); + } +@@ -867,89 +859,52 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) + return 0; + } + +-/* +- * Attempt to add a page to bio. +- * +- * @bio_ctrl: record both the bio, and its bio_flags +- * @page: page to add to the bio +- * @disk_bytenr: offset of the new bio or to check whether we are adding +- * a contiguous page to the previous one +- * @size: portion of page that we want to write +- * @pg_offset: starting offset in the page +- * @compress_type: compression type of the current bio to see if we can merge them +- * +- * Attempt to add a page to bio considering stripe alignment etc. +- * +- * Return >= 0 for the number of bytes added to the bio. +- * Can return 0 if the current bio is already at stripe/zone boundary. +- * Return <0 for error. +- */ +-static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, +- struct page *page, +- u64 disk_bytenr, unsigned int size, +- unsigned int pg_offset, +- enum btrfs_compression_type compress_type) ++static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, ++ struct page *page, u64 disk_bytenr, ++ unsigned int pg_offset) + { +- struct bio *bio = bio_ctrl->bio; +- u32 bio_size = bio->bi_iter.bi_size; +- u32 real_size; ++ struct bio *bio = &bio_ctrl->bbio->bio; ++ struct bio_vec *bvec = bio_last_bvec_all(bio); + const sector_t sector = disk_bytenr >> SECTOR_SHIFT; +- bool contig = false; + +- ASSERT(bio); +- /* The limit should be calculated when bio_ctrl->bio is allocated */ +- ASSERT(bio_ctrl->len_to_oe_boundary); +- if (bio_ctrl->compress_type != compress_type) +- return 0; +- +- +- if (bio->bi_iter.bi_size == 0) { +- /* We can always add a page into an empty bio. */ +- contig = true; +- } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) { +- struct bio_vec *bvec = bio_last_bvec_all(bio); +- +- /* +- * The contig check requires the following conditions to be met: +- * 1) The pages are belonging to the same inode +- * This is implied by the call chain. +- * +- * 2) The range has adjacent logical bytenr +- * +- * 3) The range has adjacent file offset +- * This is required for the usage of btrfs_bio->file_offset. 
+- */ +- if (bio_end_sector(bio) == sector && +- page_offset(bvec->bv_page) + bvec->bv_offset + +- bvec->bv_len == page_offset(page) + pg_offset) +- contig = true; +- } else { ++ if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { + /* +- * For compression, all IO should have its logical bytenr +- * set to the starting bytenr of the compressed extent. ++ * For compression, all IO should have its logical bytenr set ++ * to the starting bytenr of the compressed extent. + */ +- contig = bio->bi_iter.bi_sector == sector; ++ return bio->bi_iter.bi_sector == sector; + } + +- if (!contig) +- return 0; +- +- real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); +- + /* +- * If real_size is 0, never call bio_add_*_page(), as even size is 0, +- * bio will still execute its endio function on the page! ++ * The contig check requires the following conditions to be met: ++ * ++ * 1) The pages are belonging to the same inode ++ * This is implied by the call chain. ++ * ++ * 2) The range has adjacent logical bytenr ++ * ++ * 3) The range has adjacent file offset ++ * This is required for the usage of btrfs_bio->file_offset. + */ +- if (real_size == 0) +- return 0; +- +- return bio_add_page(bio, page, real_size, pg_offset); ++ return bio_end_sector(bio) == sector && ++ page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len == ++ page_offset(page) + pg_offset; + } + +-static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, +- struct btrfs_inode *inode, u64 file_offset) ++static void alloc_new_bio(struct btrfs_inode *inode, ++ struct btrfs_bio_ctrl *bio_ctrl, ++ u64 disk_bytenr, u64 file_offset) + { +- struct btrfs_ordered_extent *ordered; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct btrfs_bio *bbio; ++ ++ bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, ++ bio_ctrl->end_io_func, NULL); ++ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; ++ bbio->inode = inode; ++ bbio->file_offset = file_offset; ++ bio_ctrl->bbio = bbio; ++ bio_ctrl->len_to_oe_boundary = U32_MAX; + + /* + * Limit the extent to the ordered boundary for Zone Append. +@@ -957,132 +912,89 @@ static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, + * them. + */ + if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && +- btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { ++ btrfs_use_zone_append(bbio)) { ++ struct btrfs_ordered_extent *ordered; ++ + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (ordered) { + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, + ordered->file_offset + + ordered->disk_num_bytes - file_offset); + btrfs_put_ordered_extent(ordered); +- return; + } + } + +- bio_ctrl->len_to_oe_boundary = U32_MAX; +-} +- +-static void alloc_new_bio(struct btrfs_inode *inode, +- struct btrfs_bio_ctrl *bio_ctrl, +- struct writeback_control *wbc, blk_opf_t opf, +- u64 disk_bytenr, u32 offset, u64 file_offset, +- enum btrfs_compression_type compress_type) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct bio *bio; +- +- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, +- NULL); +- /* +- * For compressed page range, its disk_bytenr is always @disk_bytenr +- * passed in, no matter if we have added any range into previous bio. 
+- */ +- if (compress_type != BTRFS_COMPRESS_NONE) +- bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; +- else +- bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; +- btrfs_bio(bio)->file_offset = file_offset; +- bio_ctrl->bio = bio; +- bio_ctrl->compress_type = compress_type; +- calc_bio_boundaries(bio_ctrl, inode, file_offset); +- +- if (wbc) { ++ if (bio_ctrl->wbc) { + /* + * Pick the last added device to support cgroup writeback. For + * multi-device file systems this means blk-cgroup policies have + * to always be set on the last added/replaced device. + * This is a bit odd but has been like that for a long time. + */ +- bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); +- wbc_init_bio(wbc, bio); ++ bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); ++ wbc_init_bio(bio_ctrl->wbc, &bbio->bio); + } + } + + /* +- * @opf: bio REQ_OP_* and REQ_* flags as one value +- * @wbc: optional writeback control for io accounting + * @disk_bytenr: logical bytenr where the write will be + * @page: page to add to the bio + * @size: portion of page that we want to write to + * @pg_offset: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one +- * @compress_type: compress type for current bio + * +- * The will either add the page into the existing @bio_ctrl->bio, or allocate a +- * new one in @bio_ctrl->bio. ++ * The will either add the page into the existing @bio_ctrl->bbio, or allocate a ++ * new one in @bio_ctrl->bbio. + * The mirror number for this IO should already be initizlied in + * @bio_ctrl->mirror_num. + */ +-static int submit_extent_page(blk_opf_t opf, +- struct writeback_control *wbc, +- struct btrfs_bio_ctrl *bio_ctrl, +- u64 disk_bytenr, struct page *page, +- size_t size, unsigned long pg_offset, +- enum btrfs_compression_type compress_type, +- bool force_bio_submit) ++static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, ++ u64 disk_bytenr, struct page *page, ++ size_t size, unsigned long pg_offset) + { + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); +- unsigned int cur = pg_offset; +- +- ASSERT(bio_ctrl); +- +- ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && +- pg_offset + size <= PAGE_SIZE); + ++ ASSERT(pg_offset + size <= PAGE_SIZE); + ASSERT(bio_ctrl->end_io_func); + +- if (force_bio_submit) ++ if (bio_ctrl->bbio && ++ !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset)) + submit_one_bio(bio_ctrl); + +- while (cur < pg_offset + size) { +- u32 offset = cur - pg_offset; +- int added; ++ do { ++ u32 len = size; + + /* Allocate new bio if needed */ +- if (!bio_ctrl->bio) { +- alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, +- offset, page_offset(page) + cur, +- compress_type); ++ if (!bio_ctrl->bbio) { ++ alloc_new_bio(inode, bio_ctrl, disk_bytenr, ++ page_offset(page) + pg_offset); + } +- /* +- * We must go through btrfs_bio_add_page() to ensure each +- * page range won't cross various boundaries. 
+- */ +- if (compress_type != BTRFS_COMPRESS_NONE) +- added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, +- size - offset, pg_offset + offset, +- compress_type); +- else +- added = btrfs_bio_add_page(bio_ctrl, page, +- disk_bytenr + offset, size - offset, +- pg_offset + offset, compress_type); +- +- /* Metadata page range should never be split */ +- if (!is_data_inode(&inode->vfs_inode)) +- ASSERT(added == 0 || added == size - offset); +- +- /* At least we added some page, update the account */ +- if (wbc && added) +- wbc_account_cgroup_owner(wbc, page, added); +- +- /* We have reached boundary, submit right now */ +- if (added < size - offset) { +- /* The bio should contain some page(s) */ +- ASSERT(bio_ctrl->bio->bi_iter.bi_size); ++ ++ /* Cap to the current ordered extent boundary if there is one. */ ++ if (len > bio_ctrl->len_to_oe_boundary) { ++ ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE); ++ ASSERT(is_data_inode(&inode->vfs_inode)); ++ len = bio_ctrl->len_to_oe_boundary; ++ } ++ ++ if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) { ++ /* bio full: move on to a new one */ + submit_one_bio(bio_ctrl); ++ continue; + } +- cur += added; +- } +- return 0; ++ ++ if (bio_ctrl->wbc) ++ wbc_account_cgroup_owner(bio_ctrl->wbc, page, len); ++ ++ size -= len; ++ pg_offset += len; ++ disk_bytenr += len; ++ bio_ctrl->len_to_oe_boundary -= len; ++ ++ /* Ordered extent boundary: move on to a new bio. */ ++ if (bio_ctrl->len_to_oe_boundary == 0) ++ submit_one_bio(bio_ctrl); ++ } while (size); + } + + static int attach_extent_buffer_page(struct extent_buffer *eb, +@@ -1193,8 +1105,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, + * return 0 on success, otherwise return error + */ + static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +- struct btrfs_bio_ctrl *bio_ctrl, +- blk_opf_t read_flags, u64 *prev_em_start) ++ struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) + { + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +@@ -1216,7 +1127,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + unlock_extent(tree, start, end, NULL); + btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); + unlock_page(page); +- goto out; ++ return ret; + } + + if (page->index == last_byte >> PAGE_SHIFT) { +@@ -1230,7 +1141,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + bio_ctrl->end_io_func = end_bio_extent_readpage; + begin_page_read(fs_info, page); + while (cur <= end) { +- unsigned long this_bio_flag = 0; ++ enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; + bool force_bio_submit = false; + u64 disk_bytenr; + +@@ -1247,19 +1158,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + if (IS_ERR(em)) { + unlock_extent(tree, cur, end, NULL); + end_page_read(page, false, cur, end + 1 - cur); +- ret = PTR_ERR(em); +- break; ++ return PTR_ERR(em); + } + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) +- this_bio_flag = em->compress_type; ++ compress_type = em->compress_type; + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = ALIGN(iosize, blocksize); +- if (this_bio_flag != BTRFS_COMPRESS_NONE) ++ if (compress_type != BTRFS_COMPRESS_NONE) + disk_bytenr = em->block_start; + else + disk_bytenr = em->block_start + extent_offset; +@@ -1331,24 +1241,20 @@ 
static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + continue; + } + +- ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, +- bio_ctrl, disk_bytenr, page, iosize, +- pg_offset, this_bio_flag, +- force_bio_submit); +- if (ret) { +- /* +- * We have to unlock the remaining range, or the page +- * will never be unlocked. +- */ +- unlock_extent(tree, cur, end, NULL); +- end_page_read(page, false, cur, end + 1 - cur); +- goto out; ++ if (bio_ctrl->compress_type != compress_type) { ++ submit_one_bio(bio_ctrl); ++ bio_ctrl->compress_type = compress_type; + } ++ ++ if (force_bio_submit) ++ submit_one_bio(bio_ctrl); ++ submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, ++ pg_offset); + cur = cur + iosize; + pg_offset += iosize; + } +-out: +- return ret; ++ ++ return 0; + } + + int btrfs_read_folio(struct file *file, struct folio *folio) +@@ -1357,12 +1263,12 @@ int btrfs_read_folio(struct file *file, struct folio *folio) + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; +- struct btrfs_bio_ctrl bio_ctrl = { 0 }; ++ struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + +- ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); ++ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL); + /* + * If btrfs_do_readpage() failed we will want to submit the assembled + * bio to do the cleanup. +@@ -1384,7 +1290,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, + + for (index = 0; index < nr_pages; index++) { + btrfs_do_readpage(pages[index], em_cached, bio_ctrl, +- REQ_RAHEAD, prev_em_start); ++ prev_em_start); + put_page(pages[index]); + } + } +@@ -1520,7 +1426,6 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, + */ + static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + struct page *page, +- struct writeback_control *wbc, + struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size, + int *nr_ret) +@@ -1531,18 +1436,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + u64 extent_offset; + u64 block_start; + struct extent_map *em; +- int saved_ret = 0; + int ret = 0; + int nr = 0; +- enum req_op op = REQ_OP_WRITE; +- const blk_opf_t write_flags = wbc_to_write_flags(wbc); +- bool has_error = false; + bool compressed; + + ret = btrfs_writepage_cow_fixup(page); + if (ret) { + /* Fixup worker will requeue */ +- redirty_page_for_writepage(wbc, page); ++ redirty_page_for_writepage(bio_ctrl->wbc, page); + unlock_page(page); + return 1; + } +@@ -1551,7 +1452,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ +- wbc->nr_to_write--; ++ bio_ctrl->wbc->nr_to_write--; + + bio_ctrl->end_io_func = end_bio_extent_writepage; + while (cur <= end) { +@@ -1587,10 +1488,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + if (IS_ERR(em)) { + btrfs_page_set_error(fs_info, page, cur, end - cur + 1); + ret = PTR_ERR_OR_ZERO(em); +- has_error = true; +- if (!saved_ret) +- saved_ret = ret; +- break; ++ goto out_error; + } + + extent_offset = cur - em->start; +@@ -1642,33 +1540,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + */ + btrfs_page_clear_dirty(fs_info, page, cur, iosize); + +- ret = submit_extent_page(op | 
write_flags, wbc, +- bio_ctrl, disk_bytenr, +- page, iosize, +- cur - page_offset(page), +- 0, false); +- if (ret) { +- has_error = true; +- if (!saved_ret) +- saved_ret = ret; +- +- btrfs_page_set_error(fs_info, page, cur, iosize); +- if (PageWriteback(page)) +- btrfs_page_clear_writeback(fs_info, page, cur, +- iosize); +- } +- ++ submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, ++ cur - page_offset(page)); + cur += iosize; + nr++; + } ++ ++ btrfs_page_assert_not_dirty(fs_info, page); ++ *nr_ret = nr; ++ return 0; ++ ++out_error: + /* + * If we finish without problem, we should not only clear page dirty, + * but also empty subpage dirty bits + */ +- if (!has_error) +- btrfs_page_assert_not_dirty(fs_info, page); +- else +- ret = saved_ret; + *nr_ret = nr; + return ret; + } +@@ -1682,8 +1568,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + * Return 0 if everything goes well. + * Return <0 for error. + */ +-static int __extent_writepage(struct page *page, struct writeback_control *wbc, +- struct btrfs_bio_ctrl *bio_ctrl) ++static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) + { + struct folio *folio = page_folio(page); + struct inode *inode = page->mapping->host; +@@ -1696,7 +1581,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_SHIFT; + +- trace___extent_writepage(page, inode, wbc); ++ trace___extent_writepage(page, inode, bio_ctrl->wbc); + + WARN_ON(!PageLocked(page)); + +@@ -1721,15 +1606,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + } + + if (!bio_ctrl->extent_locked) { +- ret = writepage_delalloc(BTRFS_I(inode), page, wbc); ++ ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); + if (ret == 1) + return 0; + if (ret) + goto done; + } + +- ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size, +- &nr); ++ ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr); + if (ret == 1) + return 0; + +@@ -1773,6 +1657,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + if (PageError(page)) + end_extent_writepage(page, ret, page_start, page_end); + if (bio_ctrl->extent_locked) { ++ struct writeback_control *wbc = bio_ctrl->wbc; ++ + /* + * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), + * the page can either be locked by lock_page() or +@@ -1828,7 +1714,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb + + if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_unlock(eb); +- if (!bio_ctrl->sync_io) ++ if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL) + return 0; + if (!flush) { + submit_write_bio(bio_ctrl, 0); +@@ -2113,15 +1999,12 @@ static void prepare_eb_write(struct extent_buffer *eb) + * Unlike the work in write_one_eb(), we rely completely on extent locking. + * Page locking is only utilized at minimum to keep the VMM code happy. 
+ */ +-static int write_one_subpage_eb(struct extent_buffer *eb, +- struct writeback_control *wbc, +- struct btrfs_bio_ctrl *bio_ctrl) ++static void write_one_subpage_eb(struct extent_buffer *eb, ++ struct btrfs_bio_ctrl *bio_ctrl) + { + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page = eb->pages[0]; +- blk_opf_t write_flags = wbc_to_write_flags(wbc); + bool no_dirty_ebs = false; +- int ret; + + prepare_eb_write(eb); + +@@ -2137,36 +2020,22 @@ static int write_one_subpage_eb(struct extent_buffer *eb, + + bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; + +- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, +- bio_ctrl, eb->start, page, eb->len, +- eb->start - page_offset(page), 0, false); +- if (ret) { +- btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); +- set_btree_ioerr(page, eb); +- unlock_page(page); +- +- if (atomic_dec_and_test(&eb->io_pages)) +- end_extent_buffer_writeback(eb); +- return -EIO; +- } ++ submit_extent_page(bio_ctrl, eb->start, page, eb->len, ++ eb->start - page_offset(page)); + unlock_page(page); + /* + * Submission finished without problem, if no range of the page is + * dirty anymore, we have submitted a page. Update nr_written in wbc. + */ + if (no_dirty_ebs) +- wbc->nr_to_write--; +- return ret; ++ bio_ctrl->wbc->nr_to_write--; + } + +-static noinline_for_stack int write_one_eb(struct extent_buffer *eb, +- struct writeback_control *wbc, ++static noinline_for_stack void write_one_eb(struct extent_buffer *eb, + struct btrfs_bio_ctrl *bio_ctrl) + { + u64 disk_bytenr = eb->start; + int i, num_pages; +- blk_opf_t write_flags = wbc_to_write_flags(wbc); +- int ret = 0; + + prepare_eb_write(eb); + +@@ -2178,32 +2047,11 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, + + clear_page_dirty_for_io(p); + set_page_writeback(p); +- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, +- bio_ctrl, disk_bytenr, p, +- PAGE_SIZE, 0, 0, false); +- if (ret) { +- set_btree_ioerr(p, eb); +- if (PageWriteback(p)) +- end_page_writeback(p); +- if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) +- end_extent_buffer_writeback(eb); +- ret = -EIO; +- break; +- } ++ submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0); + disk_bytenr += PAGE_SIZE; +- wbc->nr_to_write--; ++ bio_ctrl->wbc->nr_to_write--; + unlock_page(p); + } +- +- if (unlikely(ret)) { +- for (; i < num_pages; i++) { +- struct page *p = eb->pages[i]; +- clear_page_dirty_for_io(p); +- unlock_page(p); +- } +- } +- +- return ret; + } + + /* +@@ -2220,9 +2068,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, + * Return >=0 for the number of submitted extent buffers. + * Return <0 for fatal error. + */ +-static int submit_eb_subpage(struct page *page, +- struct writeback_control *wbc, +- struct btrfs_bio_ctrl *bio_ctrl) ++static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) + { + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + int submitted = 0; +@@ -2284,10 +2130,8 @@ static int submit_eb_subpage(struct page *page, + free_extent_buffer(eb); + goto cleanup; + } +- ret = write_one_subpage_eb(eb, wbc, bio_ctrl); ++ write_one_subpage_eb(eb, bio_ctrl); + free_extent_buffer(eb); +- if (ret < 0) +- goto cleanup; + submitted++; + } + return submitted; +@@ -2318,8 +2162,7 @@ static int submit_eb_subpage(struct page *page, + * previous call. + * Return <0 for fatal error. 
+ */ +-static int submit_eb_page(struct page *page, struct writeback_control *wbc, +- struct btrfs_bio_ctrl *bio_ctrl, ++static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, + struct extent_buffer **eb_context) + { + struct address_space *mapping = page->mapping; +@@ -2331,7 +2174,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, + return 0; + + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) +- return submit_eb_subpage(page, wbc, bio_ctrl); ++ return submit_eb_subpage(page, bio_ctrl); + + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { +@@ -2364,7 +2207,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, + * If for_sync, this hole will be filled with + * trasnsaction commit. + */ +- if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) ++ if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL && ++ !bio_ctrl->wbc->for_sync) + ret = -EAGAIN; + else + ret = 0; +@@ -2389,10 +2233,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, + btrfs_schedule_zone_finish_bg(cache, eb); + btrfs_put_block_group(cache); + } +- ret = write_one_eb(eb, wbc, bio_ctrl); ++ write_one_eb(eb, bio_ctrl); + free_extent_buffer(eb); +- if (ret < 0) +- return ret; + return 1; + } + +@@ -2401,8 +2243,9 @@ int btree_write_cache_pages(struct address_space *mapping, + { + struct extent_buffer *eb_context = NULL; + struct btrfs_bio_ctrl bio_ctrl = { ++ .wbc = wbc, ++ .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), + .extent_locked = 0, +- .sync_io = (wbc->sync_mode == WB_SYNC_ALL), + }; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + int ret = 0; +@@ -2445,8 +2288,7 @@ int btree_write_cache_pages(struct address_space *mapping, + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + +- ret = submit_eb_page(&folio->page, wbc, &bio_ctrl, +- &eb_context); ++ ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context); + if (ret == 0) + continue; + if (ret < 0) { +@@ -2529,9 +2371,9 @@ int btree_write_cache_pages(struct address_space *mapping, + * existing IO to complete. 
+ */ + static int extent_write_cache_pages(struct address_space *mapping, +- struct writeback_control *wbc, + struct btrfs_bio_ctrl *bio_ctrl) + { ++ struct writeback_control *wbc = bio_ctrl->wbc; + struct inode *inode = mapping->host; + int ret = 0; + int done = 0; +@@ -2632,7 +2474,7 @@ static int extent_write_cache_pages(struct address_space *mapping, + continue; + } + +- ret = __extent_writepage(&folio->page, wbc, bio_ctrl); ++ ret = __extent_writepage(&folio->page, bio_ctrl); + if (ret < 0) { + done = 1; + break; +@@ -2688,18 +2530,19 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) + u64 cur = start; + unsigned long nr_pages; + const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; +- struct btrfs_bio_ctrl bio_ctrl = { +- .extent_locked = 1, +- .sync_io = 1, +- }; + struct writeback_control wbc_writepages = { + .sync_mode = WB_SYNC_ALL, + .range_start = start, + .range_end = end + 1, +- /* We're called from an async helper function */ +- .punt_to_cgroup = 1, + .no_cgroup_owner = 1, + }; ++ struct btrfs_bio_ctrl bio_ctrl = { ++ .wbc = &wbc_writepages, ++ /* We're called from an async helper function */ ++ .opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT | ++ wbc_to_write_flags(&wbc_writepages), ++ .extent_locked = 1, ++ }; + + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); + nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> +@@ -2719,7 +2562,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) + ASSERT(PageLocked(page)); + ASSERT(PageDirty(page)); + clear_page_dirty_for_io(page); +- ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl); ++ ret = __extent_writepage(page, &bio_ctrl); + ASSERT(ret <= 0); + if (ret < 0) { + found_error = true; +@@ -2743,8 +2586,9 @@ int extent_writepages(struct address_space *mapping, + struct inode *inode = mapping->host; + int ret = 0; + struct btrfs_bio_ctrl bio_ctrl = { ++ .wbc = wbc, ++ .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), + .extent_locked = 0, +- .sync_io = (wbc->sync_mode == WB_SYNC_ALL), + }; + + /* +@@ -2752,7 +2596,7 @@ int extent_writepages(struct address_space *mapping, + * protect the write pointer updates. 
+ */ + btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); +- ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl); ++ ret = extent_write_cache_pages(mapping, &bio_ctrl); + submit_write_bio(&bio_ctrl, ret); + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); + return ret; +@@ -2760,7 +2604,7 @@ int extent_writepages(struct address_space *mapping, + + void extent_readahead(struct readahead_control *rac) + { +- struct btrfs_bio_ctrl bio_ctrl = { 0 }; ++ struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; + struct page *pagepool[16]; + struct extent_map *em_cached = NULL; + u64 prev_em_start = (u64)-1; +@@ -4407,10 +4251,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, + struct page *page = eb->pages[0]; + struct extent_state *cached_state = NULL; + struct btrfs_bio_ctrl bio_ctrl = { ++ .opf = REQ_OP_READ, + .mirror_num = mirror_num, + .parent_check = check, + }; +- int ret = 0; ++ int ret; + + ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); + ASSERT(PagePrivate(page)); +@@ -4428,14 +4273,13 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, + return ret; + } + +- ret = 0; + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || + PageUptodate(page) || + btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); +- return ret; ++ return 0; + } + + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); +@@ -4447,28 +4291,19 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, + btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); + + btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); +- ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, +- eb->start, page, eb->len, +- eb->start - page_offset(page), 0, true); +- if (ret) { +- /* +- * In the endio function, if we hit something wrong we will +- * increase the io_pages, so here we need to decrease it for +- * error path. 
+- */ +- atomic_dec(&eb->io_pages); +- } ++ submit_extent_page(&bio_ctrl, eb->start, page, eb->len, ++ eb->start - page_offset(page)); + submit_one_bio(&bio_ctrl); +- if (ret || wait != WAIT_COMPLETE) { ++ if (wait != WAIT_COMPLETE) { + free_extent_state(cached_state); +- return ret; ++ return 0; + } + + wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, + EXTENT_LOCKED, &cached_state); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) +- ret = -EIO; +- return ret; ++ return -EIO; ++ return 0; + } + + int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, +@@ -4476,13 +4311,12 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + { + int i; + struct page *page; +- int err; +- int ret = 0; + int locked_pages = 0; + int all_uptodate = 1; + int num_pages; + unsigned long num_reads = 0; + struct btrfs_bio_ctrl bio_ctrl = { ++ .opf = REQ_OP_READ, + .mirror_num = mirror_num, + .parent_check = check, + }; +@@ -4550,27 +4384,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + page = eb->pages[i]; + + if (!PageUptodate(page)) { +- if (ret) { +- atomic_dec(&eb->io_pages); +- unlock_page(page); +- continue; +- } +- + ClearPageError(page); +- err = submit_extent_page(REQ_OP_READ, NULL, +- &bio_ctrl, page_offset(page), page, +- PAGE_SIZE, 0, 0, false); +- if (err) { +- /* +- * We failed to submit the bio so it's the +- * caller's responsibility to perform cleanup +- * i.e unlock page/set error bit. +- */ +- ret = err; +- SetPageError(page); +- unlock_page(page); +- atomic_dec(&eb->io_pages); +- } ++ submit_extent_page(&bio_ctrl, page_offset(page), page, ++ PAGE_SIZE, 0); + } else { + unlock_page(page); + } +@@ -4578,17 +4394,17 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + + submit_one_bio(&bio_ctrl); + +- if (ret || wait != WAIT_COMPLETE) +- return ret; ++ if (wait != WAIT_COMPLETE) ++ return 0; + + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + wait_on_page_locked(page); + if (!PageUptodate(page)) +- ret = -EIO; ++ return -EIO; + } + +- return ret; ++ return 0; + + unlock_exit: + while (locked_pages > 0) { +@@ -4596,7 +4412,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + page = eb->pages[locked_pages]; + unlock_page(page); + } +- return ret; ++ return 0; + } + + static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, +diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c +index 41c77a100853..018c711a0bc8 100644 +--- a/fs/btrfs/file-item.c ++++ b/fs/btrfs/file-item.c +@@ -335,48 +335,6 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, + return ret; + } + +-/* +- * Locate the file_offset of @cur_disk_bytenr of a @bio. +- * +- * Bio of btrfs represents read range of +- * [bi_sector << 9, bi_sector << 9 + bi_size). +- * Knowing this, we can iterate through each bvec to locate the page belong to +- * @cur_disk_bytenr and get the file offset. +- * +- * @inode is used to determine if the bvec page really belongs to @inode. 
+- * +- * Return 0 if we can't find the file offset +- * Return >0 if we find the file offset and restore it to @file_offset_ret +- */ +-static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, +- u64 disk_bytenr, u64 *file_offset_ret) +-{ +- struct bvec_iter iter; +- struct bio_vec bvec; +- u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT; +- int ret = 0; +- +- bio_for_each_segment(bvec, bio, iter) { +- struct page *page = bvec.bv_page; +- +- if (cur > disk_bytenr) +- break; +- if (cur + bvec.bv_len <= disk_bytenr) { +- cur += bvec.bv_len; +- continue; +- } +- ASSERT(in_range(disk_bytenr, cur, bvec.bv_len)); +- if (page->mapping && page->mapping->host && +- page->mapping->host == inode) { +- ret = 1; +- *file_offset_ret = page_offset(page) + bvec.bv_offset + +- disk_bytenr - cur; +- break; +- } +- } +- return ret; +-} +- + /* + * Lookup the checksum for the read bio in csum tree. + * +@@ -386,17 +344,15 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) + { + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct extent_io_tree *io_tree = &inode->io_tree; + struct bio *bio = &bbio->bio; + struct btrfs_path *path; + const u32 sectorsize = fs_info->sectorsize; + const u32 csum_size = fs_info->csum_size; + u32 orig_len = bio->bi_iter.bi_size; + u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; +- u64 cur_disk_bytenr; + const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; +- int count = 0; + blk_status_t ret = BLK_STS_OK; ++ u32 bio_offset = 0; + + if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) +@@ -447,28 +403,14 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) + path->skip_locking = 1; + } + +- for (cur_disk_bytenr = orig_disk_bytenr; +- cur_disk_bytenr < orig_disk_bytenr + orig_len; +- cur_disk_bytenr += (count * sectorsize)) { +- u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr; +- unsigned int sector_offset; +- u8 *csum_dst; +- +- /* +- * Although both cur_disk_bytenr and orig_disk_bytenr is u64, +- * we're calculating the offset to the bio start. +- * +- * Bio size is limited to UINT_MAX, thus unsigned int is large +- * enough to contain the raw result, not to mention the right +- * shifted result. 
+- */ +- ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); +- sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> +- fs_info->sectorsize_bits; +- csum_dst = bbio->csum + sector_offset * csum_size; ++ while (bio_offset < orig_len) { ++ int count; ++ u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset; ++ u8 *csum_dst = bbio->csum + ++ (bio_offset >> fs_info->sectorsize_bits) * csum_size; + + count = search_csum_tree(fs_info, path, cur_disk_bytenr, +- search_len, csum_dst); ++ orig_len - bio_offset, csum_dst); + if (count < 0) { + ret = errno_to_blk_status(count); + if (bbio->csum != bbio->csum_inline) +@@ -493,14 +435,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) + + if (inode->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { +- u64 file_offset; +- int ret; +- +- ret = search_file_offset_in_bio(bio, +- &inode->vfs_inode, +- cur_disk_bytenr, &file_offset); +- if (ret) +- set_extent_bits(io_tree, file_offset, ++ u64 file_offset = bbio->file_offset + bio_offset; ++ ++ set_extent_bits(&inode->io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM); + } else { +@@ -509,6 +446,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) + cur_disk_bytenr, cur_disk_bytenr + sectorsize); + } + } ++ bio_offset += count * sectorsize; + } + + btrfs_free_path(path); +@@ -659,7 +597,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + * in is large enough to contain all csums. + */ + int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, +- u8 *csum_buf, unsigned long *csum_bitmap) ++ u8 *csum_buf, unsigned long *csum_bitmap, ++ bool search_commit) + { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; +@@ -676,6 +615,12 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + if (!path) + return -ENOMEM; + ++ if (search_commit) { ++ path->skip_locking = 1; ++ path->reada = READA_FORWARD; ++ path->search_commit_root = 1; ++ } ++ + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; +diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h +index cd7f2ae515c0..6be8725cd574 100644 +--- a/fs/btrfs/file-item.h ++++ b/fs/btrfs/file-item.h +@@ -57,7 +57,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); + int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, +- u8 *csum_buf, unsigned long *csum_bitmap); ++ u8 *csum_buf, unsigned long *csum_bitmap, ++ bool search_commit); + void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, +diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h +index 24cd49229408..0d98fc5f6f44 100644 +--- a/fs/btrfs/fs.h ++++ b/fs/btrfs/fs.h +@@ -24,6 +24,18 @@ + #define BTRFS_SUPER_INFO_SIZE 4096 + static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); + ++/* ++ * Number of metadata items necessary for an unlink operation: ++ * ++ * 1 for the possible orphan item ++ * 1 for the dir item ++ * 1 for the dir index ++ * 1 for the inode ref ++ * 1 for the inode ++ * 1 for the parent inode ++ */ ++#define BTRFS_UNLINK_METADATA_UNITS 6 ++ + /* + * The reserved space at the beginning of each device. 
It covers the primary + * super block and leaves space for potential use by other tools like +@@ -193,11 +205,7 @@ enum { + #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL + #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + +-#ifdef CONFIG_BTRFS_DEBUG +-/* +- * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG +- */ +-#define BTRFS_FEATURE_INCOMPAT_SUPP \ ++#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ +@@ -210,23 +218,22 @@ enum { + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ +- BTRFS_FEATURE_INCOMPAT_ZONED | \ ++ BTRFS_FEATURE_INCOMPAT_ZONED) ++ ++#ifdef CONFIG_BTRFS_DEBUG ++ /* ++ * Features under developmen like Extent tree v2 support is enabled ++ * only under CONFIG_BTRFS_DEBUG. ++ */ ++#define BTRFS_FEATURE_INCOMPAT_SUPP \ ++ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) ++ + #else +-#define BTRFS_FEATURE_INCOMPAT_SUPP \ +- (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ +- BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ +- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ +- BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ +- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ +- BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ +- BTRFS_FEATURE_INCOMPAT_RAID56 | \ +- BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ +- BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ +- BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ +- BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ +- BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ +- BTRFS_FEATURE_INCOMPAT_ZONED) ++ ++#define BTRFS_FEATURE_INCOMPAT_SUPP \ ++ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE) ++ + #endif + + #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ +@@ -412,7 +419,6 @@ struct btrfs_fs_info { + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; +- u64 avg_delayed_ref_runtime; + + /* + * This is updated to the current trans every time a full commit is +@@ -638,7 +644,6 @@ struct btrfs_fs_info { + refcount_t scrub_workers_refcnt; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; +- struct workqueue_struct *scrub_parity_workers; + struct btrfs_subpage_info *subpage_info; + + struct btrfs_discard_ctl discard_ctl; +@@ -828,7 +833,7 @@ static inline u64 btrfs_csum_bytes_to_leaves( + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, ++static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info, + unsigned num_items) + { + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; +@@ -838,7 +843,7 @@ static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. 
+ */ +-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, ++static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, + unsigned num_items) + { + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; +diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c +index b65c45b5d681..4c322b720a80 100644 +--- a/fs/btrfs/inode-item.c ++++ b/fs/btrfs/inode-item.c +@@ -527,7 +527,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + + while (1) { + u64 clear_start = 0, clear_len = 0, extent_start = 0; +- bool should_throttle = false; ++ bool refill_delayed_refs_rsv = false; + + fi = NULL; + leaf = path->nodes[0]; +@@ -660,8 +660,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + /* No pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; +- } else if (pending_del_nr && +- path->slots[0] + 1 == pending_del_slot) { ++ } else if (path->slots[0] + 1 == pending_del_slot) { + /* Hop on the pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; +@@ -686,10 +685,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + btrfs_abort_transaction(trans, ret); + break; + } +- if (be_nice) { +- if (btrfs_should_throttle_delayed_refs(trans)) +- should_throttle = true; +- } ++ if (be_nice && btrfs_check_space_for_delayed_refs(fs_info)) ++ refill_delayed_refs_rsv = true; + } + + if (found_type == BTRFS_INODE_ITEM_KEY) +@@ -697,7 +694,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot || +- should_throttle) { ++ refill_delayed_refs_rsv) { + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, +@@ -720,7 +717,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + * actually allocate, so just bail if we're short and + * let the normal reservation dance happen higher up. 
+ */ +- if (should_throttle) { ++ if (refill_delayed_refs_rsv) { + ret = btrfs_delayed_refs_rsv_refill(fs_info, + BTRFS_RESERVE_NO_FLUSH); + if (ret) { +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 957e4d76a7b6..57d070025c7a 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -79,6 +79,7 @@ struct btrfs_iget_args { + struct btrfs_dio_data { + ssize_t submitted; + struct extent_changeset *data_reserved; ++ struct btrfs_ordered_extent *ordered; + bool data_space_reserved; + bool nocow_done; + }; +@@ -669,8 +670,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) + again: + will_compress = 0; + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; +- nr_pages = min_t(unsigned long, nr_pages, +- BTRFS_MAX_COMPRESSED / PAGE_SIZE); ++ nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); + + /* + * we don't want to send crud past the end of i_size through +@@ -945,10 +945,9 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, + ret = cow_file_range(inode, locked_page, start, end, &page_started, + &nr_written, 0, NULL); + /* Inline extent inserted, page gets unlocked and everything is done */ +- if (page_started) { +- ret = 0; +- goto out; +- } ++ if (page_started) ++ return 0; ++ + if (ret < 0) { + btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); + if (locked_page) { +@@ -962,14 +961,11 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, + end_extent_writepage(locked_page, ret, page_start, page_end); + unlock_page(locked_page); + } +- goto out; ++ return ret; + } + +- ret = extent_write_locked_range(&inode->vfs_inode, start, end); + /* All pages will be unlocked, including @locked_page */ +-out: +- kfree(async_extent); +- return ret; ++ return extent_write_locked_range(&inode->vfs_inode, start, end); + } + + static int submit_one_async_extent(struct btrfs_inode *inode, +@@ -987,6 +983,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode, + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; + ++ if (async_chunk->blkcg_css) ++ kthread_associate_blkcg(async_chunk->blkcg_css); ++ + /* + * If async_chunk->locked_page is in the async_extent range, we need to + * handle it. 
+@@ -1001,8 +1000,10 @@ static int submit_one_async_extent(struct btrfs_inode *inode, + lock_extent(io_tree, start, end, NULL); + + /* We have fall back to uncompressed write */ +- if (!async_extent->pages) +- return submit_uncompressed_range(inode, async_extent, locked_page); ++ if (!async_extent->pages) { ++ ret = submit_uncompressed_range(inode, async_extent, locked_page); ++ goto done; ++ } + + ret = btrfs_reserve_extent(root, async_extent->ram_size, + async_extent->compressed_size, +@@ -1054,24 +1055,18 @@ static int submit_one_async_extent(struct btrfs_inode *inode, + extent_clear_unlock_delalloc(inode, start, end, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK); +- if (btrfs_submit_compressed_write(inode, start, /* file_offset */ ++ ++ btrfs_submit_compressed_write(inode, start, /* file_offset */ + async_extent->ram_size, /* num_bytes */ + ins.objectid, /* disk_bytenr */ + ins.offset, /* compressed_len */ + async_extent->pages, /* compressed_pages */ + async_extent->nr_pages, +- async_chunk->write_flags, +- async_chunk->blkcg_css, true)) { +- const u64 start = async_extent->start; +- const u64 end = start + async_extent->ram_size - 1; +- +- btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0); +- +- extent_clear_unlock_delalloc(inode, start, end, NULL, 0, +- PAGE_END_WRITEBACK | PAGE_SET_ERROR); +- free_async_extent_pages(async_extent); +- } ++ async_chunk->write_flags, true); + *alloc_hint = ins.objectid + ins.offset; ++done: ++ if (async_chunk->blkcg_css) ++ kthread_associate_blkcg(NULL); + kfree(async_extent); + return ret; + +@@ -1086,8 +1081,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK | PAGE_SET_ERROR); + free_async_extent_pages(async_extent); +- kfree(async_extent); +- return ret; ++ goto done; + } + + /* +@@ -1622,6 +1616,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, + if (blkcg_css != blkcg_root_css) { + css_get(blkcg_css); + async_chunk[i].blkcg_css = blkcg_css; ++ async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT; + } else { + async_chunk[i].blkcg_css = NULL; + } +@@ -2521,37 +2516,31 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, + } + + /* +- * Split an extent_map at [start, start + len] ++ * Split off the first pre bytes from the extent_map at [start, start + len] + * + * This function is intended to be used only for extract_ordered_extent(). 
+ */ +-static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, +- u64 pre, u64 post) ++static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre) + { + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + struct extent_map *split_pre = NULL; + struct extent_map *split_mid = NULL; +- struct extent_map *split_post = NULL; + int ret = 0; + unsigned long flags; + +- /* Sanity check */ +- if (pre == 0 && post == 0) +- return 0; ++ ASSERT(pre != 0); ++ ASSERT(pre < len); + + split_pre = alloc_extent_map(); +- if (pre) +- split_mid = alloc_extent_map(); +- if (post) +- split_post = alloc_extent_map(); +- if (!split_pre || (pre && !split_mid) || (post && !split_post)) { ++ if (!split_pre) ++ return -ENOMEM; ++ split_mid = alloc_extent_map(); ++ if (!split_mid) { + ret = -ENOMEM; +- goto out; ++ goto out_free_pre; + } + +- ASSERT(pre + post < len); +- + lock_extent(&inode->io_tree, start, start + len - 1, NULL); + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); +@@ -2572,7 +2561,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, + + /* First, replace the em with a new extent_map starting from * em->start */ + split_pre->start = em->start; +- split_pre->len = (pre ? pre : em->len - post); ++ split_pre->len = pre; + split_pre->orig_start = split_pre->start; + split_pre->block_start = em->block_start; + split_pre->block_len = split_pre->len; +@@ -2586,38 +2575,21 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, + + /* + * Now we only have an extent_map at: +- * [em->start, em->start + pre] if pre != 0 +- * [em->start, em->start + em->len - post] if pre == 0 +- */ +- +- if (pre) { +- /* Insert the middle extent_map */ +- split_mid->start = em->start + pre; +- split_mid->len = em->len - pre - post; +- split_mid->orig_start = split_mid->start; +- split_mid->block_start = em->block_start + pre; +- split_mid->block_len = split_mid->len; +- split_mid->orig_block_len = split_mid->block_len; +- split_mid->ram_bytes = split_mid->len; +- split_mid->flags = flags; +- split_mid->compress_type = em->compress_type; +- split_mid->generation = em->generation; +- add_extent_mapping(em_tree, split_mid, 1); +- } +- +- if (post) { +- split_post->start = em->start + em->len - post; +- split_post->len = post; +- split_post->orig_start = split_post->start; +- split_post->block_start = em->block_start + em->len - post; +- split_post->block_len = split_post->len; +- split_post->orig_block_len = split_post->block_len; +- split_post->ram_bytes = split_post->len; +- split_post->flags = flags; +- split_post->compress_type = em->compress_type; +- split_post->generation = em->generation; +- add_extent_mapping(em_tree, split_post, 1); +- } ++ * [em->start, em->start + pre] ++ */ ++ ++ /* Insert the middle extent_map. 
*/ ++ split_mid->start = em->start + pre; ++ split_mid->len = em->len - pre; ++ split_mid->orig_start = split_mid->start; ++ split_mid->block_start = em->block_start + pre; ++ split_mid->block_len = split_mid->len; ++ split_mid->orig_block_len = split_mid->block_len; ++ split_mid->ram_bytes = split_mid->len; ++ split_mid->flags = flags; ++ split_mid->compress_type = em->compress_type; ++ split_mid->generation = em->generation; ++ add_extent_mapping(em_tree, split_mid, 1); + + /* Once for us */ + free_extent_map(em); +@@ -2627,72 +2599,41 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, + out_unlock: + write_unlock(&em_tree->lock); + unlock_extent(&inode->io_tree, start, start + len - 1, NULL); +-out: +- free_extent_map(split_pre); + free_extent_map(split_mid); +- free_extent_map(split_post); +- ++out_free_pre: ++ free_extent_map(split_pre); + return ret; + } + +-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) ++int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, ++ struct btrfs_ordered_extent *ordered) + { + u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bbio->bio.bi_iter.bi_size; + struct btrfs_inode *inode = bbio->inode; +- struct btrfs_ordered_extent *ordered; +- u64 file_len; +- u64 end = start + len; +- u64 ordered_end; +- u64 pre, post; ++ u64 ordered_len = ordered->num_bytes; + int ret = 0; + +- ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); +- if (WARN_ON_ONCE(!ordered)) +- return BLK_STS_IOERR; ++ /* Must always be called for the beginning of an ordered extent. */ ++ if (WARN_ON_ONCE(start != ordered->disk_bytenr)) ++ return -EINVAL; + +- /* No need to split */ ++ /* No need to split if the ordered extent covers the entire bio. */ + if (ordered->disk_num_bytes == len) +- goto out; +- +- /* We cannot split once end_bio'd ordered extent */ +- if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) { +- ret = -EINVAL; +- goto out; +- } +- +- /* We cannot split a compressed ordered extent */ +- if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) { +- ret = -EINVAL; +- goto out; +- } +- +- ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes; +- /* bio must be in one ordered extent */ +- if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) { +- ret = -EINVAL; +- goto out; +- } +- +- /* Checksum list should be empty */ +- if (WARN_ON_ONCE(!list_empty(&ordered->list))) { +- ret = -EINVAL; +- goto out; +- } +- +- file_len = ordered->num_bytes; +- pre = start - ordered->disk_bytenr; +- post = ordered_end - end; ++ return 0; + +- ret = btrfs_split_ordered_extent(ordered, pre, post); ++ ret = btrfs_split_ordered_extent(ordered, len); + if (ret) +- goto out; +- ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); ++ return ret; + +-out: +- btrfs_put_ordered_extent(ordered); ++ /* ++ * Don't split the extent_map for NOCOW extents, as we're writing into ++ * a pre-existing one. 
++ */ ++ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) ++ return 0; + +- return errno_to_blk_status(ret); ++ return split_extent_map(inode, bbio->file_offset, ordered_len, len); + } + + /* +@@ -3367,13 +3308,6 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + return 0; + } + +-static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset) +-{ +- u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; +- +- return csums + offset_in_sectors * fs_info->csum_size; +-} +- + /* + * Verify the checksum of a single data sector. + * +@@ -3411,7 +3345,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + return true; + } + +- csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); ++ csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * ++ fs_info->csum_size; + if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, + csum_expected)) + goto zeroit; +@@ -3691,6 +3626,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); ++ iput(inode); + goto out; + } + btrfs_debug(fs_info, "auto deleting %Lu", +@@ -3698,8 +3634,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) + ret = btrfs_del_orphan_item(trans, root, + found_key.objectid); + btrfs_end_transaction(trans); +- if (ret) ++ if (ret) { ++ iput(inode); + goto out; ++ } + continue; + } + +@@ -4261,15 +4199,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) + { + struct btrfs_root *root = dir->root; + +- /* +- * 1 for the possible orphan item +- * 1 for the dir item +- * 1 for the dir index +- * 1 for the inode ref +- * 1 for the inode +- * 1 for the parent inode +- */ +- return btrfs_start_transaction_fallback_global_rsv(root, 6); ++ return btrfs_start_transaction_fallback_global_rsv(root, ++ BTRFS_UNLINK_METADATA_UNITS); + } + + static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +@@ -5243,7 +5174,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, + { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans; +- u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); ++ u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1); + int ret; + + /* +@@ -5281,7 +5212,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, + trans->block_rsv = &fs_info->trans_block_rsv; + trans->bytes_reserved = delayed_refs_extra; + btrfs_block_rsv_migrate(rsv, trans->block_rsv, +- delayed_refs_extra, 1); ++ delayed_refs_extra, true); + } + return trans; + } +@@ -5291,7 +5222,7 @@ void btrfs_evict_inode(struct inode *inode) + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; +- struct btrfs_block_rsv *rsv; ++ struct btrfs_block_rsv *rsv = NULL; + int ret; + + trace_btrfs_inode_evict(inode); +@@ -5308,18 +5239,18 @@ void btrfs_evict_inode(struct inode *inode) + ((btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_is_free_space_inode(BTRFS_I(inode)))) +- goto no_delete; ++ goto out; + + if (is_bad_inode(inode)) +- goto no_delete; ++ goto out; + + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) +- goto no_delete; ++ goto out; + + if (inode->i_nlink > 0) { + BUG_ON(btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != 
BTRFS_ROOT_TREE_OBJECTID); +- goto no_delete; ++ goto out; + } + + /* +@@ -5328,7 +5259,7 @@ void btrfs_evict_inode(struct inode *inode) + */ + ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); + if (ret) +- goto no_delete; ++ goto out; + + /* + * This drops any pending insert or delete operations we have for this +@@ -5340,7 +5271,7 @@ void btrfs_evict_inode(struct inode *inode) + + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) +- goto no_delete; ++ goto out; + rsv->size = btrfs_calc_metadata_size(fs_info, 1); + rsv->failfast = true; + +@@ -5356,16 +5287,21 @@ void btrfs_evict_inode(struct inode *inode) + + trans = evict_refill_and_join(root, rsv); + if (IS_ERR(trans)) +- goto free_rsv; ++ goto out; + + trans->block_rsv = rsv; + + ret = btrfs_truncate_inode_items(trans, root, &control); + trans->block_rsv = &fs_info->trans_block_rsv; + btrfs_end_transaction(trans); +- btrfs_btree_balance_dirty(fs_info); ++ /* ++ * We have not added new delayed items for our inode after we ++ * have flushed its delayed items, so no need to throttle on ++ * delayed items. However we have modified extent buffers. ++ */ ++ btrfs_btree_balance_dirty_nodelay(fs_info); + if (ret && ret != -ENOSPC && ret != -EAGAIN) +- goto free_rsv; ++ goto out; + else if (!ret) + break; + } +@@ -5387,9 +5323,8 @@ void btrfs_evict_inode(struct inode *inode) + btrfs_end_transaction(trans); + } + +-free_rsv: ++out: + btrfs_free_block_rsv(fs_info, rsv); +-no_delete: + /* + * If we didn't successfully delete, the orphan item will still be in + * the tree and we'll retry on the next mount. Again, we might also want +@@ -6981,6 +6916,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + } + + static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, ++ struct btrfs_dio_data *dio_data, + const u64 start, + const u64 len, + const u64 orig_start, +@@ -6991,7 +6927,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + const int type) + { + struct extent_map *em = NULL; +- int ret; ++ struct btrfs_ordered_extent *ordered; + + if (type != BTRFS_ORDERED_NOCOW) { + em = create_io_em(inode, start, len, orig_start, block_start, +@@ -7001,18 +6937,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + if (IS_ERR(em)) + goto out; + } +- ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, +- block_len, 0, +- (1 << type) | +- (1 << BTRFS_ORDERED_DIRECT), +- BTRFS_COMPRESS_NONE); +- if (ret) { ++ ordered = btrfs_alloc_ordered_extent(inode, start, len, len, ++ block_start, block_len, 0, ++ (1 << type) | ++ (1 << BTRFS_ORDERED_DIRECT), ++ BTRFS_COMPRESS_NONE); ++ if (IS_ERR(ordered)) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_map_range(inode, start, + start + len - 1, false); + } +- em = ERR_PTR(ret); ++ em = ERR_CAST(ordered); ++ } else { ++ ASSERT(!dio_data->ordered); ++ dio_data->ordered = ordered; + } + out: + +@@ -7020,6 +6959,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + } + + static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, ++ struct btrfs_dio_data *dio_data, + u64 start, u64 len) + { + struct btrfs_root *root = inode->root; +@@ -7035,7 +6975,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, + if (ret) + return ERR_PTR(ret); + +- em = btrfs_create_dio_extent(inode, start, ins.offset, start, ++ em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, + ins.objectid, ins.offset, 
ins.offset, + ins.offset, BTRFS_ORDERED_REGULAR); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); +@@ -7380,7 +7320,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + } + space_reserved = true; + +- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, ++ em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); +@@ -7422,7 +7362,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + goto out; + space_reserved = true; + +- em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); ++ em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; +@@ -7728,6 +7668,10 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + pos + length - 1, NULL); + ret = -ENOTBLK; + } ++ if (write) { ++ btrfs_put_ordered_extent(dio_data->ordered); ++ dio_data->ordered = NULL; ++ } + + if (write) + extent_changeset_free(dio_data->data_reserved); +@@ -7767,14 +7711,34 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_dio_data *dio_data = iter->private; + +- btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); ++ btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, ++ btrfs_dio_end_io, bio->bi_private); ++ bbio->inode = BTRFS_I(iter->inode); + bbio->file_offset = file_offset; + + dip->file_offset = file_offset; + dip->bytes = bio->bi_iter.bi_size; + + dio_data->submitted += bio->bi_iter.bi_size; +- btrfs_submit_bio(bio, 0); ++ ++ /* ++ * Check if we are doing a partial write. If we are, we need to split ++ * the ordered extent to match the submitted bio. Hang on to the ++ * remaining unfinishable ordered_extent in dio_data so that it can be ++ * cancelled in iomap_end to avoid a deadlock wherein faulting the ++ * remaining pages is blocked on the outstanding ordered extent. 
++ */ ++ if (iter->flags & IOMAP_WRITE) { ++ int ret; ++ ++ ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); ++ if (ret) { ++ btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); ++ return; ++ } ++ } ++ ++ btrfs_submit_bio(bbio, 0); + } + + static const struct iomap_ops btrfs_dio_iomap_ops = { +@@ -7789,7 +7753,7 @@ static const struct iomap_dio_ops btrfs_dio_ops = { + + ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) + { +- struct btrfs_dio_data data; ++ struct btrfs_dio_data data = { 0 }; + + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +@@ -7798,7 +7762,7 @@ ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_be + struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before) + { +- struct btrfs_dio_data data; ++ struct btrfs_dio_data data = { 0 }; + + return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +@@ -9908,8 +9872,6 @@ static ssize_t btrfs_encoded_read_inline( + } + + struct btrfs_encoded_read_private { +- struct btrfs_inode *inode; +- u64 file_offset; + wait_queue_head_t wait; + atomic_t pending; + blk_status_t status; +@@ -9939,45 +9901,41 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, struct page **pages) + { ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_encoded_read_private priv = { +- .inode = inode, +- .file_offset = file_offset, + .pending = ATOMIC_INIT(1), + }; + unsigned long i = 0; +- u64 cur = 0; ++ struct btrfs_bio *bbio; + + init_waitqueue_head(&priv.wait); +- /* Submit bios for the extent, splitting due to bio limits as necessary. 
*/ +- while (cur < disk_io_size) { +- struct bio *bio = NULL; +- u64 remaining = disk_io_size - cur; +- +- while (bio || remaining) { +- size_t bytes = min_t(u64, remaining, PAGE_SIZE); +- +- if (!bio) { +- bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, +- inode, +- btrfs_encoded_read_endio, +- &priv); +- bio->bi_iter.bi_sector = +- (disk_bytenr + cur) >> SECTOR_SHIFT; +- } + +- if (!bytes || +- bio_add_page(bio, pages[i], bytes, 0) < bytes) { +- atomic_inc(&priv.pending); +- btrfs_submit_bio(bio, 0); +- bio = NULL; +- continue; +- } ++ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, ++ btrfs_encoded_read_endio, &priv); ++ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; ++ bbio->inode = inode; + +- i++; +- cur += bytes; +- remaining -= bytes; ++ do { ++ size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); ++ ++ if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { ++ atomic_inc(&priv.pending); ++ btrfs_submit_bio(bbio, 0); ++ ++ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, ++ btrfs_encoded_read_endio, &priv); ++ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; ++ bbio->inode = inode; ++ continue; + } +- } ++ ++ i++; ++ disk_bytenr += bytes; ++ disk_io_size -= bytes; ++ } while (disk_io_size); ++ ++ atomic_inc(&priv.pending); ++ btrfs_submit_bio(bbio, 0); + + if (atomic_dec_return(&priv.pending)) + io_wait_event(priv.wait, !atomic_read(&priv.pending)); +@@ -10398,13 +10356,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + + btrfs_delalloc_release_extents(inode, num_bytes); + +- if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, +- ins.offset, pages, nr_pages, 0, NULL, +- false)) { +- btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0); +- ret = -EIO; +- goto out_pages; +- } ++ btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, ++ ins.offset, pages, nr_pages, 0, false); + ret = orig_count; + goto out; + +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index ba769a1eb87a..25833b4eeaf5 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -3161,6 +3161,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) + if (IS_ERR(sa)) + return PTR_ERR(sa); + ++ if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) { ++ ret = -EOPNOTSUPP; ++ goto out; ++ } ++ + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { + ret = mnt_want_write_file(file); + if (ret) +diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c +index 870528d87526..3a496b0d3d2b 100644 +--- a/fs/btrfs/locking.c ++++ b/fs/btrfs/locking.c +@@ -325,24 +325,12 @@ struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root) + * acquire the lock. 
+ */ + +-int btrfs_drew_lock_init(struct btrfs_drew_lock *lock) ++void btrfs_drew_lock_init(struct btrfs_drew_lock *lock) + { +- int ret; +- +- ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL); +- if (ret) +- return ret; +- + atomic_set(&lock->readers, 0); ++ atomic_set(&lock->writers, 0); + init_waitqueue_head(&lock->pending_readers); + init_waitqueue_head(&lock->pending_writers); +- +- return 0; +-} +- +-void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock) +-{ +- percpu_counter_destroy(&lock->writers); + } + + /* Return true if acquisition is successful, false otherwise */ +@@ -351,10 +339,10 @@ bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock) + if (atomic_read(&lock->readers)) + return false; + +- percpu_counter_inc(&lock->writers); ++ atomic_inc(&lock->writers); + + /* Ensure writers count is updated before we check for pending readers */ +- smp_mb(); ++ smp_mb__after_atomic(); + if (atomic_read(&lock->readers)) { + btrfs_drew_write_unlock(lock); + return false; +@@ -374,7 +362,7 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) + + void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) + { +- percpu_counter_dec(&lock->writers); ++ atomic_dec(&lock->writers); + cond_wake_up(&lock->pending_readers); + } + +@@ -390,8 +378,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) + */ + smp_mb__after_atomic(); + +- wait_event(lock->pending_readers, +- percpu_counter_sum(&lock->writers) == 0); ++ wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0); + } + + void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock) +diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h +index 11c2269b4b6f..edb9b4a0dba1 100644 +--- a/fs/btrfs/locking.h ++++ b/fs/btrfs/locking.h +@@ -195,13 +195,12 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) + + struct btrfs_drew_lock { + atomic_t readers; +- struct percpu_counter writers; ++ atomic_t writers; + wait_queue_head_t pending_writers; + wait_queue_head_t pending_readers; + }; + +-int btrfs_drew_lock_init(struct btrfs_drew_lock *lock); +-void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock); ++void btrfs_drew_lock_init(struct btrfs_drew_lock *lock); + void btrfs_drew_write_lock(struct btrfs_drew_lock *lock); + bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock); + void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); +diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h +index de3e18bce24a..00328c856be6 100644 +--- a/fs/btrfs/lru_cache.h ++++ b/fs/btrfs/lru_cache.h +@@ -55,11 +55,6 @@ static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *ca + return cache->size; + } + +-static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) +-{ +- return cache->size >= cache->max_size; +-} +- + static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( + struct btrfs_lru_cache *cache) + { +diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c +index 71f6d8302d50..3a095b9c6373 100644 +--- a/fs/btrfs/lzo.c ++++ b/fs/btrfs/lzo.c +@@ -17,6 +17,7 @@ + #include "compression.h" + #include "ctree.h" + #include "super.h" ++#include "btrfs_inode.h" + + #define LZO_LEN 4 + +@@ -329,7 +330,7 @@ static void copy_compressed_segment(struct compressed_bio *cb, + int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + { + struct workspace *workspace = list_entry(ws, struct workspace, list); +- const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); ++ const struct btrfs_fs_info *fs_info = 
cb->bbio.inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + char *kaddr; + int ret; +@@ -388,8 +389,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + */ + btrfs_err(fs_info, "unexpectedly large lzo segment len %u", + seg_len); +- ret = -EIO; +- goto out; ++ return -EIO; + } + + /* Copy the compressed segment payload into workspace */ +@@ -400,8 +400,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + workspace->buf, &out_len); + if (ret != LZO_E_OK) { + btrfs_err(fs_info, "failed to decompress"); +- ret = -EIO; +- goto out; ++ return -EIO; + } + + /* Copy the data into inode pages */ +@@ -410,7 +409,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + + /* All data read, exit */ + if (ret == 0) +- goto out; ++ return 0; + ret = 0; + + /* Check if the sector has enough space for a segment header */ +@@ -421,10 +420,8 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + /* Skip the padding zeros */ + cur_in += sector_bytes_left; + } +-out: +- if (!ret) +- zero_fill_bio(cb->orig_bio); +- return ret; ++ ++ return 0; + } + + int lzo_decompress(struct list_head *ws, const u8 *data_in, +diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c +index fde5aaa6e7c9..310a05cf95ef 100644 +--- a/fs/btrfs/messages.c ++++ b/fs/btrfs/messages.c +@@ -253,7 +253,7 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, + #endif + + #ifdef CONFIG_BTRFS_ASSERT +-void __cold btrfs_assertfail(const char *expr, const char *file, int line) ++void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line) + { + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); +diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h +index 8c516ee58ff9..ac2d1982ba3d 100644 +--- a/fs/btrfs/messages.h ++++ b/fs/btrfs/messages.h +@@ -160,7 +160,7 @@ do { \ + } while (0) + + #ifdef CONFIG_BTRFS_ASSERT +-void __cold btrfs_assertfail(const char *expr, const char *file, int line); ++void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line); + + #define ASSERT(expr) \ + (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c +index 6c24b69e2d0a..a9778a91511e 100644 +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + * @compress_type: Compression algorithm used for data. + * + * Most of these parameters correspond to &struct btrfs_file_extent_item. The +- * tree is given a single reference on the ordered extent that was inserted. ++ * tree is given a single reference on the ordered extent that was inserted, and ++ * the returned pointer is given a second reference. + * +- * Return: 0 or -ENOMEM. ++ * Return: the new ordered extent or error pointer. 
+ */ +-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, +- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, +- u64 disk_num_bytes, u64 offset, unsigned flags, +- int compress_type) ++struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( ++ struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type) + { + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; +@@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + /* For nocow write, we can release the qgroup rsv right now */ + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); + if (ret < 0) +- return ret; ++ return ERR_PTR(ret); + ret = 0; + } else { + /* +@@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + */ + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + if (ret < 0) +- return ret; ++ return ERR_PTR(ret); + } + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); + if (!entry) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + entry->file_offset = file_offset; + entry->num_bytes = num_bytes; +@@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); + ++ /* One ref for the returned entry to match semantics of lookup. */ ++ refcount_inc(&entry->refs); ++ ++ return entry; ++} ++ ++/* ++ * Add a new btrfs_ordered_extent for the range, but drop the reference instead ++ * of returning it to the caller. ++ */ ++int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type) ++{ ++ struct btrfs_ordered_extent *ordered; ++ ++ ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, ++ ram_bytes, disk_bytenr, ++ disk_num_bytes, offset, flags, ++ compress_type); ++ ++ if (IS_ERR(ordered)) ++ return PTR_ERR(ordered); ++ btrfs_put_ordered_extent(ordered); ++ + return 0; + } + +@@ -1088,39 +1116,37 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + return false; + } + +- +-static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, +- u64 len) +-{ +- struct inode *inode = ordered->inode; +- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; +- u64 file_offset = ordered->file_offset + pos; +- u64 disk_bytenr = ordered->disk_bytenr + pos; +- unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; +- +- /* +- * The splitting extent is already counted and will be added again in +- * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting. +- */ +- percpu_counter_add_batch(&fs_info->ordered_bytes, -len, +- fs_info->delalloc_batch); +- WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED)); +- return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, +- disk_bytenr, len, 0, flags, +- ordered->compress_type); +-} +- +-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, +- u64 post) ++/* Split out a new ordered extent for this first @len bytes of @ordered. 
*/ ++int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) + { + struct inode *inode = ordered->inode; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; +- struct rb_node *node; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- int ret = 0; ++ u64 file_offset = ordered->file_offset; ++ u64 disk_bytenr = ordered->disk_bytenr; ++ unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; ++ struct rb_node *node; + + trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + ++ ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED))); ++ ++ /* ++ * The entire bio must be covered by the ordered extent, but we can't ++ * reduce the original extent to a zero length either. ++ */ ++ if (WARN_ON_ONCE(len >= ordered->num_bytes)) ++ return -EINVAL; ++ /* We cannot split once ordered extent is past end_bio. */ ++ if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) ++ return -EINVAL; ++ /* We cannot split a compressed ordered extent. */ ++ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) ++ return -EINVAL; ++ /* Checksum list should be empty. */ ++ if (WARN_ON_ONCE(!list_empty(&ordered->list))) ++ return -EINVAL; ++ + spin_lock_irq(&tree->lock); + /* Remove from tree once */ + node = &ordered->rb_node; +@@ -1129,11 +1155,11 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + if (tree->last == node) + tree->last = NULL; + +- ordered->file_offset += pre; +- ordered->disk_bytenr += pre; +- ordered->num_bytes -= (pre + post); +- ordered->disk_num_bytes -= (pre + post); +- ordered->bytes_left -= (pre + post); ++ ordered->file_offset += len; ++ ordered->disk_bytenr += len; ++ ordered->num_bytes -= len; ++ ordered->disk_num_bytes -= len; ++ ordered->bytes_left -= len; + + /* Re-insert the node */ + node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); +@@ -1144,13 +1170,15 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + + spin_unlock_irq(&tree->lock); + +- if (pre) +- ret = clone_ordered_extent(ordered, 0, pre); +- if (ret == 0 && post) +- ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, +- post); ++ /* ++ * The splitting extent is already counted and will be added again in ++ * btrfs_add_ordered_extent(). Subtract len to avoid double counting. 
++ */ ++ percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); + +- return ret; ++ return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, ++ disk_bytenr, len, 0, flags, ++ ordered->compress_type); + } + + int __init ordered_data_init(void) +diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h +index eb40cb39f842..f0f1138d23c3 100644 +--- a/fs/btrfs/ordered-data.h ++++ b/fs/btrfs/ordered-data.h +@@ -178,9 +178,14 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size); ++struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( ++ struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type); + int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, +- u64 disk_num_bytes, u64 offset, unsigned flags, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type); + void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); +@@ -207,8 +212,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, + struct extent_state **cached_state); + bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state); +-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, +- u64 post); ++int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len); + int __init ordered_data_init(void); + void __cold ordered_data_exit(void); + +diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c +index 642828c1b299..2fab37f062de 100644 +--- a/fs/btrfs/raid56.c ++++ b/fs/btrfs/raid56.c +@@ -202,7 +202,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) + */ + static int rbio_bucket(struct btrfs_raid_bio *rbio) + { +- u64 num = rbio->bioc->raid_map[0]; ++ u64 num = rbio->bioc->full_stripe_logical; + + /* + * we shift down quite a bit. 
We're using byte +@@ -407,16 +407,15 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) + static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) + { + struct btrfs_stripe_hash_table *table; +- unsigned long flags; + + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->bioc->fs_info->stripe_hash_table; + +- spin_lock_irqsave(&table->cache_lock, flags); ++ spin_lock(&table->cache_lock); + __remove_rbio_from_cache(rbio); +- spin_unlock_irqrestore(&table->cache_lock, flags); ++ spin_unlock(&table->cache_lock); + } + + /* +@@ -425,19 +424,18 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) + static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) + { + struct btrfs_stripe_hash_table *table; +- unsigned long flags; + struct btrfs_raid_bio *rbio; + + table = info->stripe_hash_table; + +- spin_lock_irqsave(&table->cache_lock, flags); ++ spin_lock(&table->cache_lock); + while (!list_empty(&table->stripe_cache)) { + rbio = list_entry(table->stripe_cache.next, + struct btrfs_raid_bio, + stripe_cache); + __remove_rbio_from_cache(rbio); + } +- spin_unlock_irqrestore(&table->cache_lock, flags); ++ spin_unlock(&table->cache_lock); + } + + /* +@@ -467,14 +465,13 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) + static void cache_rbio(struct btrfs_raid_bio *rbio) + { + struct btrfs_stripe_hash_table *table; +- unsigned long flags; + + if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) + return; + + table = rbio->bioc->fs_info->stripe_hash_table; + +- spin_lock_irqsave(&table->cache_lock, flags); ++ spin_lock(&table->cache_lock); + spin_lock(&rbio->bio_list_lock); + + /* bump our ref if we were not in the list before */ +@@ -501,7 +498,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) + __remove_rbio_from_cache(found); + } + +- spin_unlock_irqrestore(&table->cache_lock, flags); ++ spin_unlock(&table->cache_lock); + } + + /* +@@ -530,15 +527,14 @@ static void run_xor(void **pages, int src_cnt, ssize_t len) + */ + static int rbio_is_full(struct btrfs_raid_bio *rbio) + { +- unsigned long flags; + unsigned long size = rbio->bio_list_bytes; + int ret = 1; + +- spin_lock_irqsave(&rbio->bio_list_lock, flags); ++ spin_lock(&rbio->bio_list_lock); + if (size != rbio->nr_data * BTRFS_STRIPE_LEN) + ret = 0; + BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); +- spin_unlock_irqrestore(&rbio->bio_list_lock, flags); ++ spin_unlock(&rbio->bio_list_lock); + + return ret; + } +@@ -571,7 +567,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, + test_bit(RBIO_CACHE_BIT, &cur->flags)) + return 0; + +- if (last->bioc->raid_map[0] != cur->bioc->raid_map[0]) ++ if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) + return 0; + + /* we can't merge with different operations */ +@@ -657,16 +653,15 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) + struct btrfs_stripe_hash *h; + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *pending; +- unsigned long flags; + struct btrfs_raid_bio *freeit = NULL; + struct btrfs_raid_bio *cache_drop = NULL; + int ret = 0; + + h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); + +- spin_lock_irqsave(&h->lock, flags); ++ spin_lock(&h->lock); + list_for_each_entry(cur, &h->hash_list, hash_list) { +- if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0]) ++ if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) + continue; + + spin_lock(&cur->bio_list_lock); +@@ -724,7 +719,7 @@ static noinline int lock_stripe_add(struct 
btrfs_raid_bio *rbio) + refcount_inc(&rbio->refs); + list_add(&rbio->hash_list, &h->hash_list); + out: +- spin_unlock_irqrestore(&h->lock, flags); ++ spin_unlock(&h->lock); + if (cache_drop) + remove_rbio_from_cache(cache_drop); + if (freeit) +@@ -742,7 +737,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) + { + int bucket; + struct btrfs_stripe_hash *h; +- unsigned long flags; + int keep_cache = 0; + + bucket = rbio_bucket(rbio); +@@ -751,7 +745,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) + if (list_empty(&rbio->plug_list)) + cache_rbio(rbio); + +- spin_lock_irqsave(&h->lock, flags); ++ spin_lock(&h->lock); + spin_lock(&rbio->bio_list_lock); + + if (!list_empty(&rbio->hash_list)) { +@@ -788,7 +782,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) + list_add(&next->hash_list, &h->hash_list); + refcount_inc(&next->refs); + spin_unlock(&rbio->bio_list_lock); +- spin_unlock_irqrestore(&h->lock, flags); ++ spin_unlock(&h->lock); + + if (next->operation == BTRFS_RBIO_READ_REBUILD) + start_async_work(next, recover_rbio_work_locked); +@@ -808,7 +802,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) + } + done: + spin_unlock(&rbio->bio_list_lock); +- spin_unlock_irqrestore(&h->lock, flags); ++ spin_unlock(&h->lock); + + done_nolock: + if (!keep_cache) +@@ -891,16 +885,16 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, + index = stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT(index >= 0 && index < rbio->nr_sectors); + +- spin_lock_irq(&rbio->bio_list_lock); ++ spin_lock(&rbio->bio_list_lock); + sector = &rbio->bio_sectors[index]; + if (sector->page || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (!sector->page) + sector = NULL; +- spin_unlock_irq(&rbio->bio_list_lock); ++ spin_unlock(&rbio->bio_list_lock); + return sector; + } +- spin_unlock_irq(&rbio->bio_list_lock); ++ spin_unlock(&rbio->bio_list_lock); + + return &rbio->stripe_sectors[index]; + } +@@ -912,7 +906,7 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, + static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, + struct btrfs_io_context *bioc) + { +- const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; ++ const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; + const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; + const unsigned int num_pages = stripe_npages * real_stripes; + const unsigned int stripe_nsectors = +@@ -1108,7 +1102,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, + bio->bi_iter.bi_sector = disk_start >> 9; + bio->bi_private = rbio; + +- bio_add_page(bio, sector->page, sectorsize, sector->pgoff); ++ __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); + bio_list_add(bio_list, bio); + return 0; + } +@@ -1119,7 +1113,7 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) + struct bio_vec bvec; + struct bvec_iter iter; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - +- rbio->bioc->raid_map[0]; ++ rbio->bioc->full_stripe_logical; + + bio_for_each_segment(bvec, bio, iter) { + u32 bvec_offset; +@@ -1148,11 +1142,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) + { + struct bio *bio; + +- spin_lock_irq(&rbio->bio_list_lock); ++ spin_lock(&rbio->bio_list_lock); + bio_list_for_each(bio, &rbio->bio_list) + index_one_bio(rbio, bio); + +- spin_unlock_irq(&rbio->bio_list_lock); ++ spin_unlock(&rbio->bio_list_lock); + } + 
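
 The raid56 hunks in this area drop the per-stripe raid_map[] array in favour of a single
 bioc->full_stripe_logical value, and index_one_bio() above derives a bio's placement inside
 the full stripe from it. As a rough standalone illustration only (every ex_-prefixed name is
 invented for this sketch; just the 64KiB stripe length and the 512-byte sector shift are taken
 from the surrounding code), the arithmetic amounts to:

	#include <stdint.h>

	#define EX_STRIPE_LEN   (64 * 1024)     /* mirrors BTRFS_STRIPE_LEN */
	#define EX_SECTOR_SHIFT 9               /* 512-byte sectors */

	struct ex_position {
		uint32_t stripe_nr;     /* which data stripe inside the full stripe */
		uint32_t offset;        /* byte offset inside that stripe */
	};

	/* Locate a bio inside a full stripe given only its start sector. */
	static struct ex_position ex_locate(uint64_t full_stripe_logical,
					    uint64_t bi_sector)
	{
		uint64_t offset = (bi_sector << EX_SECTOR_SHIFT) - full_stripe_logical;
		struct ex_position pos = {
			.stripe_nr = (uint32_t)(offset / EX_STRIPE_LEN),
			.offset    = (uint32_t)(offset % EX_STRIPE_LEN),
		};

		return pos;
	}
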
+ static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, +@@ -1282,10 +1276,16 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + goto error; + } + +- if (likely(!rbio->bioc->num_tgtdevs)) ++ if (likely(!rbio->bioc->replace_nr_stripes)) + return 0; + +- /* Make a copy for the replace target device. */ ++ /* ++ * Make a copy for the replace target device. ++ * ++ * Thus the source stripe number (in replace_stripe_src) should be valid. ++ */ ++ ASSERT(rbio->bioc->replace_stripe_src >= 0); ++ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; +@@ -1293,7 +1293,12 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; + +- if (!rbio->bioc->tgtdev_map[stripe]) { ++ /* ++ * For RAID56, there is only one device that can be replaced, ++ * and replace_stripe_src[0] indicates the stripe number we ++ * need to copy from. ++ */ ++ if (stripe != rbio->bioc->replace_stripe_src) { + /* + * We can skip the whole stripe completely, note + * total_sector_nr will be increased by one anyway. +@@ -1316,7 +1321,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + } + + ret = rbio_add_io_sector(rbio, bio_list, sector, +- rbio->bioc->tgtdev_map[stripe], ++ rbio->real_stripes, + sectornr, REQ_OP_WRITE); + if (ret) + goto error; +@@ -1332,7 +1337,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) + { + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - +- rbio->bioc->raid_map[0]; ++ rbio->bioc->full_stripe_logical; + int total_nr_sector = offset >> fs_info->sectorsize_bits; + + ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); +@@ -1609,7 +1614,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) + { + const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; +- const u64 full_stripe_start = rbio->bioc->raid_map[0]; ++ const u64 full_stripe_start = rbio->bioc->full_stripe_logical; + const u32 orig_len = orig_bio->bi_iter.bi_size; + const u32 sectorsize = fs_info->sectorsize; + u64 cur_logical; +@@ -1796,9 +1801,8 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + * here due to a crc mismatch and we can't give them the + * data they want. + */ +- if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { +- if (rbio->bioc->raid_map[faila] == +- RAID5_P_STRIPE) ++ if (failb == rbio->real_stripes - 1) { ++ if (faila == rbio->real_stripes - 2) + /* + * Only P and Q are corrupted. 
+ * We only care about data stripes recovery, +@@ -1812,7 +1816,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + goto pstripe; + } + +- if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { ++ if (failb == rbio->real_stripes - 2) { + raid6_datap_recov(rbio->real_stripes, sectorsize, + faila, pointers); + } else { +@@ -1895,9 +1899,9 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) + + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { +- spin_lock_irq(&rbio->bio_list_lock); ++ spin_lock(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); +- spin_unlock_irq(&rbio->bio_list_lock); ++ spin_unlock(&rbio->bio_list_lock); + } + + index_rbio_pages(rbio); +@@ -2075,8 +2079,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) + { + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, +- rbio->bioc->raid_map[0]); +- const u64 start = rbio->bioc->raid_map[0]; ++ rbio->bioc->full_stripe_logical); ++ const u64 start = rbio->bioc->full_stripe_logical; + const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << + fs_info->sectorsize_bits; + int ret; +@@ -2109,7 +2113,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) + } + + ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, +- rbio->csum_buf, rbio->csum_bitmap); ++ rbio->csum_buf, rbio->csum_bitmap, false); + if (ret < 0) + goto error; + if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) +@@ -2124,7 +2128,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) + */ + btrfs_warn_rl(fs_info, + "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", +- rbio->bioc->raid_map[0], ret); ++ rbio->bioc->full_stripe_logical, ret); + no_csum: + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); +@@ -2265,9 +2269,9 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio) + * bio list any more, anyone else that wants to change this stripe + * needs to do their own rmw. + */ +- spin_lock_irq(&rbio->bio_list_lock); ++ spin_lock(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); +- spin_unlock_irq(&rbio->bio_list_lock); ++ spin_unlock(&rbio->bio_list_lock); + + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + +@@ -2372,23 +2376,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + return rbio; + } + +-/* Used for both parity scrub and missing. */ +-void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, +- unsigned int pgoff, u64 logical) +-{ +- const u32 sectorsize = rbio->bioc->fs_info->sectorsize; +- int stripe_offset; +- int index; +- +- ASSERT(logical >= rbio->bioc->raid_map[0]); +- ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + +- BTRFS_STRIPE_LEN * rbio->nr_data); +- stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); +- index = stripe_offset / sectorsize; +- rbio->bio_sectors[index].page = page; +- rbio->bio_sectors[index].pgoff = pgoff; +-} +- + /* + * We just scrub the parity that we have correct data on the same horizontal, + * so we needn't allocate all pages for all the stripes. +@@ -2442,7 +2429,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) + else + BUG(); + +- if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { ++ /* ++ * Replace is running and our P/Q stripe is being replaced, then we ++ * need to duplicate the final write to replace target. 
++ */ ++ if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { + is_replace = 1; + bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); + } +@@ -2544,13 +2535,18 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) + if (!is_replace) + goto submit_write; + ++ /* ++ * Replace is running and our parity stripe needs to be duplicated to ++ * the target device. Check we have a valid source stripe number. ++ */ ++ ASSERT(rbio->bioc->replace_stripe_src >= 0); + for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; + + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, +- bioc->tgtdev_map[rbio->scrubp], +- sectornr, REQ_OP_WRITE); ++ rbio->real_stripes, ++ sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; + } +@@ -2751,33 +2747,3 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) + if (!lock_stripe_add(rbio)) + start_async_work(rbio, scrub_rbio_work_locked); + } +- +-/* The following code is used for dev replace of a missing RAID 5/6 device. */ +- +-struct btrfs_raid_bio * +-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) +-{ +- struct btrfs_fs_info *fs_info = bioc->fs_info; +- struct btrfs_raid_bio *rbio; +- +- rbio = alloc_rbio(fs_info, bioc); +- if (IS_ERR(rbio)) +- return NULL; +- +- rbio->operation = BTRFS_RBIO_REBUILD_MISSING; +- bio_list_add(&rbio->bio_list, bio); +- /* +- * This is a special bio which is used to hold the completion handler +- * and make the scrub rbio is similar to the other types +- */ +- ASSERT(!bio->bi_iter.bi_size); +- +- set_rbio_range_error(rbio, bio); +- +- return rbio; +-} +- +-void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) +-{ +- start_async_work(rbio, recover_rbio_work); +-} +diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h +index df0e0abdeb1f..0f7f31c8cb98 100644 +--- a/fs/btrfs/raid56.h ++++ b/fs/btrfs/raid56.h +@@ -170,6 +170,11 @@ static inline int nr_data_stripes(const struct map_lookup *map) + return map->num_stripes - btrfs_nr_parity_stripes(map->type); + } + ++static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc) ++{ ++ return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type); ++} ++ + #define RAID5_P_STRIPE ((u64)-2) + #define RAID6_Q_STRIPE ((u64)-1) + +@@ -182,19 +187,12 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num); + void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); + +-void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, +- unsigned int pgoff, u64 logical); +- + struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + struct btrfs_io_context *bioc, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors); + void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); + +-struct btrfs_raid_bio * +-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc); +-void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); +- + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); + void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); + +diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c +index ef13a9d4e370..09b1988d1791 100644 +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -1266,7 +1266,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, + level = btrfs_header_level(parent); + 
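
 The relocation hunks just below now pass an explicit starting slot (0 here) to
 btrfs_bin_search(). As a rough, self-contained model of that interface -- every ex_-prefixed
 name is invented for illustration, and the real kernel routine searches btrfs keys inside an
 extent buffer rather than a flat array -- a lower-bound binary search whose window begins at
 first_slot instead of 0 looks like this:

	/*
	 * Returns 0 if target is found (with *slot set to its position),
	 * 1 if not found (*slot is where it would be inserted), -1 on a
	 * bad first_slot. Purely illustrative; assumes keys[] is sorted.
	 */
	static int ex_bin_search(const unsigned long *keys, int nr_items,
				 int first_slot, unsigned long target, int *slot)
	{
		int low = first_slot;
		int high = nr_items;

		if (first_slot < 0 || first_slot > nr_items)
			return -1;

		while (low < high) {
			int mid = low + (high - low) / 2;

			if (keys[mid] < target)
				low = mid + 1;
			else
				high = mid;
		}
		*slot = low;
		return (low < nr_items && keys[low] == target) ? 0 : 1;
	}
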
ASSERT(level >= lowest_level); + +- ret = btrfs_bin_search(parent, &key, &slot); ++ ret = btrfs_bin_search(parent, 0, &key, &slot); + if (ret < 0) + break; + if (ret && slot > 0) +@@ -2407,7 +2407,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, + + if (upper->eb && !upper->locked) { + if (!lowest) { +- ret = btrfs_bin_search(upper->eb, key, &slot); ++ ret = btrfs_bin_search(upper->eb, 0, key, &slot); + if (ret < 0) + goto next; + BUG_ON(ret); +@@ -2441,7 +2441,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, + slot = path->slots[upper->level]; + btrfs_release_path(path); + } else { +- ret = btrfs_bin_search(upper->eb, key, &slot); ++ ret = btrfs_bin_search(upper->eb, 0, key, &slot); + if (ret < 0) + goto next; + BUG_ON(ret); +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 69c93ae333f6..836725a19661 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -38,18 +38,14 @@ + * - add a mode to also read unallocated space + */ + +-struct scrub_block; + struct scrub_ctx; + + /* +- * The following three values only influence the performance. ++ * The following value only influences the performance. + * +- * The last one configures the number of parallel and outstanding I/O +- * operations. The first one configures an upper limit for the number +- * of (dynamically allocated) pages that are added to a bio. ++ * This determines the batch size for stripe submitted in one go. + */ +-#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */ +-#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */ ++#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */ + + /* + * The following value times PAGE_SIZE needs to be large enough to match the +@@ -57,128 +53,124 @@ struct scrub_ctx; + */ + #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) + +-#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE)) ++/* Represent one sector and its needed info to verify the content. */ ++struct scrub_sector_verification { ++ bool is_metadata; + +-/* +- * Maximum number of mirrors that can be available for all profiles counting +- * the target device of dev-replace as one. During an active device replace +- * procedure, the target device of the copy operation is a mirror for the +- * filesystem data as well that can be used to read data in order to repair +- * read errors on other disks. +- * +- * Current value is derived from RAID1C4 with 4 copies. +- */ +-#define BTRFS_MAX_MIRRORS (4 + 1) ++ union { ++ /* ++ * Csum pointer for data csum verification. Should point to a ++ * sector csum inside scrub_stripe::csums. ++ * ++ * NULL if this data sector has no csum. ++ */ ++ u8 *csum; + +-struct scrub_recover { +- refcount_t refs; +- struct btrfs_io_context *bioc; +- u64 map_length; ++ /* ++ * Extra info for metadata verification. All sectors inside a ++ * tree block share the same generation. ++ */ ++ u64 generation; ++ }; + }; + +-struct scrub_sector { +- struct scrub_block *sblock; +- struct list_head list; +- u64 flags; /* extent flags */ +- u64 generation; +- /* Offset in bytes to @sblock. */ +- u32 offset; +- atomic_t refs; +- unsigned int have_csum:1; +- unsigned int io_error:1; +- u8 csum[BTRFS_CSUM_SIZE]; +- +- struct scrub_recover *recover; +-}; ++enum scrub_stripe_flags { ++ /* Set when @mirror_num, @dev, @physical and @logical are set. 
*/ ++ SCRUB_STRIPE_FLAG_INITIALIZED, + +-struct scrub_bio { +- int index; +- struct scrub_ctx *sctx; +- struct btrfs_device *dev; +- struct bio *bio; +- blk_status_t status; +- u64 logical; +- u64 physical; +- struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO]; +- int sector_count; +- int next_free; +- struct work_struct work; +-}; ++ /* Set when the read-repair is finished. */ ++ SCRUB_STRIPE_FLAG_REPAIR_DONE, + +-struct scrub_block { + /* +- * Each page will have its page::private used to record the logical +- * bytenr. ++ * Set for data stripes if it's triggered from P/Q stripe. ++ * During such scrub, we should not report errors in data stripes, nor ++ * update the accounting. + */ +- struct page *pages[SCRUB_MAX_PAGES]; +- struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK]; +- struct btrfs_device *dev; +- /* Logical bytenr of the sblock */ +- u64 logical; +- u64 physical; +- u64 physical_for_dev_replace; +- /* Length of sblock in bytes */ +- u32 len; +- int sector_count; +- int mirror_num; +- +- atomic_t outstanding_sectors; +- refcount_t refs; /* free mem on transition to zero */ +- struct scrub_ctx *sctx; +- struct scrub_parity *sparity; +- struct { +- unsigned int header_error:1; +- unsigned int checksum_error:1; +- unsigned int no_io_error_seen:1; +- unsigned int generation_error:1; /* also sets header_error */ +- +- /* The following is for the data used to check parity */ +- /* It is for the data with checksum */ +- unsigned int data_corrected:1; +- }; +- struct work_struct work; ++ SCRUB_STRIPE_FLAG_NO_REPORT, + }; + +-/* Used for the chunks with parity stripe such RAID5/6 */ +-struct scrub_parity { +- struct scrub_ctx *sctx; ++#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) ++ ++/* ++ * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. ++ */ ++struct scrub_stripe { ++ struct scrub_ctx *sctx; ++ struct btrfs_block_group *bg; ++ ++ struct page *pages[SCRUB_STRIPE_PAGES]; ++ struct scrub_sector_verification *sectors; ++ ++ struct btrfs_device *dev; ++ u64 logical; ++ u64 physical; + +- struct btrfs_device *scrub_dev; ++ u16 mirror_num; + +- u64 logic_start; ++ /* Should be BTRFS_STRIPE_LEN / sectorsize. */ ++ u16 nr_sectors; ++ ++ /* ++ * How many data/meta extents are in this stripe. Only for scrub status ++ * reporting purposes. ++ */ ++ u16 nr_data_extents; ++ u16 nr_meta_extents; + +- u64 logic_end; ++ atomic_t pending_io; ++ wait_queue_head_t io_wait; ++ wait_queue_head_t repair_wait; + +- int nsectors; ++ /* ++ * Indicate the states of the stripe. Bits are defined in ++ * scrub_stripe_flags enum. ++ */ ++ unsigned long state; + +- u32 stripe_len; ++ /* Indicate which sectors are covered by extent items. */ ++ unsigned long extent_sector_bitmap; + +- refcount_t refs; ++ /* ++ * The errors hit during the initial read of the stripe. ++ * ++ * Would be utilized for error reporting and repair. ++ */ ++ unsigned long init_error_bitmap; + +- struct list_head sectors_list; ++ /* ++ * The following error bitmaps are all for the current status. ++ * Every time we submit a new read, these bitmaps may be updated. ++ * ++ * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; ++ * ++ * IO and csum errors can happen for both metadata and data. ++ */ ++ unsigned long error_bitmap; ++ unsigned long io_error_bitmap; ++ unsigned long csum_error_bitmap; ++ unsigned long meta_error_bitmap; + +- /* Work of parity check and repair */ +- struct work_struct work; ++ /* For writeback (repair or replace) error reporting. 
*/ ++ unsigned long write_error_bitmap; + +- /* Mark the parity blocks which have data */ +- unsigned long dbitmap; ++ /* Writeback can be concurrent, thus we need to protect the bitmap. */ ++ spinlock_t write_error_lock; + + /* +- * Mark the parity blocks which have data, but errors happen when +- * read data or check data ++ * Checksum for the whole stripe if this stripe is inside a data block ++ * group. + */ +- unsigned long ebitmap; ++ u8 *csums; ++ ++ struct work_struct work; + }; + + struct scrub_ctx { +- struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; ++ struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX]; ++ struct scrub_stripe *raid56_data_stripes; + struct btrfs_fs_info *fs_info; + int first_free; +- int curr; +- atomic_t bios_in_flight; +- atomic_t workers_pending; +- spinlock_t list_lock; +- wait_queue_head_t list_wait; ++ int cur_stripe; + struct list_head csum_list; + atomic_t cancel_req; + int readonly; +@@ -191,10 +183,8 @@ struct scrub_ctx { + int is_dev_replace; + u64 write_pointer; + +- struct scrub_bio *wr_curr_bio; + struct mutex wr_lock; + struct btrfs_device *wr_tgtdev; +- bool flush_all_writes; + + /* + * statistics +@@ -221,239 +211,66 @@ struct scrub_warning { + struct btrfs_device *dev; + }; + +-struct full_stripe_lock { +- struct rb_node node; +- u64 logical; +- u64 refs; +- struct mutex mutex; +-}; +- +-#ifndef CONFIG_64BIT +-/* This structure is for architectures whose (void *) is smaller than u64 */ +-struct scrub_page_private { +- u64 logical; +-}; +-#endif +- +-static int attach_scrub_page_private(struct page *page, u64 logical) +-{ +-#ifdef CONFIG_64BIT +- attach_page_private(page, (void *)logical); +- return 0; +-#else +- struct scrub_page_private *spp; +- +- spp = kmalloc(sizeof(*spp), GFP_KERNEL); +- if (!spp) +- return -ENOMEM; +- spp->logical = logical; +- attach_page_private(page, (void *)spp); +- return 0; +-#endif +-} +- +-static void detach_scrub_page_private(struct page *page) +-{ +-#ifdef CONFIG_64BIT +- detach_page_private(page); +- return; +-#else +- struct scrub_page_private *spp; +- +- spp = detach_page_private(page); +- kfree(spp); +- return; +-#endif +-} +- +-static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx, +- struct btrfs_device *dev, +- u64 logical, u64 physical, +- u64 physical_for_dev_replace, +- int mirror_num) +-{ +- struct scrub_block *sblock; +- +- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); +- if (!sblock) +- return NULL; +- refcount_set(&sblock->refs, 1); +- sblock->sctx = sctx; +- sblock->logical = logical; +- sblock->physical = physical; +- sblock->physical_for_dev_replace = physical_for_dev_replace; +- sblock->dev = dev; +- sblock->mirror_num = mirror_num; +- sblock->no_io_error_seen = 1; +- /* +- * Scrub_block::pages will be allocated at alloc_scrub_sector() when +- * the corresponding page is not allocated. +- */ +- return sblock; +-} +- +-/* +- * Allocate a new scrub sector and attach it to @sblock. +- * +- * Will also allocate new pages for @sblock if needed. +- */ +-static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, +- u64 logical) ++static void release_scrub_stripe(struct scrub_stripe *stripe) + { +- const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT; +- struct scrub_sector *ssector; +- +- /* We must never have scrub_block exceed U32_MAX in size. 
*/ +- ASSERT(logical - sblock->logical < U32_MAX); +- +- ssector = kzalloc(sizeof(*ssector), GFP_KERNEL); +- if (!ssector) +- return NULL; +- +- /* Allocate a new page if the slot is not allocated */ +- if (!sblock->pages[page_index]) { +- int ret; ++ if (!stripe) ++ return; + +- sblock->pages[page_index] = alloc_page(GFP_KERNEL); +- if (!sblock->pages[page_index]) { +- kfree(ssector); +- return NULL; +- } +- ret = attach_scrub_page_private(sblock->pages[page_index], +- sblock->logical + (page_index << PAGE_SHIFT)); +- if (ret < 0) { +- kfree(ssector); +- __free_page(sblock->pages[page_index]); +- sblock->pages[page_index] = NULL; +- return NULL; +- } ++ for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) { ++ if (stripe->pages[i]) ++ __free_page(stripe->pages[i]); ++ stripe->pages[i] = NULL; + } +- +- atomic_set(&ssector->refs, 1); +- ssector->sblock = sblock; +- /* The sector to be added should not be used */ +- ASSERT(sblock->sectors[sblock->sector_count] == NULL); +- ssector->offset = logical - sblock->logical; +- +- /* The sector count must be smaller than the limit */ +- ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK); +- +- sblock->sectors[sblock->sector_count] = ssector; +- sblock->sector_count++; +- sblock->len += sblock->sctx->fs_info->sectorsize; +- +- return ssector; +-} +- +-static struct page *scrub_sector_get_page(struct scrub_sector *ssector) +-{ +- struct scrub_block *sblock = ssector->sblock; +- pgoff_t index; +- /* +- * When calling this function, ssector must be alreaday attached to the +- * parent sblock. +- */ +- ASSERT(sblock); +- +- /* The range should be inside the sblock range */ +- ASSERT(ssector->offset < sblock->len); +- +- index = ssector->offset >> PAGE_SHIFT; +- ASSERT(index < SCRUB_MAX_PAGES); +- ASSERT(sblock->pages[index]); +- ASSERT(PagePrivate(sblock->pages[index])); +- return sblock->pages[index]; ++ kfree(stripe->sectors); ++ kfree(stripe->csums); ++ stripe->sectors = NULL; ++ stripe->csums = NULL; ++ stripe->sctx = NULL; ++ stripe->state = 0; + } + +-static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector) ++static int init_scrub_stripe(struct btrfs_fs_info *fs_info, ++ struct scrub_stripe *stripe) + { +- struct scrub_block *sblock = ssector->sblock; ++ int ret; + +- /* +- * When calling this function, ssector must be already attached to the +- * parent sblock. 
+- */ +- ASSERT(sblock); ++ memset(stripe, 0, sizeof(*stripe)); + +- /* The range should be inside the sblock range */ +- ASSERT(ssector->offset < sblock->len); ++ stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; ++ stripe->state = 0; + +- return offset_in_page(ssector->offset); +-} ++ init_waitqueue_head(&stripe->io_wait); ++ init_waitqueue_head(&stripe->repair_wait); ++ atomic_set(&stripe->pending_io, 0); ++ spin_lock_init(&stripe->write_error_lock); + +-static char *scrub_sector_get_kaddr(struct scrub_sector *ssector) +-{ +- return page_address(scrub_sector_get_page(ssector)) + +- scrub_sector_get_page_offset(ssector); ++ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages); ++ if (ret < 0) ++ goto error; ++ ++ stripe->sectors = kcalloc(stripe->nr_sectors, ++ sizeof(struct scrub_sector_verification), ++ GFP_KERNEL); ++ if (!stripe->sectors) ++ goto error; ++ ++ stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits, ++ fs_info->csum_size, GFP_KERNEL); ++ if (!stripe->csums) ++ goto error; ++ return 0; ++error: ++ release_scrub_stripe(stripe); ++ return -ENOMEM; + } + +-static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector, +- unsigned int len) ++static void wait_scrub_stripe_io(struct scrub_stripe *stripe) + { +- return bio_add_page(bio, scrub_sector_get_page(ssector), len, +- scrub_sector_get_page_offset(ssector)); ++ wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0); + } + +-static int scrub_setup_recheck_block(struct scrub_block *original_sblock, +- struct scrub_block *sblocks_for_recheck[]); +-static void scrub_recheck_block(struct btrfs_fs_info *fs_info, +- struct scrub_block *sblock, +- int retry_failed_mirror); +-static void scrub_recheck_block_checksum(struct scrub_block *sblock); +-static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, +- struct scrub_block *sblock_good); +-static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, +- struct scrub_block *sblock_good, +- int sector_num, int force_write); +-static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); +-static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, +- int sector_num); +-static int scrub_checksum_data(struct scrub_block *sblock); +-static int scrub_checksum_tree_block(struct scrub_block *sblock); +-static int scrub_checksum_super(struct scrub_block *sblock); +-static void scrub_block_put(struct scrub_block *sblock); +-static void scrub_sector_get(struct scrub_sector *sector); +-static void scrub_sector_put(struct scrub_sector *sector); +-static void scrub_parity_get(struct scrub_parity *sparity); +-static void scrub_parity_put(struct scrub_parity *sparity); +-static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, +- u64 physical, struct btrfs_device *dev, u64 flags, +- u64 gen, int mirror_num, u8 *csum, +- u64 physical_for_dev_replace); +-static void scrub_bio_end_io(struct bio *bio); +-static void scrub_bio_end_io_worker(struct work_struct *work); +-static void scrub_block_complete(struct scrub_block *sblock); +-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, +- u64 extent_logical, u32 extent_len, +- u64 *extent_physical, +- struct btrfs_device **extent_dev, +- int *extent_mirror_num); +-static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, +- struct scrub_sector *sector); +-static void scrub_wr_submit(struct scrub_ctx *sctx); +-static void scrub_wr_bio_end_io(struct bio *bio); +-static void 
scrub_wr_bio_end_io_worker(struct work_struct *work); + static void scrub_put_ctx(struct scrub_ctx *sctx); + +-static inline int scrub_is_page_on_raid56(struct scrub_sector *sector) +-{ +- return sector->recover && +- (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); +-} +- +-static void scrub_pending_bio_inc(struct scrub_ctx *sctx) +-{ +- refcount_inc(&sctx->refs); +- atomic_inc(&sctx->bios_in_flight); +-} +- +-static void scrub_pending_bio_dec(struct scrub_ctx *sctx) +-{ +- atomic_dec(&sctx->bios_in_flight); +- wake_up(&sctx->list_wait); +- scrub_put_ctx(sctx); +-} +- + static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) + { + while (atomic_read(&fs_info->scrub_pause_req)) { +@@ -486,223 +303,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) + scrub_pause_off(fs_info); + } + +-/* +- * Insert new full stripe lock into full stripe locks tree +- * +- * Return pointer to existing or newly inserted full_stripe_lock structure if +- * everything works well. +- * Return ERR_PTR(-ENOMEM) if we failed to allocate memory +- * +- * NOTE: caller must hold full_stripe_locks_root->lock before calling this +- * function +- */ +-static struct full_stripe_lock *insert_full_stripe_lock( +- struct btrfs_full_stripe_locks_tree *locks_root, +- u64 fstripe_logical) +-{ +- struct rb_node **p; +- struct rb_node *parent = NULL; +- struct full_stripe_lock *entry; +- struct full_stripe_lock *ret; +- +- lockdep_assert_held(&locks_root->lock); +- +- p = &locks_root->root.rb_node; +- while (*p) { +- parent = *p; +- entry = rb_entry(parent, struct full_stripe_lock, node); +- if (fstripe_logical < entry->logical) { +- p = &(*p)->rb_left; +- } else if (fstripe_logical > entry->logical) { +- p = &(*p)->rb_right; +- } else { +- entry->refs++; +- return entry; +- } +- } +- +- /* +- * Insert new lock. +- */ +- ret = kmalloc(sizeof(*ret), GFP_KERNEL); +- if (!ret) +- return ERR_PTR(-ENOMEM); +- ret->logical = fstripe_logical; +- ret->refs = 1; +- mutex_init(&ret->mutex); +- +- rb_link_node(&ret->node, parent, p); +- rb_insert_color(&ret->node, &locks_root->root); +- return ret; +-} +- +-/* +- * Search for a full stripe lock of a block group +- * +- * Return pointer to existing full stripe lock if found +- * Return NULL if not found +- */ +-static struct full_stripe_lock *search_full_stripe_lock( +- struct btrfs_full_stripe_locks_tree *locks_root, +- u64 fstripe_logical) +-{ +- struct rb_node *node; +- struct full_stripe_lock *entry; +- +- lockdep_assert_held(&locks_root->lock); +- +- node = locks_root->root.rb_node; +- while (node) { +- entry = rb_entry(node, struct full_stripe_lock, node); +- if (fstripe_logical < entry->logical) +- node = node->rb_left; +- else if (fstripe_logical > entry->logical) +- node = node->rb_right; +- else +- return entry; +- } +- return NULL; +-} +- +-/* +- * Helper to get full stripe logical from a normal bytenr. +- * +- * Caller must ensure @cache is a RAID56 block group. +- */ +-static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr) +-{ +- u64 ret; +- +- /* +- * Due to chunk item size limit, full stripe length should not be +- * larger than U32_MAX. Just a sanity check here. +- */ +- WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX); +- +- /* +- * round_down() can only handle power of 2, while RAID56 full +- * stripe length can be 64KiB * n, so we need to manually round down. 
+- */ +- ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) * +- cache->full_stripe_len + cache->start; +- return ret; +-} +- +-/* +- * Lock a full stripe to avoid concurrency of recovery and read +- * +- * It's only used for profiles with parities (RAID5/6), for other profiles it +- * does nothing. +- * +- * Return 0 if we locked full stripe covering @bytenr, with a mutex held. +- * So caller must call unlock_full_stripe() at the same context. +- * +- * Return <0 if encounters error. +- */ +-static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, +- bool *locked_ret) +-{ +- struct btrfs_block_group *bg_cache; +- struct btrfs_full_stripe_locks_tree *locks_root; +- struct full_stripe_lock *existing; +- u64 fstripe_start; +- int ret = 0; +- +- *locked_ret = false; +- bg_cache = btrfs_lookup_block_group(fs_info, bytenr); +- if (!bg_cache) { +- ASSERT(0); +- return -ENOENT; +- } +- +- /* Profiles not based on parity don't need full stripe lock */ +- if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) +- goto out; +- locks_root = &bg_cache->full_stripe_locks_root; +- +- fstripe_start = get_full_stripe_logical(bg_cache, bytenr); +- +- /* Now insert the full stripe lock */ +- mutex_lock(&locks_root->lock); +- existing = insert_full_stripe_lock(locks_root, fstripe_start); +- mutex_unlock(&locks_root->lock); +- if (IS_ERR(existing)) { +- ret = PTR_ERR(existing); +- goto out; +- } +- mutex_lock(&existing->mutex); +- *locked_ret = true; +-out: +- btrfs_put_block_group(bg_cache); +- return ret; +-} +- +-/* +- * Unlock a full stripe. +- * +- * NOTE: Caller must ensure it's the same context calling corresponding +- * lock_full_stripe(). +- * +- * Return 0 if we unlock full stripe without problem. +- * Return <0 for error +- */ +-static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, +- bool locked) +-{ +- struct btrfs_block_group *bg_cache; +- struct btrfs_full_stripe_locks_tree *locks_root; +- struct full_stripe_lock *fstripe_lock; +- u64 fstripe_start; +- bool freeit = false; +- int ret = 0; +- +- /* If we didn't acquire full stripe lock, no need to continue */ +- if (!locked) +- return 0; +- +- bg_cache = btrfs_lookup_block_group(fs_info, bytenr); +- if (!bg_cache) { +- ASSERT(0); +- return -ENOENT; +- } +- if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) +- goto out; +- +- locks_root = &bg_cache->full_stripe_locks_root; +- fstripe_start = get_full_stripe_logical(bg_cache, bytenr); +- +- mutex_lock(&locks_root->lock); +- fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start); +- /* Unpaired unlock_full_stripe() detected */ +- if (!fstripe_lock) { +- WARN_ON(1); +- ret = -ENOENT; +- mutex_unlock(&locks_root->lock); +- goto out; +- } +- +- if (fstripe_lock->refs == 0) { +- WARN_ON(1); +- btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow", +- fstripe_lock->logical); +- } else { +- fstripe_lock->refs--; +- } +- +- if (fstripe_lock->refs == 0) { +- rb_erase(&fstripe_lock->node, &locks_root->root); +- freeit = true; +- } +- mutex_unlock(&locks_root->lock); +- +- mutex_unlock(&fstripe_lock->mutex); +- if (freeit) +- kfree(fstripe_lock); +-out: +- btrfs_put_block_group(bg_cache); +- return ret; +-} +- + static void scrub_free_csums(struct scrub_ctx *sctx) + { + while (!list_empty(&sctx->csum_list)) { +@@ -721,24 +321,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) + if (!sctx) + return; + +- /* this can happen when scrub is cancelled */ +- if (sctx->curr != -1) { +- struct scrub_bio *sbio = 
sctx->bios[sctx->curr]; +- +- for (i = 0; i < sbio->sector_count; i++) +- scrub_block_put(sbio->sectors[i]->sblock); +- bio_put(sbio->bio); +- } +- +- for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { +- struct scrub_bio *sbio = sctx->bios[i]; +- +- if (!sbio) +- break; +- kfree(sbio); +- } ++ for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) ++ release_scrub_stripe(&sctx->stripes[i]); + +- kfree(sctx->wr_curr_bio); + scrub_free_csums(sctx); + kfree(sctx); + } +@@ -760,45 +345,26 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( + goto nomem; + refcount_set(&sctx->refs, 1); + sctx->is_dev_replace = is_dev_replace; +- sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO; +- sctx->curr = -1; + sctx->fs_info = fs_info; + INIT_LIST_HEAD(&sctx->csum_list); +- for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { +- struct scrub_bio *sbio; ++ for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { ++ int ret; + +- sbio = kzalloc(sizeof(*sbio), GFP_KERNEL); +- if (!sbio) ++ ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); ++ if (ret < 0) + goto nomem; +- sctx->bios[i] = sbio; +- +- sbio->index = i; +- sbio->sctx = sctx; +- sbio->sector_count = 0; +- INIT_WORK(&sbio->work, scrub_bio_end_io_worker); +- +- if (i != SCRUB_BIOS_PER_SCTX - 1) +- sctx->bios[i]->next_free = i + 1; +- else +- sctx->bios[i]->next_free = -1; ++ sctx->stripes[i].sctx = sctx; + } + sctx->first_free = 0; +- atomic_set(&sctx->bios_in_flight, 0); +- atomic_set(&sctx->workers_pending, 0); + atomic_set(&sctx->cancel_req, 0); + +- spin_lock_init(&sctx->list_lock); + spin_lock_init(&sctx->stat_lock); +- init_waitqueue_head(&sctx->list_wait); + sctx->throttle_deadline = 0; + +- WARN_ON(sctx->wr_curr_bio != NULL); + mutex_init(&sctx->wr_lock); +- sctx->wr_curr_bio = NULL; + if (is_dev_replace) { + WARN_ON(!fs_info->dev_replace.tgtdev); + sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; +- sctx->flush_all_writes = false; + } + + return sctx; +@@ -898,10 +464,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + return 0; + } + +-static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ++static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev, ++ bool is_super, u64 logical, u64 physical) + { +- struct btrfs_device *dev; +- struct btrfs_fs_info *fs_info; ++ struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *eb; +@@ -914,22 +480,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) + u8 ref_level = 0; + int ret; + +- WARN_ON(sblock->sector_count < 1); +- dev = sblock->dev; +- fs_info = sblock->sctx->fs_info; +- + /* Super block error, no need to search extent tree. 
*/ +- if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { ++ if (is_super) { + btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", +- errstr, btrfs_dev_name(dev), sblock->physical); ++ errstr, btrfs_dev_name(dev), physical); + return; + } + path = btrfs_alloc_path(); + if (!path) + return; + +- swarn.physical = sblock->physical; +- swarn.logical = sblock->logical; ++ swarn.physical = physical; ++ swarn.logical = logical; + swarn.errstr = errstr; + swarn.dev = NULL; + +@@ -978,1921 +540,717 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) + btrfs_free_path(path); + } + +-static inline void scrub_get_recover(struct scrub_recover *recover) ++static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) + { +- refcount_inc(&recover->refs); ++ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) ++ return 2; ++ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) ++ return 3; ++ else ++ return (int)bioc->num_stripes; + } + +-static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, +- struct scrub_recover *recover) ++static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, ++ u64 full_stripe_logical, ++ int nstripes, int mirror, ++ int *stripe_index, ++ u64 *stripe_offset) + { +- if (refcount_dec_and_test(&recover->refs)) { +- btrfs_bio_counter_dec(fs_info); +- btrfs_put_bioc(recover->bioc); +- kfree(recover); ++ int i; ++ ++ if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ++ const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ? ++ nstripes - 1 : nstripes - 2; ++ ++ /* RAID5/6 */ ++ for (i = 0; i < nr_data_stripes; i++) { ++ const u64 data_stripe_start = full_stripe_logical + ++ (i * BTRFS_STRIPE_LEN); ++ ++ if (logical >= data_stripe_start && ++ logical < data_stripe_start + BTRFS_STRIPE_LEN) ++ break; ++ } ++ ++ *stripe_index = i; ++ *stripe_offset = (logical - full_stripe_logical) & ++ BTRFS_STRIPE_LEN_MASK; ++ } else { ++ /* The other RAID type */ ++ *stripe_index = mirror; ++ *stripe_offset = 0; + } + } + +-/* +- * scrub_handle_errored_block gets called when either verification of the +- * sectors failed or the bio failed to read, e.g. with EIO. In the latter +- * case, this function handles all sectors in the bio, even though only one +- * may be bad. +- * The goal of this function is to repair the errored block by using the +- * contents of one of the mirrors. +- */ +-static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) ++static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) + { +- struct scrub_ctx *sctx = sblock_to_check->sctx; +- struct btrfs_device *dev = sblock_to_check->dev; +- struct btrfs_fs_info *fs_info; +- u64 logical; +- unsigned int failed_mirror_index; +- unsigned int is_metadata; +- unsigned int have_csum; +- /* One scrub_block for each mirror */ +- struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 }; +- struct scrub_block *sblock_bad; +- int ret; +- int mirror_index; +- int sector_num; +- int success; +- bool full_stripe_locked; +- unsigned int nofs_flag; +- static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, +- DEFAULT_RATELIMIT_BURST); ++ int ret = 0; ++ u64 length; + +- BUG_ON(sblock_to_check->sector_count < 1); +- fs_info = sctx->fs_info; +- if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { +- /* +- * If we find an error in a super block, we just report it. 
+- * They will get written with the next transaction commit +- * anyway +- */ +- scrub_print_warning("super block error", sblock_to_check); +- spin_lock(&sctx->stat_lock); +- ++sctx->stat.super_errors; +- spin_unlock(&sctx->stat_lock); +- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); ++ if (!btrfs_is_zoned(sctx->fs_info)) + return 0; +- } +- logical = sblock_to_check->logical; +- ASSERT(sblock_to_check->mirror_num); +- failed_mirror_index = sblock_to_check->mirror_num - 1; +- is_metadata = !(sblock_to_check->sectors[0]->flags & +- BTRFS_EXTENT_FLAG_DATA); +- have_csum = sblock_to_check->sectors[0]->have_csum; +- +- if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) ++ ++ if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) + return 0; + +- /* +- * We must use GFP_NOFS because the scrub task might be waiting for a +- * worker task executing this function and in turn a transaction commit +- * might be waiting the scrub task to pause (which needs to wait for all +- * the worker tasks to complete before pausing). +- * We do allocations in the workers through insert_full_stripe_lock() +- * and scrub_add_sector_to_wr_bio(), which happens down the call chain of +- * this function. +- */ +- nofs_flag = memalloc_nofs_save(); +- /* +- * For RAID5/6, race can happen for a different device scrub thread. +- * For data corruption, Parity and Data threads will both try +- * to recovery the data. +- * Race can lead to doubly added csum error, or even unrecoverable +- * error. +- */ +- ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); +- if (ret < 0) { +- memalloc_nofs_restore(nofs_flag); +- spin_lock(&sctx->stat_lock); +- if (ret == -ENOMEM) +- sctx->stat.malloc_errors++; +- sctx->stat.read_errors++; +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- return ret; ++ if (sctx->write_pointer < physical) { ++ length = physical - sctx->write_pointer; ++ ++ ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, ++ sctx->write_pointer, length); ++ if (!ret) ++ sctx->write_pointer = physical; + } ++ return ret; ++} + +- /* +- * read all mirrors one after the other. This includes to +- * re-read the extent or metadata block that failed (that was +- * the cause that this fixup code is called) another time, +- * sector by sector this time in order to know which sectors +- * caused I/O errors and which ones are good (for all mirrors). +- * It is the goal to handle the situation when more than one +- * mirror contains I/O errors, but the errors do not +- * overlap, i.e. the data can be repaired by selecting the +- * sectors from those mirrors without I/O error on the +- * particular sectors. One example (with blocks >= 2 * sectorsize) +- * would be that mirror #1 has an I/O error on the first sector, +- * the second sector is good, and mirror #2 has an I/O error on +- * the second sector, but the first sector is good. +- * Then the first sector of the first mirror can be repaired by +- * taking the first sector of the second mirror, and the +- * second sector of the second mirror can be repaired by +- * copying the contents of the 2nd sector of the 1st mirror. +- * One more note: if the sectors of one mirror contain I/O +- * errors, the checksum cannot be verified. In order to get +- * the best data for repairing, the first attempt is to find +- * a mirror without I/O errors and with a validated checksum. +- * Only if this is not possible, the sectors are picked from +- * mirrors with I/O errors without considering the checksum. 
+- * If the latter is the case, at the end, the checksum of the +- * repaired area is verified in order to correctly maintain +- * the statistics. +- */ +- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { +- /* +- * Note: the two members refs and outstanding_sectors are not +- * used in the blocks that are used for the recheck procedure. +- * +- * But alloc_scrub_block() will initialize sblock::ref anyway, +- * so we can use scrub_block_put() to clean them up. +- * +- * And here we don't setup the physical/dev for the sblock yet, +- * they will be correctly initialized in scrub_setup_recheck_block(). +- */ +- sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL, +- logical, 0, 0, mirror_index); +- if (!sblocks_for_recheck[mirror_index]) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- sctx->stat.read_errors++; +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); +- goto out; +- } +- } +- +- /* Setup the context, map the logical blocks and alloc the sectors */ +- ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); +- if (ret) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.read_errors++; +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); +- goto out; +- } +- BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); +- sblock_bad = sblocks_for_recheck[failed_mirror_index]; +- +- /* build and submit the bios for the failed mirror, check checksums */ +- scrub_recheck_block(fs_info, sblock_bad, 1); +- +- if (!sblock_bad->header_error && !sblock_bad->checksum_error && +- sblock_bad->no_io_error_seen) { +- /* +- * The error disappeared after reading sector by sector, or +- * the area was part of a huge bio and other parts of the +- * bio caused I/O errors, or the block layer merged several +- * read requests into one and the error is caused by a +- * different bio (usually one of the two latter cases is +- * the cause) +- */ +- spin_lock(&sctx->stat_lock); +- sctx->stat.unverified_errors++; +- sblock_to_check->data_corrected = 1; +- spin_unlock(&sctx->stat_lock); +- +- if (sctx->is_dev_replace) +- scrub_write_block_to_dev_replace(sblock_bad); +- goto out; +- } +- +- if (!sblock_bad->no_io_error_seen) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.read_errors++; +- spin_unlock(&sctx->stat_lock); +- if (__ratelimit(&rs)) +- scrub_print_warning("i/o error", sblock_to_check); +- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); +- } else if (sblock_bad->checksum_error) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.csum_errors++; +- spin_unlock(&sctx->stat_lock); +- if (__ratelimit(&rs)) +- scrub_print_warning("checksum error", sblock_to_check); +- btrfs_dev_stat_inc_and_print(dev, +- BTRFS_DEV_STAT_CORRUPTION_ERRS); +- } else if (sblock_bad->header_error) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.verify_errors++; +- spin_unlock(&sctx->stat_lock); +- if (__ratelimit(&rs)) +- scrub_print_warning("checksum/header error", +- sblock_to_check); +- if (sblock_bad->generation_error) +- btrfs_dev_stat_inc_and_print(dev, +- BTRFS_DEV_STAT_GENERATION_ERRS); +- else +- btrfs_dev_stat_inc_and_print(dev, +- BTRFS_DEV_STAT_CORRUPTION_ERRS); +- } +- +- if (sctx->readonly) { +- ASSERT(!sctx->is_dev_replace); +- goto out; +- } +- +- /* +- * now build and submit the bios for the other mirrors, check +- * checksums. 
+- * First try to pick the mirror which is completely without I/O +- * errors and also does not have a checksum error. +- * If one is found, and if a checksum is present, the full block +- * that is known to contain an error is rewritten. Afterwards +- * the block is known to be corrected. +- * If a mirror is found which is completely correct, and no +- * checksum is present, only those sectors are rewritten that had +- * an I/O error in the block to be repaired, since it cannot be +- * determined, which copy of the other sectors is better (and it +- * could happen otherwise that a correct sector would be +- * overwritten by a bad one). +- */ +- for (mirror_index = 0; ;mirror_index++) { +- struct scrub_block *sblock_other; +- +- if (mirror_index == failed_mirror_index) +- continue; +- +- /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ +- if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { +- if (mirror_index >= BTRFS_MAX_MIRRORS) +- break; +- if (!sblocks_for_recheck[mirror_index]->sector_count) +- break; +- +- sblock_other = sblocks_for_recheck[mirror_index]; +- } else { +- struct scrub_recover *r = sblock_bad->sectors[0]->recover; +- int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; +- +- if (mirror_index >= max_allowed) +- break; +- if (!sblocks_for_recheck[1]->sector_count) +- break; +- +- ASSERT(failed_mirror_index == 0); +- sblock_other = sblocks_for_recheck[1]; +- sblock_other->mirror_num = 1 + mirror_index; +- } +- +- /* build and submit the bios, check checksums */ +- scrub_recheck_block(fs_info, sblock_other, 0); +- +- if (!sblock_other->header_error && +- !sblock_other->checksum_error && +- sblock_other->no_io_error_seen) { +- if (sctx->is_dev_replace) { +- scrub_write_block_to_dev_replace(sblock_other); +- goto corrected_error; +- } else { +- ret = scrub_repair_block_from_good_copy( +- sblock_bad, sblock_other); +- if (!ret) +- goto corrected_error; +- } +- } +- } +- +- if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace) +- goto did_not_correct_error; +- +- /* +- * In case of I/O errors in the area that is supposed to be +- * repaired, continue by picking good copies of those sectors. +- * Select the good sectors from mirrors to rewrite bad sectors from +- * the area to fix. Afterwards verify the checksum of the block +- * that is supposed to be repaired. This verification step is +- * only done for the purpose of statistic counting and for the +- * final scrub report, whether errors remain. +- * A perfect algorithm could make use of the checksum and try +- * all possible combinations of sectors from the different mirrors +- * until the checksum verification succeeds. For example, when +- * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector +- * of mirror #2 is readable but the final checksum test fails, +- * then the 2nd sector of mirror #3 could be tried, whether now +- * the final checksum succeeds. But this would be a rare +- * exception and is therefore not implemented. At least it is +- * avoided that the good copy is overwritten. +- * A more useful improvement would be to pick the sectors +- * without I/O error based on sector sizes (512 bytes on legacy +- * disks) instead of on sectorsize. Then maybe 512 byte of one +- * mirror could be repaired by taking 512 byte of a different +- * mirror, even if other 512 byte sectors in the same sectorsize +- * area are unreadable. 
+- */ +- success = 1; +- for (sector_num = 0; sector_num < sblock_bad->sector_count; +- sector_num++) { +- struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; +- struct scrub_block *sblock_other = NULL; +- +- /* Skip no-io-error sectors in scrub */ +- if (!sector_bad->io_error && !sctx->is_dev_replace) +- continue; +- +- if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) { +- /* +- * In case of dev replace, if raid56 rebuild process +- * didn't work out correct data, then copy the content +- * in sblock_bad to make sure target device is identical +- * to source device, instead of writing garbage data in +- * sblock_for_recheck array to target device. +- */ +- sblock_other = NULL; +- } else if (sector_bad->io_error) { +- /* Try to find no-io-error sector in mirrors */ +- for (mirror_index = 0; +- mirror_index < BTRFS_MAX_MIRRORS && +- sblocks_for_recheck[mirror_index]->sector_count > 0; +- mirror_index++) { +- if (!sblocks_for_recheck[mirror_index]-> +- sectors[sector_num]->io_error) { +- sblock_other = sblocks_for_recheck[mirror_index]; +- break; +- } +- } +- if (!sblock_other) +- success = 0; +- } +- +- if (sctx->is_dev_replace) { +- /* +- * Did not find a mirror to fetch the sector from. +- * scrub_write_sector_to_dev_replace() handles this +- * case (sector->io_error), by filling the block with +- * zeros before submitting the write request +- */ +- if (!sblock_other) +- sblock_other = sblock_bad; +- +- if (scrub_write_sector_to_dev_replace(sblock_other, +- sector_num) != 0) { +- atomic64_inc( +- &fs_info->dev_replace.num_write_errors); +- success = 0; +- } +- } else if (sblock_other) { +- ret = scrub_repair_sector_from_good_copy(sblock_bad, +- sblock_other, +- sector_num, 0); +- if (0 == ret) +- sector_bad->io_error = 0; +- else +- success = 0; +- } +- } +- +- if (success && !sctx->is_dev_replace) { +- if (is_metadata || have_csum) { +- /* +- * need to verify the checksum now that all +- * sectors on disk are repaired (the write +- * request for data to be repaired is on its way). +- * Just be lazy and use scrub_recheck_block() +- * which re-reads the data before the checksum +- * is verified, but most likely the data comes out +- * of the page cache. 
+- */ +- scrub_recheck_block(fs_info, sblock_bad, 1); +- if (!sblock_bad->header_error && +- !sblock_bad->checksum_error && +- sblock_bad->no_io_error_seen) +- goto corrected_error; +- else +- goto did_not_correct_error; +- } else { +-corrected_error: +- spin_lock(&sctx->stat_lock); +- sctx->stat.corrected_errors++; +- sblock_to_check->data_corrected = 1; +- spin_unlock(&sctx->stat_lock); +- btrfs_err_rl_in_rcu(fs_info, +- "fixed up error at logical %llu on dev %s", +- logical, btrfs_dev_name(dev)); +- } +- } else { +-did_not_correct_error: +- spin_lock(&sctx->stat_lock); +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_err_rl_in_rcu(fs_info, +- "unable to fixup (regular) error at logical %llu on dev %s", +- logical, btrfs_dev_name(dev)); +- } +- +-out: +- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { +- struct scrub_block *sblock = sblocks_for_recheck[mirror_index]; +- struct scrub_recover *recover; +- int sector_index; +- +- /* Not allocated, continue checking the next mirror */ +- if (!sblock) +- continue; +- +- for (sector_index = 0; sector_index < sblock->sector_count; +- sector_index++) { +- /* +- * Here we just cleanup the recover, each sector will be +- * properly cleaned up by later scrub_block_put() +- */ +- recover = sblock->sectors[sector_index]->recover; +- if (recover) { +- scrub_put_recover(fs_info, recover); +- sblock->sectors[sector_index]->recover = NULL; +- } +- } +- scrub_block_put(sblock); +- } +- +- ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); +- memalloc_nofs_restore(nofs_flag); +- if (ret < 0) +- return ret; +- return 0; +-} +- +-static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) ++static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr) + { +- if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) +- return 2; +- else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) +- return 3; +- else +- return (int)bioc->num_stripes; +-} ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT; + +-static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, +- u64 *raid_map, +- int nstripes, int mirror, +- int *stripe_index, +- u64 *stripe_offset) +-{ +- int i; +- +- if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { +- /* RAID5/6 */ +- for (i = 0; i < nstripes; i++) { +- if (raid_map[i] == RAID6_Q_STRIPE || +- raid_map[i] == RAID5_P_STRIPE) +- continue; +- +- if (logical >= raid_map[i] && +- logical < raid_map[i] + BTRFS_STRIPE_LEN) +- break; +- } +- +- *stripe_index = i; +- *stripe_offset = logical - raid_map[i]; +- } else { +- /* The other RAID type */ +- *stripe_index = mirror; +- *stripe_offset = 0; +- } ++ return stripe->pages[page_index]; + } + +-static int scrub_setup_recheck_block(struct scrub_block *original_sblock, +- struct scrub_block *sblocks_for_recheck[]) ++static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe, ++ int sector_nr) + { +- struct scrub_ctx *sctx = original_sblock->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- u64 logical = original_sblock->logical; +- u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; +- u64 generation = original_sblock->sectors[0]->generation; +- u64 flags = original_sblock->sectors[0]->flags; +- u64 have_csum = original_sblock->sectors[0]->have_csum; +- struct scrub_recover *recover; +- struct btrfs_io_context *bioc; +- u64 sublen; +- u64 mapped_length; +- u64 stripe_offset; 
+- int stripe_index; +- int sector_index = 0; +- int mirror_index; +- int nmirrors; +- int ret; +- +- while (length > 0) { +- sublen = min_t(u64, length, fs_info->sectorsize); +- mapped_length = sublen; +- bioc = NULL; ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + +- /* +- * With a length of sectorsize, each returned stripe represents +- * one mirror +- */ +- btrfs_bio_counter_inc_blocked(fs_info); +- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, +- logical, &mapped_length, &bioc); +- if (ret || !bioc || mapped_length < sublen) { +- btrfs_put_bioc(bioc); +- btrfs_bio_counter_dec(fs_info); +- return -EIO; +- } +- +- recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL); +- if (!recover) { +- btrfs_put_bioc(bioc); +- btrfs_bio_counter_dec(fs_info); +- return -ENOMEM; +- } +- +- refcount_set(&recover->refs, 1); +- recover->bioc = bioc; +- recover->map_length = mapped_length; +- +- ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK); +- +- nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); +- +- for (mirror_index = 0; mirror_index < nmirrors; +- mirror_index++) { +- struct scrub_block *sblock; +- struct scrub_sector *sector; +- +- sblock = sblocks_for_recheck[mirror_index]; +- sblock->sctx = sctx; +- +- sector = alloc_scrub_sector(sblock, logical); +- if (!sector) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- scrub_put_recover(fs_info, recover); +- return -ENOMEM; +- } +- sector->flags = flags; +- sector->generation = generation; +- sector->have_csum = have_csum; +- if (have_csum) +- memcpy(sector->csum, +- original_sblock->sectors[0]->csum, +- sctx->fs_info->csum_size); +- +- scrub_stripe_index_and_offset(logical, +- bioc->map_type, +- bioc->raid_map, +- bioc->num_stripes - +- bioc->num_tgtdevs, +- mirror_index, +- &stripe_index, +- &stripe_offset); +- /* +- * We're at the first sector, also populate @sblock +- * physical and dev. +- */ +- if (sector_index == 0) { +- sblock->physical = +- bioc->stripes[stripe_index].physical + +- stripe_offset; +- sblock->dev = bioc->stripes[stripe_index].dev; +- sblock->physical_for_dev_replace = +- original_sblock->physical_for_dev_replace; +- } +- +- BUG_ON(sector_index >= original_sblock->sector_count); +- scrub_get_recover(recover); +- sector->recover = recover; +- } +- scrub_put_recover(fs_info, recover); +- length -= sublen; +- logical += sublen; +- sector_index++; +- } +- +- return 0; +-} +- +-static void scrub_bio_wait_endio(struct bio *bio) +-{ +- complete(bio->bi_private); ++ return offset_in_page(sector_nr << fs_info->sectorsize_bits); + } + +-static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, +- struct bio *bio, +- struct scrub_sector *sector) ++static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) + { +- DECLARE_COMPLETION_ONSTACK(done); +- +- bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >> +- SECTOR_SHIFT; +- bio->bi_private = &done; +- bio->bi_end_io = scrub_bio_wait_endio; +- raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num); +- +- wait_for_completion_io(&done); +- return blk_status_to_errno(bio->bi_status); +-} +- +-static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, +- struct scrub_block *sblock) +-{ +- struct scrub_sector *first_sector = sblock->sectors[0]; +- struct bio *bio; +- int i; +- +- /* All sectors in sblock belong to the same stripe on the same device. 
*/ +- ASSERT(sblock->dev); +- if (!sblock->dev->bdev) +- goto out; +- +- bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); +- +- for (i = 0; i < sblock->sector_count; i++) { +- struct scrub_sector *sector = sblock->sectors[i]; +- +- bio_add_scrub_sector(bio, sector, fs_info->sectorsize); +- } +- +- if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { +- bio_put(bio); +- goto out; +- } +- +- bio_put(bio); +- +- scrub_recheck_block_checksum(sblock); +- +- return; +-out: +- for (i = 0; i < sblock->sector_count; i++) +- sblock->sectors[i]->io_error = 1; +- +- sblock->no_io_error_seen = 0; +-} +- +-/* +- * This function will check the on disk data for checksum errors, header errors +- * and read I/O errors. If any I/O errors happen, the exact sectors which are +- * errored are marked as being bad. The goal is to enable scrub to take those +- * sectors that are not errored from all the mirrors so that the sectors that +- * are errored in the just handled mirror can be repaired. +- */ +-static void scrub_recheck_block(struct btrfs_fs_info *fs_info, +- struct scrub_block *sblock, +- int retry_failed_mirror) +-{ +- int i; +- +- sblock->no_io_error_seen = 1; +- +- /* short cut for raid56 */ +- if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0])) +- return scrub_recheck_block_on_raid56(fs_info, sblock); +- +- for (i = 0; i < sblock->sector_count; i++) { +- struct scrub_sector *sector = sblock->sectors[i]; +- struct bio bio; +- struct bio_vec bvec; +- +- if (sblock->dev->bdev == NULL) { +- sector->io_error = 1; +- sblock->no_io_error_seen = 0; +- continue; +- } +- +- bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ); +- bio_add_scrub_sector(&bio, sector, fs_info->sectorsize); +- bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >> +- SECTOR_SHIFT; +- +- btrfsic_check_bio(&bio); +- if (submit_bio_wait(&bio)) { +- sector->io_error = 1; +- sblock->no_io_error_seen = 0; +- } +- +- bio_uninit(&bio); +- } +- +- if (sblock->no_io_error_seen) +- scrub_recheck_block_checksum(sblock); +-} +- +-static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) +-{ +- struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices; +- int ret; +- +- ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); +- return !ret; +-} +- +-static void scrub_recheck_block_checksum(struct scrub_block *sblock) +-{ +- sblock->header_error = 0; +- sblock->checksum_error = 0; +- sblock->generation_error = 0; +- +- if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA) +- scrub_checksum_data(sblock); +- else +- scrub_checksum_tree_block(sblock); +-} +- +-static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, +- struct scrub_block *sblock_good) +-{ +- int i; +- int ret = 0; +- +- for (i = 0; i < sblock_bad->sector_count; i++) { +- int ret_sub; +- +- ret_sub = scrub_repair_sector_from_good_copy(sblock_bad, +- sblock_good, i, 1); +- if (ret_sub) +- ret = ret_sub; +- } +- +- return ret; +-} +- +-static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, +- struct scrub_block *sblock_good, +- int sector_num, int force_write) +-{ +- struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; +- struct scrub_sector *sector_good = sblock_good->sectors[sector_num]; +- struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; +- const u32 sectorsize = fs_info->sectorsize; +- +- if (force_write || sblock_bad->header_error || +- sblock_bad->checksum_error || sector_bad->io_error) { +- struct bio bio; +- struct 
bio_vec bvec; +- int ret; +- +- if (!sblock_bad->dev->bdev) { +- btrfs_warn_rl(fs_info, +- "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); +- return -EIO; +- } +- +- bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); +- bio.bi_iter.bi_sector = (sblock_bad->physical + +- sector_bad->offset) >> SECTOR_SHIFT; +- ret = bio_add_scrub_sector(&bio, sector_good, sectorsize); +- +- btrfsic_check_bio(&bio); +- ret = submit_bio_wait(&bio); +- bio_uninit(&bio); +- +- if (ret) { +- btrfs_dev_stat_inc_and_print(sblock_bad->dev, +- BTRFS_DEV_STAT_WRITE_ERRS); +- atomic64_inc(&fs_info->dev_replace.num_write_errors); +- return -EIO; +- } +- } +- +- return 0; +-} +- +-static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) +-{ +- struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; +- int i; +- +- /* +- * This block is used for the check of the parity on the source device, +- * so the data needn't be written into the destination device. +- */ +- if (sblock->sparity) +- return; +- +- for (i = 0; i < sblock->sector_count; i++) { +- int ret; +- +- ret = scrub_write_sector_to_dev_replace(sblock, i); +- if (ret) +- atomic64_inc(&fs_info->dev_replace.num_write_errors); +- } +-} +- +-static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) +-{ +- const u32 sectorsize = sblock->sctx->fs_info->sectorsize; +- struct scrub_sector *sector = sblock->sectors[sector_num]; +- +- if (sector->io_error) +- memset(scrub_sector_get_kaddr(sector), 0, sectorsize); +- +- return scrub_add_sector_to_wr_bio(sblock->sctx, sector); +-} +- +-static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) +-{ +- int ret = 0; +- u64 length; +- +- if (!btrfs_is_zoned(sctx->fs_info)) +- return 0; +- +- if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) +- return 0; +- +- if (sctx->write_pointer < physical) { +- length = physical - sctx->write_pointer; +- +- ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, +- sctx->write_pointer, length); +- if (!ret) +- sctx->write_pointer = physical; +- } +- return ret; +-} +- +-static void scrub_block_get(struct scrub_block *sblock) +-{ +- refcount_inc(&sblock->refs); +-} +- +-static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, +- struct scrub_sector *sector) +-{ +- struct scrub_block *sblock = sector->sblock; +- struct scrub_bio *sbio; +- int ret; +- const u32 sectorsize = sctx->fs_info->sectorsize; +- +- mutex_lock(&sctx->wr_lock); +-again: +- if (!sctx->wr_curr_bio) { +- sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), +- GFP_KERNEL); +- if (!sctx->wr_curr_bio) { +- mutex_unlock(&sctx->wr_lock); +- return -ENOMEM; +- } +- sctx->wr_curr_bio->sctx = sctx; +- sctx->wr_curr_bio->sector_count = 0; +- } +- sbio = sctx->wr_curr_bio; +- if (sbio->sector_count == 0) { +- ret = fill_writer_pointer_gap(sctx, sector->offset + +- sblock->physical_for_dev_replace); +- if (ret) { +- mutex_unlock(&sctx->wr_lock); +- return ret; +- } +- +- sbio->physical = sblock->physical_for_dev_replace + sector->offset; +- sbio->logical = sblock->logical + sector->offset; +- sbio->dev = sctx->wr_tgtdev; +- if (!sbio->bio) { +- sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, +- REQ_OP_WRITE, GFP_NOFS); +- } +- sbio->bio->bi_private = sbio; +- sbio->bio->bi_end_io = scrub_wr_bio_end_io; +- sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; +- sbio->status = 0; +- } else if (sbio->physical + sbio->sector_count * sectorsize != +- sblock->physical_for_dev_replace + sector->offset || +- sbio->logical + 
sbio->sector_count * sectorsize != +- sblock->logical + sector->offset) { +- scrub_wr_submit(sctx); +- goto again; +- } +- +- ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); +- if (ret != sectorsize) { +- if (sbio->sector_count < 1) { +- bio_put(sbio->bio); +- sbio->bio = NULL; +- mutex_unlock(&sctx->wr_lock); +- return -EIO; +- } +- scrub_wr_submit(sctx); +- goto again; +- } +- +- sbio->sectors[sbio->sector_count] = sector; +- scrub_sector_get(sector); +- /* +- * Since ssector no longer holds a page, but uses sblock::pages, we +- * have to ensure the sblock had not been freed before our write bio +- * finished. +- */ +- scrub_block_get(sector->sblock); +- +- sbio->sector_count++; +- if (sbio->sector_count == sctx->sectors_per_bio) +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); +- +- return 0; +-} +- +-static void scrub_wr_submit(struct scrub_ctx *sctx) +-{ +- struct scrub_bio *sbio; +- +- if (!sctx->wr_curr_bio) +- return; +- +- sbio = sctx->wr_curr_bio; +- sctx->wr_curr_bio = NULL; +- scrub_pending_bio_inc(sctx); +- /* process all writes in a single worker thread. Then the block layer +- * orders the requests before sending them to the driver which +- * doubled the write performance on spinning disks when measured +- * with Linux 3.5 */ +- btrfsic_check_bio(sbio->bio); +- submit_bio(sbio->bio); +- +- if (btrfs_is_zoned(sctx->fs_info)) +- sctx->write_pointer = sbio->physical + sbio->sector_count * +- sctx->fs_info->sectorsize; +-} +- +-static void scrub_wr_bio_end_io(struct bio *bio) +-{ +- struct scrub_bio *sbio = bio->bi_private; +- struct btrfs_fs_info *fs_info = sbio->dev->fs_info; +- +- sbio->status = bio->bi_status; +- sbio->bio = bio; +- +- INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker); +- queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); +-} +- +-static void scrub_wr_bio_end_io_worker(struct work_struct *work) +-{ +- struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); +- struct scrub_ctx *sctx = sbio->sctx; +- int i; +- +- ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); +- if (sbio->status) { +- struct btrfs_dev_replace *dev_replace = +- &sbio->sctx->fs_info->dev_replace; +- +- for (i = 0; i < sbio->sector_count; i++) { +- struct scrub_sector *sector = sbio->sectors[i]; +- +- sector->io_error = 1; +- atomic64_inc(&dev_replace->num_write_errors); +- } +- } +- +- /* +- * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in +- * endio we should put the sblock. +- */ +- for (i = 0; i < sbio->sector_count; i++) { +- scrub_block_put(sbio->sectors[i]->sblock); +- scrub_sector_put(sbio->sectors[i]); +- } +- +- bio_put(sbio->bio); +- kfree(sbio); +- scrub_pending_bio_dec(sctx); +-} +- +-static int scrub_checksum(struct scrub_block *sblock) +-{ +- u64 flags; +- int ret; +- +- /* +- * No need to initialize these stats currently, +- * because this function only use return value +- * instead of these stats value. 
+- * +- * Todo: +- * always use stats +- */ +- sblock->header_error = 0; +- sblock->generation_error = 0; +- sblock->checksum_error = 0; +- +- WARN_ON(sblock->sector_count < 1); +- flags = sblock->sectors[0]->flags; +- ret = 0; +- if (flags & BTRFS_EXTENT_FLAG_DATA) +- ret = scrub_checksum_data(sblock); +- else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) +- ret = scrub_checksum_tree_block(sblock); +- else if (flags & BTRFS_EXTENT_FLAG_SUPER) +- ret = scrub_checksum_super(sblock); +- else +- WARN_ON(1); +- if (ret) +- scrub_handle_errored_block(sblock); +- +- return ret; +-} +- +-static int scrub_checksum_data(struct scrub_block *sblock) +-{ +- struct scrub_ctx *sctx = sblock->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; ++ const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); ++ const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr); ++ const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); +- u8 csum[BTRFS_CSUM_SIZE]; +- struct scrub_sector *sector; +- char *kaddr; +- +- BUG_ON(sblock->sector_count < 1); +- sector = sblock->sectors[0]; +- if (!sector->have_csum) +- return 0; +- +- kaddr = scrub_sector_get_kaddr(sector); +- +- shash->tfm = fs_info->csum_shash; +- crypto_shash_init(shash); +- +- crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); +- +- if (memcmp(csum, sector->csum, fs_info->csum_size)) +- sblock->checksum_error = 1; +- return sblock->checksum_error; +-} +- +-static int scrub_checksum_tree_block(struct scrub_block *sblock) +-{ +- struct scrub_ctx *sctx = sblock->sctx; +- struct btrfs_header *h; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); +- u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; +- /* +- * This is done in sectorsize steps even for metadata as there's a +- * constraint for nodesize to be aligned to sectorsize. This will need +- * to change so we don't misuse data and metadata units like that. +- */ +- const u32 sectorsize = sctx->fs_info->sectorsize; +- const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; +- int i; +- struct scrub_sector *sector; +- char *kaddr; +- +- BUG_ON(sblock->sector_count < 1); +- +- /* Each member in sectors is just one sector */ +- ASSERT(sblock->sector_count == num_sectors); +- +- sector = sblock->sectors[0]; +- kaddr = scrub_sector_get_kaddr(sector); +- h = (struct btrfs_header *)kaddr; +- memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); ++ u8 calculated_csum[BTRFS_CSUM_SIZE]; ++ struct btrfs_header *header; + + /* +- * we don't use the getter functions here, as we +- * a) don't have an extent buffer and +- * b) the page is already kmapped ++ * Here we don't have a good way to attach the pages (and subpages) ++ * to a dummy extent buffer, thus we have to directly grab the members ++ * from pages. 
+ */ +- if (sblock->logical != btrfs_stack_header_bytenr(h)) { +- sblock->header_error = 1; ++ header = (struct btrfs_header *)(page_address(first_page) + first_off); ++ memcpy(on_disk_csum, header->csum, fs_info->csum_size); ++ ++ if (logical != btrfs_stack_header_bytenr(header)) { ++ bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad bytenr, has %llu want %llu", +- sblock->logical, sblock->mirror_num, +- btrfs_stack_header_bytenr(h), +- sblock->logical); +- goto out; ++ logical, stripe->mirror_num, ++ btrfs_stack_header_bytenr(header), logical); ++ return; + } +- +- if (!scrub_check_fsid(h->fsid, sector)) { +- sblock->header_error = 1; ++ if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) { ++ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad fsid, has %pU want %pU", +- sblock->logical, sblock->mirror_num, +- h->fsid, sblock->dev->fs_devices->fsid); +- goto out; ++ logical, stripe->mirror_num, ++ header->fsid, fs_info->fs_devices->fsid); ++ return; + } +- +- if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { +- sblock->header_error = 1; ++ if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, ++ BTRFS_UUID_SIZE) != 0) { ++ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", +- sblock->logical, sblock->mirror_num, +- h->chunk_tree_uuid, fs_info->chunk_tree_uuid); +- goto out; ++ logical, stripe->mirror_num, ++ header->chunk_tree_uuid, fs_info->chunk_tree_uuid); ++ return; + } + ++ /* Now check tree block csum. 
*/ + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); +- crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, +- sectorsize - BTRFS_CSUM_SIZE); ++ crypto_shash_update(shash, page_address(first_page) + first_off + ++ BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE); ++ ++ for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { ++ struct page *page = scrub_stripe_get_page(stripe, i); ++ unsigned int page_off = scrub_stripe_get_page_offset(stripe, i); + +- for (i = 1; i < num_sectors; i++) { +- kaddr = scrub_sector_get_kaddr(sblock->sectors[i]); +- crypto_shash_update(shash, kaddr, sectorsize); ++ crypto_shash_update(shash, page_address(page) + page_off, ++ fs_info->sectorsize); + } + + crypto_shash_final(shash, calculated_csum); +- if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { +- sblock->checksum_error = 1; ++ if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { ++ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, +- sblock->logical, sblock->mirror_num, ++ logical, stripe->mirror_num, + CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); +- goto out; ++ return; + } +- +- if (sector->generation != btrfs_stack_header_generation(h)) { +- sblock->header_error = 1; +- sblock->generation_error = 1; ++ if (stripe->sectors[sector_nr].generation != ++ btrfs_stack_header_generation(header)) { ++ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad generation, has %llu want %llu", +- sblock->logical, sblock->mirror_num, +- btrfs_stack_header_generation(h), +- sector->generation); +- } +- +-out: +- return sblock->header_error || sblock->checksum_error; +-} +- +-static int scrub_checksum_super(struct scrub_block *sblock) +-{ +- struct btrfs_super_block *s; +- struct scrub_ctx *sctx = sblock->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); +- u8 calculated_csum[BTRFS_CSUM_SIZE]; +- struct scrub_sector *sector; +- char *kaddr; +- int fail_gen = 0; +- int fail_cor = 0; +- +- BUG_ON(sblock->sector_count < 1); +- sector = sblock->sectors[0]; +- kaddr = scrub_sector_get_kaddr(sector); +- s = (struct btrfs_super_block *)kaddr; +- +- if (sblock->logical != btrfs_super_bytenr(s)) +- ++fail_cor; +- +- if (sector->generation != btrfs_super_generation(s)) +- ++fail_gen; +- +- if (!scrub_check_fsid(s->fsid, sector)) +- ++fail_cor; +- +- shash->tfm = fs_info->csum_shash; +- crypto_shash_init(shash); +- crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE, +- BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum); +- +- if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size)) +- ++fail_cor; +- +- return fail_cor + fail_gen; +-} +- +-static void scrub_block_put(struct scrub_block *sblock) +-{ +- if (refcount_dec_and_test(&sblock->refs)) { +- int i; +- +- if (sblock->sparity) +- scrub_parity_put(sblock->sparity); +- +- for (i = 0; i < sblock->sector_count; i++) +- scrub_sector_put(sblock->sectors[i]); +- for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) { +- if (sblock->pages[i]) { +- detach_scrub_page_private(sblock->pages[i]); +- __free_page(sblock->pages[i]); +- } +- } +- kfree(sblock); ++ 
logical, stripe->mirror_num, ++ btrfs_stack_header_generation(header), ++ stripe->sectors[sector_nr].generation); ++ return; + } ++ bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + } + +-static void scrub_sector_get(struct scrub_sector *sector) +-{ +- atomic_inc(§or->refs); +-} +- +-static void scrub_sector_put(struct scrub_sector *sector) ++static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) + { +- if (atomic_dec_and_test(§or->refs)) +- kfree(sector); +-} ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; ++ const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; ++ struct page *page = scrub_stripe_get_page(stripe, sector_nr); ++ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); ++ u8 csum_buf[BTRFS_CSUM_SIZE]; ++ int ret; + +-/* +- * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 +- * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. +- */ +-static void scrub_throttle(struct scrub_ctx *sctx) +-{ +- const int time_slice = 1000; +- struct scrub_bio *sbio; +- struct btrfs_device *device; +- s64 delta; +- ktime_t now; +- u32 div; +- u64 bwlimit; ++ ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); + +- sbio = sctx->bios[sctx->curr]; +- device = sbio->dev; +- bwlimit = READ_ONCE(device->scrub_speed_max); +- if (bwlimit == 0) ++ /* Sector not utilized, skip it. */ ++ if (!test_bit(sector_nr, &stripe->extent_sector_bitmap)) + return; + +- /* +- * Slice is divided into intervals when the IO is submitted, adjust by +- * bwlimit and maximum of 64 intervals. +- */ +- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); +- div = min_t(u32, 64, div); +- +- /* Start new epoch, set deadline */ +- now = ktime_get(); +- if (sctx->throttle_deadline == 0) { +- sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); +- sctx->throttle_sent = 0; +- } ++ /* IO error, no need to check. */ ++ if (test_bit(sector_nr, &stripe->io_error_bitmap)) ++ return; + +- /* Still in the time to send? */ +- if (ktime_before(now, sctx->throttle_deadline)) { +- /* If current bio is within the limit, send it */ +- sctx->throttle_sent += sbio->bio->bi_iter.bi_size; +- if (sctx->throttle_sent <= div_u64(bwlimit, div)) ++ /* Metadata, verify the full tree block. */ ++ if (sector->is_metadata) { ++ /* ++ * Check if the tree block crosses the stripe boudary. If ++ * crossed the boundary, we cannot verify it but only give a ++ * warning. ++ * ++ * This can only happen on a very old filesystem where chunks ++ * are not ensured to be stripe aligned. 
++ */ ++ if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { ++ btrfs_warn_rl(fs_info, ++ "tree block at %llu crosses stripe boundary %llu", ++ stripe->logical + ++ (sector_nr << fs_info->sectorsize_bits), ++ stripe->logical); + return; +- +- /* We're over the limit, sleep until the rest of the slice */ +- delta = ktime_ms_delta(sctx->throttle_deadline, now); +- } else { +- /* New request after deadline, start new epoch */ +- delta = 0; +- } +- +- if (delta) { +- long timeout; +- +- timeout = div_u64(delta * HZ, 1000); +- schedule_timeout_interruptible(timeout); +- } +- +- /* Next call will start the deadline period */ +- sctx->throttle_deadline = 0; +-} +- +-static void scrub_submit(struct scrub_ctx *sctx) +-{ +- struct scrub_bio *sbio; +- +- if (sctx->curr == -1) ++ } ++ scrub_verify_one_metadata(stripe, sector_nr); + return; ++ } + +- scrub_throttle(sctx); +- +- sbio = sctx->bios[sctx->curr]; +- sctx->curr = -1; +- scrub_pending_bio_inc(sctx); +- btrfsic_check_bio(sbio->bio); +- submit_bio(sbio->bio); +-} +- +-static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx, +- struct scrub_sector *sector) +-{ +- struct scrub_block *sblock = sector->sblock; +- struct scrub_bio *sbio; +- const u32 sectorsize = sctx->fs_info->sectorsize; +- int ret; +- +-again: + /* +- * grab a fresh bio or wait for one to become available ++ * Data is easier, we just verify the data csum (if we have it). For ++ * cases without csum, we have no other choice but to trust it. + */ +- while (sctx->curr == -1) { +- spin_lock(&sctx->list_lock); +- sctx->curr = sctx->first_free; +- if (sctx->curr != -1) { +- sctx->first_free = sctx->bios[sctx->curr]->next_free; +- sctx->bios[sctx->curr]->next_free = -1; +- sctx->bios[sctx->curr]->sector_count = 0; +- spin_unlock(&sctx->list_lock); +- } else { +- spin_unlock(&sctx->list_lock); +- wait_event(sctx->list_wait, sctx->first_free != -1); +- } +- } +- sbio = sctx->bios[sctx->curr]; +- if (sbio->sector_count == 0) { +- sbio->physical = sblock->physical + sector->offset; +- sbio->logical = sblock->logical + sector->offset; +- sbio->dev = sblock->dev; +- if (!sbio->bio) { +- sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, +- REQ_OP_READ, GFP_NOFS); +- } +- sbio->bio->bi_private = sbio; +- sbio->bio->bi_end_io = scrub_bio_end_io; +- sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; +- sbio->status = 0; +- } else if (sbio->physical + sbio->sector_count * sectorsize != +- sblock->physical + sector->offset || +- sbio->logical + sbio->sector_count * sectorsize != +- sblock->logical + sector->offset || +- sbio->dev != sblock->dev) { +- scrub_submit(sctx); +- goto again; ++ if (!sector->csum) { ++ clear_bit(sector_nr, &stripe->error_bitmap); ++ return; + } + +- sbio->sectors[sbio->sector_count] = sector; +- ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); +- if (ret != sectorsize) { +- if (sbio->sector_count < 1) { +- bio_put(sbio->bio); +- sbio->bio = NULL; +- return -EIO; +- } +- scrub_submit(sctx); +- goto again; ++ ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum); ++ if (ret < 0) { ++ set_bit(sector_nr, &stripe->csum_error_bitmap); ++ set_bit(sector_nr, &stripe->error_bitmap); ++ } else { ++ clear_bit(sector_nr, &stripe->csum_error_bitmap); ++ clear_bit(sector_nr, &stripe->error_bitmap); + } +- +- scrub_block_get(sblock); /* one for the page added to the bio */ +- atomic_inc(&sblock->outstanding_sectors); +- sbio->sector_count++; +- if (sbio->sector_count == sctx->sectors_per_bio) +- scrub_submit(sctx); +- +- 
return 0; +-} +- +-static void scrub_missing_raid56_end_io(struct bio *bio) +-{ +- struct scrub_block *sblock = bio->bi_private; +- struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; +- +- btrfs_bio_counter_dec(fs_info); +- if (bio->bi_status) +- sblock->no_io_error_seen = 0; +- +- bio_put(bio); +- +- queue_work(fs_info->scrub_workers, &sblock->work); + } + +-static void scrub_missing_raid56_worker(struct work_struct *work) ++/* Verify specified sectors of a stripe. */ ++static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap) + { +- struct scrub_block *sblock = container_of(work, struct scrub_block, work); +- struct scrub_ctx *sctx = sblock->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- u64 logical; +- struct btrfs_device *dev; +- +- logical = sblock->logical; +- dev = sblock->dev; +- +- if (sblock->no_io_error_seen) +- scrub_recheck_block_checksum(sblock); +- +- if (!sblock->no_io_error_seen) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.read_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_err_rl_in_rcu(fs_info, +- "IO error rebuilding logical %llu for dev %s", +- logical, btrfs_dev_name(dev)); +- } else if (sblock->header_error || sblock->checksum_error) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_err_rl_in_rcu(fs_info, +- "failed to rebuild valid logical %llu for dev %s", +- logical, btrfs_dev_name(dev)); +- } else { +- scrub_write_block_to_dev_replace(sblock); +- } ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; ++ int sector_nr; + +- if (sctx->is_dev_replace && sctx->flush_all_writes) { +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); ++ for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) { ++ scrub_verify_one_sector(stripe, sector_nr); ++ if (stripe->sectors[sector_nr].is_metadata) ++ sector_nr += sectors_per_tree - 1; + } +- +- scrub_block_put(sblock); +- scrub_pending_bio_dec(sctx); + } + +-static void scrub_missing_raid56_pages(struct scrub_block *sblock) ++static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec) + { +- struct scrub_ctx *sctx = sblock->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- u64 length = sblock->sector_count << fs_info->sectorsize_bits; +- u64 logical = sblock->logical; +- struct btrfs_io_context *bioc = NULL; +- struct bio *bio; +- struct btrfs_raid_bio *rbio; +- int ret; + int i; + +- btrfs_bio_counter_inc_blocked(fs_info); +- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, +- &length, &bioc); +- if (ret || !bioc || !bioc->raid_map) +- goto bioc_out; +- +- if (WARN_ON(!sctx->is_dev_replace || +- !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { +- /* +- * We shouldn't be scrubbing a missing device. Even for dev +- * replace, we should only get here for RAID 5/6. We either +- * managed to mount something with no mirrors remaining or +- * there's a bug in scrub_find_good_copy()/btrfs_map_block(). 
+- */ +- goto bioc_out; +- } +- +- bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); +- bio->bi_iter.bi_sector = logical >> 9; +- bio->bi_private = sblock; +- bio->bi_end_io = scrub_missing_raid56_end_io; +- +- rbio = raid56_alloc_missing_rbio(bio, bioc); +- if (!rbio) +- goto rbio_out; +- +- for (i = 0; i < sblock->sector_count; i++) { +- struct scrub_sector *sector = sblock->sectors[i]; +- +- raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector), +- scrub_sector_get_page_offset(sector), +- sector->offset + sector->sblock->logical); ++ for (i = 0; i < stripe->nr_sectors; i++) { ++ if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page && ++ scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset) ++ break; + } +- +- INIT_WORK(&sblock->work, scrub_missing_raid56_worker); +- scrub_block_get(sblock); +- scrub_pending_bio_inc(sctx); +- raid56_submit_missing_rbio(rbio); +- btrfs_put_bioc(bioc); +- return; +- +-rbio_out: +- bio_put(bio); +-bioc_out: +- btrfs_bio_counter_dec(fs_info); +- btrfs_put_bioc(bioc); +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); ++ ASSERT(i < stripe->nr_sectors); ++ return i; + } + +-static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, +- u64 physical, struct btrfs_device *dev, u64 flags, +- u64 gen, int mirror_num, u8 *csum, +- u64 physical_for_dev_replace) ++/* ++ * Repair read is different to the regular read: ++ * ++ * - Only reads the failed sectors ++ * - May have extra blocksize limits ++ */ ++static void scrub_repair_read_endio(struct btrfs_bio *bbio) + { +- struct scrub_block *sblock; +- const u32 sectorsize = sctx->fs_info->sectorsize; +- int index; +- +- sblock = alloc_scrub_block(sctx, dev, logical, physical, +- physical_for_dev_replace, mirror_num); +- if (!sblock) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- return -ENOMEM; +- } ++ struct scrub_stripe *stripe = bbio->private; ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ struct bio_vec *bvec; ++ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); ++ u32 bio_size = 0; ++ int i; + +- for (index = 0; len > 0; index++) { +- struct scrub_sector *sector; +- /* +- * Here we will allocate one page for one sector to scrub. +- * This is fine if PAGE_SIZE == sectorsize, but will cost +- * more memory for PAGE_SIZE > sectorsize case. +- */ +- u32 l = min(sectorsize, len); ++ ASSERT(sector_nr < stripe->nr_sectors); + +- sector = alloc_scrub_sector(sblock, logical); +- if (!sector) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- scrub_block_put(sblock); +- return -ENOMEM; +- } +- sector->flags = flags; +- sector->generation = gen; +- if (csum) { +- sector->have_csum = 1; +- memcpy(sector->csum, csum, sctx->fs_info->csum_size); +- } else { +- sector->have_csum = 0; +- } +- len -= l; +- logical += l; +- physical += l; +- physical_for_dev_replace += l; +- } ++ bio_for_each_bvec_all(bvec, &bbio->bio, i) ++ bio_size += bvec->bv_len; + +- WARN_ON(sblock->sector_count == 0); +- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { +- /* +- * This case should only be hit for RAID 5/6 device replace. See +- * the comment in scrub_missing_raid56_pages() for details. 
+- */ +- scrub_missing_raid56_pages(sblock); ++ if (bbio->bio.bi_status) { ++ bitmap_set(&stripe->io_error_bitmap, sector_nr, ++ bio_size >> fs_info->sectorsize_bits); ++ bitmap_set(&stripe->error_bitmap, sector_nr, ++ bio_size >> fs_info->sectorsize_bits); + } else { +- for (index = 0; index < sblock->sector_count; index++) { +- struct scrub_sector *sector = sblock->sectors[index]; +- int ret; +- +- ret = scrub_add_sector_to_rd_bio(sctx, sector); +- if (ret) { +- scrub_block_put(sblock); +- return ret; +- } +- } +- +- if (flags & BTRFS_EXTENT_FLAG_SUPER) +- scrub_submit(sctx); ++ bitmap_clear(&stripe->io_error_bitmap, sector_nr, ++ bio_size >> fs_info->sectorsize_bits); + } +- +- /* last one frees, either here or in bio completion for last page */ +- scrub_block_put(sblock); +- return 0; ++ bio_put(&bbio->bio); ++ if (atomic_dec_and_test(&stripe->pending_io)) ++ wake_up(&stripe->io_wait); + } + +-static void scrub_bio_end_io(struct bio *bio) ++static int calc_next_mirror(int mirror, int num_copies) + { +- struct scrub_bio *sbio = bio->bi_private; +- struct btrfs_fs_info *fs_info = sbio->dev->fs_info; +- +- sbio->status = bio->bi_status; +- sbio->bio = bio; +- +- queue_work(fs_info->scrub_workers, &sbio->work); ++ ASSERT(mirror <= num_copies); ++ return (mirror + 1 > num_copies) ? 1 : mirror + 1; + } + +-static void scrub_bio_end_io_worker(struct work_struct *work) ++static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, ++ int mirror, int blocksize, bool wait) + { +- struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); +- struct scrub_ctx *sctx = sbio->sctx; ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ struct btrfs_bio *bbio = NULL; ++ const unsigned long old_error_bitmap = stripe->error_bitmap; + int i; + +- ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); +- if (sbio->status) { +- for (i = 0; i < sbio->sector_count; i++) { +- struct scrub_sector *sector = sbio->sectors[i]; ++ ASSERT(stripe->mirror_num >= 1); ++ ASSERT(atomic_read(&stripe->pending_io) == 0); ++ ++ for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { ++ struct page *page; ++ int pgoff; ++ int ret; + +- sector->io_error = 1; +- sector->sblock->no_io_error_seen = 0; ++ page = scrub_stripe_get_page(stripe, i); ++ pgoff = scrub_stripe_get_page_offset(stripe, i); ++ ++ /* The current sector cannot be merged, submit the bio. 
*/ ++ if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) || ++ bbio->bio.bi_iter.bi_size >= blocksize)) { ++ ASSERT(bbio->bio.bi_iter.bi_size); ++ atomic_inc(&stripe->pending_io); ++ btrfs_submit_bio(bbio, mirror); ++ if (wait) ++ wait_scrub_stripe_io(stripe); ++ bbio = NULL; + } +- } + +- /* Now complete the scrub_block items that have all pages completed */ +- for (i = 0; i < sbio->sector_count; i++) { +- struct scrub_sector *sector = sbio->sectors[i]; +- struct scrub_block *sblock = sector->sblock; ++ if (!bbio) { ++ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, ++ fs_info, scrub_repair_read_endio, stripe); ++ bbio->bio.bi_iter.bi_sector = (stripe->logical + ++ (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; ++ } + +- if (atomic_dec_and_test(&sblock->outstanding_sectors)) +- scrub_block_complete(sblock); +- scrub_block_put(sblock); ++ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); ++ ASSERT(ret == fs_info->sectorsize); + } +- +- bio_put(sbio->bio); +- sbio->bio = NULL; +- spin_lock(&sctx->list_lock); +- sbio->next_free = sctx->first_free; +- sctx->first_free = sbio->index; +- spin_unlock(&sctx->list_lock); +- +- if (sctx->is_dev_replace && sctx->flush_all_writes) { +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); ++ if (bbio) { ++ ASSERT(bbio->bio.bi_iter.bi_size); ++ atomic_inc(&stripe->pending_io); ++ btrfs_submit_bio(bbio, mirror); ++ if (wait) ++ wait_scrub_stripe_io(stripe); + } +- +- scrub_pending_bio_dec(sctx); + } + +-static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, +- unsigned long *bitmap, +- u64 start, u32 len) ++static void scrub_stripe_report_errors(struct scrub_ctx *sctx, ++ struct scrub_stripe *stripe) + { +- u64 offset; +- u32 nsectors; +- u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits; +- +- if (len >= sparity->stripe_len) { +- bitmap_set(bitmap, 0, sparity->nsectors); ++ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, ++ DEFAULT_RATELIMIT_BURST); ++ struct btrfs_fs_info *fs_info = sctx->fs_info; ++ struct btrfs_device *dev = NULL; ++ u64 physical = 0; ++ int nr_data_sectors = 0; ++ int nr_meta_sectors = 0; ++ int nr_nodatacsum_sectors = 0; ++ int nr_repaired_sectors = 0; ++ int sector_nr; ++ ++ if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state)) + return; +- } + +- start -= sparity->logic_start; +- start = div64_u64_rem(start, sparity->stripe_len, &offset); +- offset = offset >> sectorsize_bits; +- nsectors = len >> sectorsize_bits; ++ /* ++ * Init needed infos for error reporting. ++ * ++ * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio() ++ * thus no need for dev/physical, error reporting still needs dev and physical. ++ */ ++ if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { ++ u64 mapped_len = fs_info->sectorsize; ++ struct btrfs_io_context *bioc = NULL; ++ int stripe_index = stripe->mirror_num - 1; ++ int ret; + +- if (offset + nsectors <= sparity->nsectors) { +- bitmap_set(bitmap, offset, nsectors); +- return; ++ /* For scrub, our mirror_num should always start at 1. */ ++ ASSERT(stripe->mirror_num >= 1); ++ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, ++ stripe->logical, &mapped_len, &bioc); ++ /* ++ * If we failed, dev will be NULL, and later detailed reports ++ * will just be skipped. 
++ */ ++ if (ret < 0) ++ goto skip; ++ physical = bioc->stripes[stripe_index].physical; ++ dev = bioc->stripes[stripe_index].dev; ++ btrfs_put_bioc(bioc); + } + +- bitmap_set(bitmap, offset, sparity->nsectors - offset); +- bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); +-} ++skip: ++ for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { ++ bool repaired = false; + +-static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, +- u64 start, u32 len) +-{ +- __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len); +-} ++ if (stripe->sectors[sector_nr].is_metadata) { ++ nr_meta_sectors++; ++ } else { ++ nr_data_sectors++; ++ if (!stripe->sectors[sector_nr].csum) ++ nr_nodatacsum_sectors++; ++ } + +-static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, +- u64 start, u32 len) +-{ +- __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len); +-} ++ if (test_bit(sector_nr, &stripe->init_error_bitmap) && ++ !test_bit(sector_nr, &stripe->error_bitmap)) { ++ nr_repaired_sectors++; ++ repaired = true; ++ } + +-static void scrub_block_complete(struct scrub_block *sblock) +-{ +- int corrupted = 0; ++ /* Good sector from the beginning, nothing need to be done. */ ++ if (!test_bit(sector_nr, &stripe->init_error_bitmap)) ++ continue; + +- if (!sblock->no_io_error_seen) { +- corrupted = 1; +- scrub_handle_errored_block(sblock); +- } else { + /* +- * if has checksum error, write via repair mechanism in +- * dev replace case, otherwise write here in dev replace +- * case. ++ * Report error for the corrupted sectors. If repaired, just ++ * output the message of repaired message. + */ +- corrupted = scrub_checksum(sblock); +- if (!corrupted && sblock->sctx->is_dev_replace) +- scrub_write_block_to_dev_replace(sblock); +- } ++ if (repaired) { ++ if (dev) { ++ btrfs_err_rl_in_rcu(fs_info, ++ "fixed up error at logical %llu on dev %s physical %llu", ++ stripe->logical, btrfs_dev_name(dev), ++ physical); ++ } else { ++ btrfs_err_rl_in_rcu(fs_info, ++ "fixed up error at logical %llu on mirror %u", ++ stripe->logical, stripe->mirror_num); ++ } ++ continue; ++ } + +- if (sblock->sparity && corrupted && !sblock->data_corrected) { +- u64 start = sblock->logical; +- u64 end = sblock->logical + +- sblock->sectors[sblock->sector_count - 1]->offset + +- sblock->sctx->fs_info->sectorsize; ++ /* The remaining are all for unrepaired. 
*/ ++ if (dev) { ++ btrfs_err_rl_in_rcu(fs_info, ++ "unable to fixup (regular) error at logical %llu on dev %s physical %llu", ++ stripe->logical, btrfs_dev_name(dev), ++ physical); ++ } else { ++ btrfs_err_rl_in_rcu(fs_info, ++ "unable to fixup (regular) error at logical %llu on mirror %u", ++ stripe->logical, stripe->mirror_num); ++ } + +- ASSERT(end - start <= U32_MAX); +- scrub_parity_mark_sectors_error(sblock->sparity, +- start, end - start); ++ if (test_bit(sector_nr, &stripe->io_error_bitmap)) ++ if (__ratelimit(&rs) && dev) ++ scrub_print_common_warning("i/o error", dev, false, ++ stripe->logical, physical); ++ if (test_bit(sector_nr, &stripe->csum_error_bitmap)) ++ if (__ratelimit(&rs) && dev) ++ scrub_print_common_warning("checksum error", dev, false, ++ stripe->logical, physical); ++ if (test_bit(sector_nr, &stripe->meta_error_bitmap)) ++ if (__ratelimit(&rs) && dev) ++ scrub_print_common_warning("header error", dev, false, ++ stripe->logical, physical); + } +-} + +-static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum) +-{ +- sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits; +- list_del(&sum->list); +- kfree(sum); ++ spin_lock(&sctx->stat_lock); ++ sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; ++ sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; ++ sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; ++ sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; ++ sctx->stat.no_csum += nr_nodatacsum_sectors; ++ sctx->stat.read_errors += ++ bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors); ++ sctx->stat.csum_errors += ++ bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors); ++ sctx->stat.verify_errors += ++ bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors); ++ sctx->stat.uncorrectable_errors += ++ bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); ++ sctx->stat.corrected_errors += nr_repaired_sectors; ++ spin_unlock(&sctx->stat_lock); + } + + /* +- * Find the desired csum for range [logical, logical + sectorsize), and store +- * the csum into @csum. ++ * The main entrance for all read related scrub work, including: + * +- * The search source is sctx->csum_list, which is a pre-populated list +- * storing bytenr ordered csum ranges. We're responsible to cleanup any range +- * that is before @logical. ++ * - Wait for the initial read to finish ++ * - Verify and locate any bad sectors ++ * - Go through the remaining mirrors and try to read as large blocksize as ++ * possible ++ * - Go through all mirrors (including the failed mirror) sector-by-sector + * +- * Return 0 if there is no csum for the range. +- * Return 1 if there is csum for the range and copied to @csum. ++ * Writeback does not happen here, it needs extra synchronization. 
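
The accounting above boils down to two per-stripe bitmaps: init_error_bitmap records what the initial read found, error_bitmap is what is still bad after every repair attempt, and a sector counts as repaired when its bit is set in the former but clear in the latter. Below is a minimal user-space sketch of that bookkeeping; the field names mirror the patch, but the example values and the popcount builtin are illustrative assumptions, not kernel code.

    #include <stdio.h>

    /* Model of the two per-stripe error bitmaps used by the new scrub code. */
    struct stripe_errors {
        unsigned long init_error_bitmap;  /* bad sectors found by the initial read */
        unsigned long error_bitmap;       /* sectors still bad after all repair attempts */
    };

    /* Sectors that were bad at first but read back fine from another mirror. */
    static unsigned long repaired_bits(const struct stripe_errors *e)
    {
        return e->init_error_bitmap & ~e->error_bitmap;
    }

    int main(void)
    {
        struct stripe_errors e = {
            .init_error_bitmap = 0x2d, /* sectors 0, 2, 3 and 5 failed the first read */
            .error_bitmap      = 0x04, /* sector 2 is still bad after every mirror */
        };

        printf("corrected sectors:     %d\n", __builtin_popcountl(repaired_bits(&e)));
        printf("uncorrectable sectors: %d\n", __builtin_popcountl(e.error_bitmap));
        return 0;
    }
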
+ */ +-static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) ++static void scrub_stripe_read_repair_worker(struct work_struct *work) + { +- bool found = false; +- +- while (!list_empty(&sctx->csum_list)) { +- struct btrfs_ordered_sum *sum = NULL; +- unsigned long index; +- unsigned long num_sectors; +- +- sum = list_first_entry(&sctx->csum_list, +- struct btrfs_ordered_sum, list); +- /* The current csum range is beyond our range, no csum found */ +- if (sum->bytenr > logical) +- break; +- +- /* +- * The current sum is before our bytenr, since scrub is always +- * done in bytenr order, the csum will never be used anymore, +- * clean it up so that later calls won't bother with the range, +- * and continue search the next range. +- */ +- if (sum->bytenr + sum->len <= logical) { +- drop_csum_range(sctx, sum); +- continue; +- } +- +- /* Now the csum range covers our bytenr, copy the csum */ +- found = true; +- index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits; +- num_sectors = sum->len >> sctx->fs_info->sectorsize_bits; ++ struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, ++ stripe->bg->length); ++ int mirror; ++ int i; + +- memcpy(csum, sum->sums + index * sctx->fs_info->csum_size, +- sctx->fs_info->csum_size); ++ ASSERT(stripe->mirror_num > 0); + +- /* Cleanup the range if we're at the end of the csum range */ +- if (index == num_sectors - 1) +- drop_csum_range(sctx, sum); +- break; +- } +- if (!found) +- return 0; +- return 1; +-} ++ wait_scrub_stripe_io(stripe); ++ scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); ++ /* Save the initial failed bitmap for later repair and report usage. */ ++ stripe->init_error_bitmap = stripe->error_bitmap; + +-/* scrub extent tries to collect up to 64 kB for each bio */ +-static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, +- u64 logical, u32 len, +- u64 physical, struct btrfs_device *dev, u64 flags, +- u64 gen, int mirror_num) +-{ +- struct btrfs_device *src_dev = dev; +- u64 src_physical = physical; +- int src_mirror = mirror_num; +- int ret; +- u8 csum[BTRFS_CSUM_SIZE]; +- u32 blocksize; ++ if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) ++ goto out; + +- if (flags & BTRFS_EXTENT_FLAG_DATA) { +- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) +- blocksize = map->stripe_len; +- else +- blocksize = sctx->fs_info->sectorsize; +- spin_lock(&sctx->stat_lock); +- sctx->stat.data_extents_scrubbed++; +- sctx->stat.data_bytes_scrubbed += len; +- spin_unlock(&sctx->stat_lock); +- } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { +- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) +- blocksize = map->stripe_len; +- else +- blocksize = sctx->fs_info->nodesize; +- spin_lock(&sctx->stat_lock); +- sctx->stat.tree_extents_scrubbed++; +- sctx->stat.tree_bytes_scrubbed += len; +- spin_unlock(&sctx->stat_lock); +- } else { +- blocksize = sctx->fs_info->sectorsize; +- WARN_ON(1); ++ /* ++ * Try all remaining mirrors. ++ * ++ * Here we still try to read as large block as possible, as this is ++ * faster and we have extra safety nets to rely on. 
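
The retry order relies on calc_next_mirror(): mirror numbers are 1-based and the successor of the last copy wraps back to 1, so walking from the failed mirror visits every other copy exactly once before returning to the start. A small stand-alone model of that ordering (the copy count and failed mirror are made-up example values):

    #include <stdio.h>

    /* Mirror numbers are 1-based; the successor of the last copy wraps to 1. */
    static int calc_next_mirror(int mirror, int num_copies)
    {
        return (mirror + 1 > num_copies) ? 1 : mirror + 1;
    }

    int main(void)
    {
        const int num_copies = 3;    /* e.g. a three-copy profile */
        const int failed_mirror = 2; /* the mirror the initial read used */

        printf("repair order after mirror %d:", failed_mirror);
        for (int m = calc_next_mirror(failed_mirror, num_copies);
             m != failed_mirror;
             m = calc_next_mirror(m, num_copies))
            printf(" %d", m);
        printf("\n"); /* prints " 3 1" */
        return 0;
    }

The large-block loop that follows skips the original mirror on purpose; it is only revisited by the final sector-by-sector pass.
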
++ */ ++ for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); ++ mirror != stripe->mirror_num; ++ mirror = calc_next_mirror(mirror, num_copies)) { ++ const unsigned long old_error_bitmap = stripe->error_bitmap; ++ ++ scrub_stripe_submit_repair_read(stripe, mirror, ++ BTRFS_STRIPE_LEN, false); ++ wait_scrub_stripe_io(stripe); ++ scrub_verify_one_stripe(stripe, old_error_bitmap); ++ if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) ++ goto out; + } + + /* +- * For dev-replace case, we can have @dev being a missing device. +- * Regular scrub will avoid its execution on missing device at all, +- * as that would trigger tons of read error. ++ * Last safety net, try re-checking all mirrors, including the failed ++ * one, sector-by-sector. + * +- * Reading from missing device will cause read error counts to +- * increase unnecessarily. +- * So here we change the read source to a good mirror. ++ * As if one sector failed the drive's internal csum, the whole read ++ * containing the offending sector would be marked as error. ++ * Thus here we do sector-by-sector read. ++ * ++ * This can be slow, thus we only try it as the last resort. + */ +- if (sctx->is_dev_replace && !dev->bdev) +- scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical, +- &src_dev, &src_mirror); +- while (len) { +- u32 l = min(len, blocksize); +- int have_csum = 0; +- +- if (flags & BTRFS_EXTENT_FLAG_DATA) { +- /* push csums to sbio */ +- have_csum = scrub_find_csum(sctx, logical, csum); +- if (have_csum == 0) +- ++sctx->stat.no_csum; +- } +- ret = scrub_sectors(sctx, logical, l, src_physical, src_dev, +- flags, gen, src_mirror, +- have_csum ? csum : NULL, physical); +- if (ret) +- return ret; +- len -= l; +- logical += l; +- physical += l; +- src_physical += l; ++ ++ for (i = 0, mirror = stripe->mirror_num; ++ i < num_copies; ++ i++, mirror = calc_next_mirror(mirror, num_copies)) { ++ const unsigned long old_error_bitmap = stripe->error_bitmap; ++ ++ scrub_stripe_submit_repair_read(stripe, mirror, ++ fs_info->sectorsize, true); ++ wait_scrub_stripe_io(stripe); ++ scrub_verify_one_stripe(stripe, old_error_bitmap); ++ if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) ++ goto out; + } +- return 0; ++out: ++ scrub_stripe_report_errors(stripe->sctx, stripe); ++ set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); ++ wake_up(&stripe->repair_wait); + } + +-static int scrub_sectors_for_parity(struct scrub_parity *sparity, +- u64 logical, u32 len, +- u64 physical, struct btrfs_device *dev, +- u64 flags, u64 gen, int mirror_num, u8 *csum) ++static void scrub_read_endio(struct btrfs_bio *bbio) + { +- struct scrub_ctx *sctx = sparity->sctx; +- struct scrub_block *sblock; +- const u32 sectorsize = sctx->fs_info->sectorsize; +- int index; +- +- ASSERT(IS_ALIGNED(len, sectorsize)); ++ struct scrub_stripe *stripe = bbio->private; + +- sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num); +- if (!sblock) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- return -ENOMEM; ++ if (bbio->bio.bi_status) { ++ bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); ++ bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); ++ } else { ++ bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); ++ } ++ bio_put(&bbio->bio); ++ if (atomic_dec_and_test(&stripe->pending_io)) { ++ wake_up(&stripe->io_wait); ++ INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); ++ queue_work(stripe->bg->fs_info->scrub_workers, 
&stripe->work); + } ++} + +- sblock->sparity = sparity; +- scrub_parity_get(sparity); ++static void scrub_write_endio(struct btrfs_bio *bbio) ++{ ++ struct scrub_stripe *stripe = bbio->private; ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ struct bio_vec *bvec; ++ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); ++ u32 bio_size = 0; ++ int i; + +- for (index = 0; len > 0; index++) { +- struct scrub_sector *sector; ++ bio_for_each_bvec_all(bvec, &bbio->bio, i) ++ bio_size += bvec->bv_len; + +- sector = alloc_scrub_sector(sblock, logical); +- if (!sector) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- scrub_block_put(sblock); +- return -ENOMEM; +- } +- sblock->sectors[index] = sector; +- /* For scrub parity */ +- scrub_sector_get(sector); +- list_add_tail(§or->list, &sparity->sectors_list); +- sector->flags = flags; +- sector->generation = gen; +- if (csum) { +- sector->have_csum = 1; +- memcpy(sector->csum, csum, sctx->fs_info->csum_size); +- } else { +- sector->have_csum = 0; +- } ++ if (bbio->bio.bi_status) { ++ unsigned long flags; + +- /* Iterate over the stripe range in sectorsize steps */ +- len -= sectorsize; +- logical += sectorsize; +- physical += sectorsize; ++ spin_lock_irqsave(&stripe->write_error_lock, flags); ++ bitmap_set(&stripe->write_error_bitmap, sector_nr, ++ bio_size >> fs_info->sectorsize_bits); ++ spin_unlock_irqrestore(&stripe->write_error_lock, flags); + } ++ bio_put(&bbio->bio); ++ ++ if (atomic_dec_and_test(&stripe->pending_io)) ++ wake_up(&stripe->io_wait); ++} ++ ++/* ++ * Submit the write bio(s) for the sectors specified by @write_bitmap. ++ * ++ * Here we utilize btrfs_submit_repair_write(), which has some extra benefits: ++ * ++ * - Only needs logical bytenr and mirror_num ++ * Just like the scrub read path ++ * ++ * - Would only result in writes to the specified mirror ++ * Unlike the regular writeback path, which would write back to all stripes ++ * ++ * - Handle dev-replace and read-repair writeback differently ++ */ ++static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, ++ unsigned long write_bitmap, bool dev_replace) ++{ ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ struct btrfs_bio *bbio = NULL; ++ const bool zoned = btrfs_is_zoned(fs_info); ++ int sector_nr; + +- WARN_ON(sblock->sector_count == 0); +- for (index = 0; index < sblock->sector_count; index++) { +- struct scrub_sector *sector = sblock->sectors[index]; ++ for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { ++ struct page *page = scrub_stripe_get_page(stripe, sector_nr); ++ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + int ret; + +- ret = scrub_add_sector_to_rd_bio(sctx, sector); +- if (ret) { +- scrub_block_put(sblock); +- return ret; ++ /* We should only writeback sectors covered by an extent. */ ++ ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap)); ++ ++ /* Cannot merge with previous sector, submit the current one. */ ++ if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { ++ fill_writer_pointer_gap(sctx, stripe->physical + ++ (sector_nr << fs_info->sectorsize_bits)); ++ atomic_inc(&stripe->pending_io); ++ btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); ++ /* For zoned writeback, queue depth must be 1. 
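
Both the repair-read path earlier and the writeback path here follow the same batching pattern: walk a sector bitmap, keep extending the current bio while the previous sector is also set and the bio is still under its size cap, otherwise submit it and start a new one. The sketch below reproduces only that range-merging decision in user space; the 4 KiB sector size and the byte cap are illustrative assumptions.

    #include <stdio.h>

    #define SECTOR_SIZE 4096u /* assumed sector size, the common case */

    /*
     * Print the sector ranges a caller would submit as single bios: a range is
     * extended while the previous sector is also set and the accumulated size
     * is still below @max_bytes, mirroring the submit conditions of the
     * repair-read and writeback loops.
     */
    static void batch_ranges(unsigned long bitmap, int nr_sectors, unsigned int max_bytes)
    {
        int start = -1;
        unsigned int len = 0;

        for (int i = 0; i < nr_sectors; i++) {
            if (!(bitmap & (1UL << i)))
                continue;

            /* Cannot merge: previous sector not set, or the bio is already full. */
            if (start >= 0 &&
                (!(bitmap & (1UL << (i - 1))) || len >= max_bytes)) {
                printf("submit sectors [%d, %d)\n", start, start + (int)(len / SECTOR_SIZE));
                start = -1;
                len = 0;
            }
            if (start < 0)
                start = i;
            len += SECTOR_SIZE;
        }
        if (start >= 0)
            printf("submit sectors [%d, %d)\n", start, start + (int)(len / SECTOR_SIZE));
    }

    int main(void)
    {
        /* Sectors 0-2 and 5-6 need I/O; cap each bio at two sectors. */
        batch_ranges(0x67, 16, 2 * SECTOR_SIZE);
        return 0;
    }
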
*/ ++ if (zoned) ++ wait_scrub_stripe_io(stripe); ++ bbio = NULL; + } ++ if (!bbio) { ++ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE, ++ fs_info, scrub_write_endio, stripe); ++ bbio->bio.bi_iter.bi_sector = (stripe->logical + ++ (sector_nr << fs_info->sectorsize_bits)) >> ++ SECTOR_SHIFT; ++ } ++ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); ++ ASSERT(ret == fs_info->sectorsize); ++ } ++ if (bbio) { ++ fill_writer_pointer_gap(sctx, bbio->bio.bi_iter.bi_sector << ++ SECTOR_SHIFT); ++ atomic_inc(&stripe->pending_io); ++ btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); ++ if (zoned) ++ wait_scrub_stripe_io(stripe); + } +- +- /* Last one frees, either here or in bio completion for last sector */ +- scrub_block_put(sblock); +- return 0; + } + +-static int scrub_extent_for_parity(struct scrub_parity *sparity, +- u64 logical, u32 len, +- u64 physical, struct btrfs_device *dev, +- u64 flags, u64 gen, int mirror_num) ++/* ++ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 ++ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. ++ */ ++static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device, ++ unsigned int bio_size) + { +- struct scrub_ctx *sctx = sparity->sctx; +- int ret; +- u8 csum[BTRFS_CSUM_SIZE]; +- u32 blocksize; ++ const int time_slice = 1000; ++ s64 delta; ++ ktime_t now; ++ u32 div; ++ u64 bwlimit; + +- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { +- scrub_parity_mark_sectors_error(sparity, logical, len); +- return 0; ++ bwlimit = READ_ONCE(device->scrub_speed_max); ++ if (bwlimit == 0) ++ return; ++ ++ /* ++ * Slice is divided into intervals when the IO is submitted, adjust by ++ * bwlimit and maximum of 64 intervals. ++ */ ++ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); ++ div = min_t(u32, 64, div); ++ ++ /* Start new epoch, set deadline */ ++ now = ktime_get(); ++ if (sctx->throttle_deadline == 0) { ++ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); ++ sctx->throttle_sent = 0; + } + +- if (flags & BTRFS_EXTENT_FLAG_DATA) { +- blocksize = sparity->stripe_len; +- } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { +- blocksize = sparity->stripe_len; ++ /* Still in the time to send? */ ++ if (ktime_before(now, sctx->throttle_deadline)) { ++ /* If current bio is within the limit, send it */ ++ sctx->throttle_sent += bio_size; ++ if (sctx->throttle_sent <= div_u64(bwlimit, div)) ++ return; ++ ++ /* We're over the limit, sleep until the rest of the slice */ ++ delta = ktime_ms_delta(sctx->throttle_deadline, now); + } else { +- blocksize = sctx->fs_info->sectorsize; +- WARN_ON(1); ++ /* New request after deadline, start new epoch */ ++ delta = 0; + } + +- while (len) { +- u32 l = min(len, blocksize); +- int have_csum = 0; ++ if (delta) { ++ long timeout; + +- if (flags & BTRFS_EXTENT_FLAG_DATA) { +- /* push csums to sbio */ +- have_csum = scrub_find_csum(sctx, logical, csum); +- if (have_csum == 0) +- goto skip; +- } +- ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev, +- flags, gen, mirror_num, +- have_csum ? 
csum : NULL); +- if (ret) +- return ret; +-skip: +- len -= l; +- logical += l; +- physical += l; ++ timeout = div_u64(delta * HZ, 1000); ++ schedule_timeout_interruptible(timeout); + } +- return 0; ++ ++ /* Next call will start the deadline period */ ++ sctx->throttle_deadline = 0; + } + + /* +@@ -2908,10 +1266,7 @@ static int get_raid56_logic_offset(u64 physical, int num, + { + int i; + int j = 0; +- u64 stripe_nr; + u64 last_offset; +- u32 stripe_index; +- u32 rot; + const int data_stripes = nr_data_stripes(map); + + last_offset = (physical - map->stripes[num].physical) * data_stripes; +@@ -2920,13 +1275,17 @@ static int get_raid56_logic_offset(u64 physical, int num, + + *offset = last_offset; + for (i = 0; i < data_stripes; i++) { +- *offset = last_offset + i * map->stripe_len; ++ u32 stripe_nr; ++ u32 stripe_index; ++ u32 rot; + +- stripe_nr = div64_u64(*offset, map->stripe_len); +- stripe_nr = div_u64(stripe_nr, data_stripes); ++ *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT); ++ ++ stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; + + /* Work out the disk rotation on this stripe-set */ +- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); ++ rot = stripe_nr % map->num_stripes; ++ stripe_nr /= map->num_stripes; + /* calculate which stripe this data locates */ + rot += i; + stripe_index = rot % map->num_stripes; +@@ -2935,123 +1294,10 @@ static int get_raid56_logic_offset(u64 physical, int num, + if (stripe_index < num) + j++; + } +- *offset = last_offset + j * map->stripe_len; ++ *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT); + return 1; + } + +-static void scrub_free_parity(struct scrub_parity *sparity) +-{ +- struct scrub_ctx *sctx = sparity->sctx; +- struct scrub_sector *curr, *next; +- int nbits; +- +- nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors); +- if (nbits) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.read_errors += nbits; +- sctx->stat.uncorrectable_errors += nbits; +- spin_unlock(&sctx->stat_lock); +- } +- +- list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) { +- list_del_init(&curr->list); +- scrub_sector_put(curr); +- } +- +- kfree(sparity); +-} +- +-static void scrub_parity_bio_endio_worker(struct work_struct *work) +-{ +- struct scrub_parity *sparity = container_of(work, struct scrub_parity, +- work); +- struct scrub_ctx *sctx = sparity->sctx; +- +- btrfs_bio_counter_dec(sctx->fs_info); +- scrub_free_parity(sparity); +- scrub_pending_bio_dec(sctx); +-} +- +-static void scrub_parity_bio_endio(struct bio *bio) +-{ +- struct scrub_parity *sparity = bio->bi_private; +- struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; +- +- if (bio->bi_status) +- bitmap_or(&sparity->ebitmap, &sparity->ebitmap, +- &sparity->dbitmap, sparity->nsectors); +- +- bio_put(bio); +- +- INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker); +- queue_work(fs_info->scrub_parity_workers, &sparity->work); +-} +- +-static void scrub_parity_check_and_repair(struct scrub_parity *sparity) +-{ +- struct scrub_ctx *sctx = sparity->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- struct bio *bio; +- struct btrfs_raid_bio *rbio; +- struct btrfs_io_context *bioc = NULL; +- u64 length; +- int ret; +- +- if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap, +- &sparity->ebitmap, sparity->nsectors)) +- goto out; +- +- length = sparity->logic_end - sparity->logic_start; +- +- btrfs_bio_counter_inc_blocked(fs_info); +- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, +- &length, &bioc); +- if (ret 
|| !bioc || !bioc->raid_map) +- goto bioc_out; +- +- bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); +- bio->bi_iter.bi_sector = sparity->logic_start >> 9; +- bio->bi_private = sparity; +- bio->bi_end_io = scrub_parity_bio_endio; +- +- rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, +- sparity->scrub_dev, +- &sparity->dbitmap, +- sparity->nsectors); +- btrfs_put_bioc(bioc); +- if (!rbio) +- goto rbio_out; +- +- scrub_pending_bio_inc(sctx); +- raid56_parity_submit_scrub_rbio(rbio); +- return; +- +-rbio_out: +- bio_put(bio); +-bioc_out: +- btrfs_bio_counter_dec(fs_info); +- bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, +- sparity->nsectors); +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +-out: +- scrub_free_parity(sparity); +-} +- +-static void scrub_parity_get(struct scrub_parity *sparity) +-{ +- refcount_inc(&sparity->refs); +-} +- +-static void scrub_parity_put(struct scrub_parity *sparity) +-{ +- if (!refcount_dec_and_test(&sparity->refs)) +- return; +- +- scrub_parity_check_and_repair(sparity); +-} +- + /* + * Return 0 if the extent item range covers any byte of the range. + * Return <0 if the extent item is before @search_start. +@@ -3178,226 +1424,533 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, + *generation_ret = btrfs_extent_generation(path->nodes[0], ei); + } + +-static bool does_range_cross_boundary(u64 extent_start, u64 extent_len, +- u64 boundary_start, u64 boudary_len) ++static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, ++ u64 physical, u64 physical_end) ++{ ++ struct btrfs_fs_info *fs_info = sctx->fs_info; ++ int ret = 0; ++ ++ if (!btrfs_is_zoned(fs_info)) ++ return 0; ++ ++ mutex_lock(&sctx->wr_lock); ++ if (sctx->write_pointer < physical_end) { ++ ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, ++ physical, ++ sctx->write_pointer); ++ if (ret) ++ btrfs_err(fs_info, ++ "zoned: failed to recover write pointer"); ++ } ++ mutex_unlock(&sctx->wr_lock); ++ btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); ++ ++ return ret; ++} ++ ++static void fill_one_extent_info(struct btrfs_fs_info *fs_info, ++ struct scrub_stripe *stripe, ++ u64 extent_start, u64 extent_len, ++ u64 extent_flags, u64 extent_gen) ++{ ++ for (u64 cur_logical = max(stripe->logical, extent_start); ++ cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN, ++ extent_start + extent_len); ++ cur_logical += fs_info->sectorsize) { ++ const int nr_sector = (cur_logical - stripe->logical) >> ++ fs_info->sectorsize_bits; ++ struct scrub_sector_verification *sector = ++ &stripe->sectors[nr_sector]; ++ ++ set_bit(nr_sector, &stripe->extent_sector_bitmap); ++ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ++ sector->is_metadata = true; ++ sector->generation = extent_gen; ++ } ++ } ++} ++ ++static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) ++{ ++ stripe->extent_sector_bitmap = 0; ++ stripe->init_error_bitmap = 0; ++ stripe->error_bitmap = 0; ++ stripe->io_error_bitmap = 0; ++ stripe->csum_error_bitmap = 0; ++ stripe->meta_error_bitmap = 0; ++} ++ ++/* ++ * Locate one stripe which has at least one extent in its range. ++ * ++ * Return 0 if found such stripe, and store its info into @stripe. ++ * Return >0 if there is no such stripe in the specified range. ++ * Return <0 for error. 
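
fill_one_extent_info() above is pure arithmetic: clamp the extent to the stripe's BTRFS_STRIPE_LEN window and set one bit per covered sector, where the bit index is the byte offset into the stripe shifted down by sectorsize_bits. A user-space rendition of that clamping, assuming the common 4 KiB sector size and 64 KiB stripe length:

    #include <stdio.h>

    #define SECTORSIZE_BITS 12u              /* 4 KiB sectors (assumed) */
    #define STRIPE_LEN      (64u * 1024u)    /* 64 KiB stripe length (assumed) */

    /*
     * Set one bit per sector of the stripe starting at @stripe_logical that is
     * covered by the extent [@extent_start, @extent_start + @extent_len).
     */
    static unsigned long mark_extent_sectors(unsigned long long stripe_logical,
                                             unsigned long long extent_start,
                                             unsigned long long extent_len)
    {
        unsigned long bitmap = 0;
        unsigned long long cur = extent_start > stripe_logical ? extent_start : stripe_logical;
        unsigned long long end = extent_start + extent_len;

        if (end > stripe_logical + STRIPE_LEN)
            end = stripe_logical + STRIPE_LEN;

        for (; cur < end; cur += 1u << SECTORSIZE_BITS)
            bitmap |= 1UL << ((cur - stripe_logical) >> SECTORSIZE_BITS);

        return bitmap;
    }

    int main(void)
    {
        /* A 24 KiB extent starting 8 KiB into the stripe covers sectors 2 to 7. */
        unsigned long bm = mark_extent_sectors(1ULL << 20, (1ULL << 20) + 8192, 24 * 1024);

        printf("extent_sector_bitmap = 0x%lx\n", bm); /* prints 0xfc */
        return 0;
    }
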
++ */ ++static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, ++ struct btrfs_device *dev, u64 physical, ++ int mirror_num, u64 logical_start, ++ u32 logical_len, ++ struct scrub_stripe *stripe) ++{ ++ struct btrfs_fs_info *fs_info = bg->fs_info; ++ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); ++ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); ++ const u64 logical_end = logical_start + logical_len; ++ struct btrfs_path path = { 0 }; ++ u64 cur_logical = logical_start; ++ u64 stripe_end; ++ u64 extent_start; ++ u64 extent_len; ++ u64 extent_flags; ++ u64 extent_gen; ++ int ret; ++ ++ memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * ++ stripe->nr_sectors); ++ scrub_stripe_reset_bitmaps(stripe); ++ ++ /* The range must be inside the bg. */ ++ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); ++ ++ path.search_commit_root = 1; ++ path.skip_locking = 1; ++ ++ ret = find_first_extent_item(extent_root, &path, logical_start, logical_len); ++ /* Either error or not found. */ ++ if (ret) ++ goto out; ++ get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen); ++ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) ++ stripe->nr_meta_extents++; ++ if (extent_flags & BTRFS_EXTENT_FLAG_DATA) ++ stripe->nr_data_extents++; ++ cur_logical = max(extent_start, cur_logical); ++ ++ /* ++ * Round down to stripe boundary. ++ * ++ * The extra calculation against bg->start is to handle block groups ++ * whose logical bytenr is not BTRFS_STRIPE_LEN aligned. ++ */ ++ stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) + ++ bg->start; ++ stripe->physical = physical + stripe->logical - logical_start; ++ stripe->dev = dev; ++ stripe->bg = bg; ++ stripe->mirror_num = mirror_num; ++ stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1; ++ ++ /* Fill the first extent info into stripe->sectors[] array. */ ++ fill_one_extent_info(fs_info, stripe, extent_start, extent_len, ++ extent_flags, extent_gen); ++ cur_logical = extent_start + extent_len; ++ ++ /* Fill the extent info for the remaining sectors. */ ++ while (cur_logical <= stripe_end) { ++ ret = find_first_extent_item(extent_root, &path, cur_logical, ++ stripe_end - cur_logical + 1); ++ if (ret < 0) ++ goto out; ++ if (ret > 0) { ++ ret = 0; ++ break; ++ } ++ get_extent_info(&path, &extent_start, &extent_len, ++ &extent_flags, &extent_gen); ++ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) ++ stripe->nr_meta_extents++; ++ if (extent_flags & BTRFS_EXTENT_FLAG_DATA) ++ stripe->nr_data_extents++; ++ fill_one_extent_info(fs_info, stripe, extent_start, extent_len, ++ extent_flags, extent_gen); ++ cur_logical = extent_start + extent_len; ++ } ++ ++ /* Now fill the data csum. */ ++ if (bg->flags & BTRFS_BLOCK_GROUP_DATA) { ++ int sector_nr; ++ unsigned long csum_bitmap = 0; ++ ++ /* Csum space should have already been allocated. */ ++ ASSERT(stripe->csums); ++ ++ /* ++ * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN ++ * should contain at most 16 sectors. 
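
The bound encoded by that ASSERT is easy to check by hand: with the usual 4 KiB sector size, 64 KiB >> 12 is 16 sectors per stripe, which fits in an unsigned long on both 32-bit and 64-bit targets. A compile-time version of the same arithmetic, with the sector and stripe sizes as assumptions:

    #include <assert.h>
    #include <limits.h>

    #define SECTORSIZE_BITS 12u           /* 4 KiB sectors (assumed) */
    #define STRIPE_LEN      (64u * 1024u) /* 64 KiB stripe length (assumed) */

    int main(void)
    {
        /* 65536 >> 12 == 16 sectors per stripe, so one unsigned long is enough. */
        static_assert((STRIPE_LEN >> SECTORSIZE_BITS) == 16,
                      "16 sectors per 64 KiB stripe with 4 KiB sectors");
        static_assert(sizeof(unsigned long) * CHAR_BIT >= (STRIPE_LEN >> SECTORSIZE_BITS),
                      "per-stripe bitmap fits in a single unsigned long");
        return 0;
    }
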
++ */ ++ ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); ++ ++ ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical, ++ stripe_end, stripe->csums, ++ &csum_bitmap, true); ++ if (ret < 0) ++ goto out; ++ if (ret > 0) ++ ret = 0; ++ ++ for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) { ++ stripe->sectors[sector_nr].csum = stripe->csums + ++ sector_nr * fs_info->csum_size; ++ } ++ } ++ set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); ++out: ++ btrfs_release_path(&path); ++ return ret; ++} ++ ++static void scrub_reset_stripe(struct scrub_stripe *stripe) ++{ ++ scrub_stripe_reset_bitmaps(stripe); ++ ++ stripe->nr_meta_extents = 0; ++ stripe->nr_data_extents = 0; ++ stripe->state = 0; ++ ++ for (int i = 0; i < stripe->nr_sectors; i++) { ++ stripe->sectors[i].is_metadata = false; ++ stripe->sectors[i].csum = NULL; ++ stripe->sectors[i].generation = 0; ++ } ++} ++ ++static void scrub_submit_initial_read(struct scrub_ctx *sctx, ++ struct scrub_stripe *stripe) ++{ ++ struct btrfs_fs_info *fs_info = sctx->fs_info; ++ struct btrfs_bio *bbio; ++ int mirror = stripe->mirror_num; ++ ++ ASSERT(stripe->bg); ++ ASSERT(stripe->mirror_num > 0); ++ ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); ++ ++ bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, ++ scrub_read_endio, stripe); ++ ++ /* Read the whole stripe. */ ++ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; ++ for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { ++ int ret; ++ ++ ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); ++ /* We should have allocated enough bio vectors. */ ++ ASSERT(ret == PAGE_SIZE); ++ } ++ atomic_inc(&stripe->pending_io); ++ ++ /* ++ * For dev-replace, either user asks to avoid the source dev, or ++ * the device is missing, we try the next mirror instead. 
++ */ ++ if (sctx->is_dev_replace && ++ (fs_info->dev_replace.cont_reading_from_srcdev_mode == ++ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID || ++ !stripe->dev->bdev)) { ++ int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, ++ stripe->bg->length); ++ ++ mirror = calc_next_mirror(mirror, num_copies); ++ } ++ btrfs_submit_bio(bbio, mirror); ++} ++ ++static bool stripe_has_metadata_error(struct scrub_stripe *stripe) + { +- return (extent_start < boundary_start && +- extent_start + extent_len > boundary_start) || +- (extent_start < boundary_start + boudary_len && +- extent_start + extent_len > boundary_start + boudary_len); ++ int i; ++ ++ for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) { ++ if (stripe->sectors[i].is_metadata) { ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ ++ btrfs_err(fs_info, ++ "stripe %llu has unrepaired metadata sector at %llu", ++ stripe->logical, ++ stripe->logical + (i << fs_info->sectorsize_bits)); ++ return true; ++ } ++ } ++ return false; + } + +-static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, +- struct scrub_parity *sparity, +- struct map_lookup *map, +- struct btrfs_device *sdev, +- struct btrfs_path *path, +- u64 logical) ++static int flush_scrub_stripes(struct scrub_ctx *sctx) + { + struct btrfs_fs_info *fs_info = sctx->fs_info; +- struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); +- struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical); +- u64 cur_logical = logical; +- int ret; ++ struct scrub_stripe *stripe; ++ const int nr_stripes = sctx->cur_stripe; ++ int ret = 0; + +- ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); ++ if (!nr_stripes) ++ return 0; + +- /* Path must not be populated */ +- ASSERT(!path->nodes[0]); ++ ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); + +- while (cur_logical < logical + map->stripe_len) { +- struct btrfs_io_context *bioc = NULL; +- struct btrfs_device *extent_dev; +- u64 extent_start; +- u64 extent_size; +- u64 mapped_length; +- u64 extent_flags; +- u64 extent_gen; +- u64 extent_physical; +- u64 extent_mirror_num; +- +- ret = find_first_extent_item(extent_root, path, cur_logical, +- logical + map->stripe_len - cur_logical); +- /* No more extent item in this data stripe */ +- if (ret > 0) { +- ret = 0; +- break; +- } +- if (ret < 0) +- break; +- get_extent_info(path, &extent_start, &extent_size, &extent_flags, +- &extent_gen); ++ scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, ++ nr_stripes << BTRFS_STRIPE_LEN_SHIFT); ++ for (int i = 0; i < nr_stripes; i++) { ++ stripe = &sctx->stripes[i]; ++ scrub_submit_initial_read(sctx, stripe); ++ } + +- /* Metadata should not cross stripe boundaries */ +- if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && +- does_range_cross_boundary(extent_start, extent_size, +- logical, map->stripe_len)) { +- btrfs_err(fs_info, +- "scrub: tree block %llu spanning stripes, ignored. logical=%llu", +- extent_start, logical); +- spin_lock(&sctx->stat_lock); +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- cur_logical += extent_size; +- continue; +- } ++ for (int i = 0; i < nr_stripes; i++) { ++ stripe = &sctx->stripes[i]; ++ ++ wait_event(stripe->repair_wait, ++ test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); ++ } + +- /* Skip hole range which doesn't have any extent */ +- cur_logical = max(extent_start, cur_logical); ++ /* ++ * Submit the repaired sectors. 
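
flush_scrub_stripes() hands the whole batch size to scrub_throttle_dev_io(), whose limiter splits each one-second slice into between 1 and 64 intervals (one per 16 MiB of configured bandwidth) and sleeps out the rest of the slice once the bytes sent exceed the per-interval budget. The sketch below reproduces only that budget arithmetic; the sample speed limits are arbitrary assumptions.

    #include <stdio.h>
    #include <stdint.h>

    /*
     * For a given scrub_speed_max, a 1000 ms slice is divided into
     * div = clamp(bwlimit / 16 MiB, 1, 64) intervals, and each interval may
     * send at most bwlimit / div bytes before the submitter sleeps out the
     * remainder of the slice.
     */
    static void throttle_budget(uint64_t bwlimit)
    {
        uint32_t div = (uint32_t)(bwlimit / (16 * 1024 * 1024));

        if (div < 1)
            div = 1;
        if (div > 64)
            div = 64;

        printf("limit %llu MiB/s -> %u intervals of %u ms, %llu bytes each\n",
               (unsigned long long)(bwlimit >> 20), (unsigned)div,
               (unsigned)(1000u / div), (unsigned long long)(bwlimit / div));
    }

    int main(void)
    {
        throttle_budget(100ULL << 20); /* 100 MiB/s: 6 intervals */
        throttle_budget(2ULL << 30);   /* 2 GiB/s: capped at 64 intervals */
        return 0;
    }

With scrub_speed_max left at 0 the limiter is bypassed entirely.
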
For zoned case, we cannot do repair ++ * in-place, but queue the bg to be relocated. ++ */ ++ if (btrfs_is_zoned(fs_info)) { ++ for (int i = 0; i < nr_stripes; i++) { ++ stripe = &sctx->stripes[i]; + +- /* Truncate the range inside this data stripe */ +- extent_size = min(extent_start + extent_size, +- logical + map->stripe_len) - cur_logical; +- extent_start = cur_logical; +- ASSERT(extent_size <= U32_MAX); ++ if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) { ++ btrfs_repair_one_zone(fs_info, ++ sctx->stripes[0].bg->start); ++ break; ++ } ++ } ++ } else { ++ for (int i = 0; i < nr_stripes; i++) { ++ unsigned long repaired; + +- scrub_parity_mark_sectors_data(sparity, extent_start, extent_size); ++ stripe = &sctx->stripes[i]; + +- mapped_length = extent_size; +- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start, +- &mapped_length, &bioc, 0); +- if (!ret && (!bioc || mapped_length < extent_size)) +- ret = -EIO; +- if (ret) { +- btrfs_put_bioc(bioc); +- scrub_parity_mark_sectors_error(sparity, extent_start, +- extent_size); +- break; ++ bitmap_andnot(&repaired, &stripe->init_error_bitmap, ++ &stripe->error_bitmap, stripe->nr_sectors); ++ scrub_write_sectors(sctx, stripe, repaired, false); + } +- extent_physical = bioc->stripes[0].physical; +- extent_mirror_num = bioc->mirror_num; +- extent_dev = bioc->stripes[0].dev; +- btrfs_put_bioc(bioc); ++ } + +- ret = btrfs_lookup_csums_list(csum_root, extent_start, +- extent_start + extent_size - 1, +- &sctx->csum_list, 1, false); +- if (ret) { +- scrub_parity_mark_sectors_error(sparity, extent_start, +- extent_size); +- break; ++ /* Submit for dev-replace. */ ++ if (sctx->is_dev_replace) { ++ /* ++ * For dev-replace, if we know there is something wrong with ++ * metadata, we should immedately abort. ++ */ ++ for (int i = 0; i < nr_stripes; i++) { ++ if (stripe_has_metadata_error(&sctx->stripes[i])) { ++ ret = -EIO; ++ goto out; ++ } + } ++ for (int i = 0; i < nr_stripes; i++) { ++ unsigned long good; + +- ret = scrub_extent_for_parity(sparity, extent_start, +- extent_size, extent_physical, +- extent_dev, extent_flags, +- extent_gen, extent_mirror_num); +- scrub_free_csums(sctx); ++ stripe = &sctx->stripes[i]; + +- if (ret) { +- scrub_parity_mark_sectors_error(sparity, extent_start, +- extent_size); +- break; ++ ASSERT(stripe->dev == fs_info->dev_replace.srcdev); ++ ++ bitmap_andnot(&good, &stripe->extent_sector_bitmap, ++ &stripe->error_bitmap, stripe->nr_sectors); ++ scrub_write_sectors(sctx, stripe, good, true); + } ++ } + +- cond_resched(); +- cur_logical += extent_size; ++ /* Wait for the above writebacks to finish. 
*/ ++ for (int i = 0; i < nr_stripes; i++) { ++ stripe = &sctx->stripes[i]; ++ ++ wait_scrub_stripe_io(stripe); ++ scrub_reset_stripe(stripe); + } +- btrfs_release_path(path); ++out: ++ sctx->cur_stripe = 0; + return ret; + } + +-static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, +- struct map_lookup *map, +- struct btrfs_device *sdev, +- u64 logic_start, +- u64 logic_end) ++static void raid56_scrub_wait_endio(struct bio *bio) + { +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- struct btrfs_path *path; +- u64 cur_logical; ++ complete(bio->bi_private); ++} ++ ++static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, ++ struct btrfs_device *dev, int mirror_num, ++ u64 logical, u32 length, u64 physical) ++{ ++ struct scrub_stripe *stripe; + int ret; +- struct scrub_parity *sparity; +- int nsectors; + +- path = btrfs_alloc_path(); +- if (!path) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- return -ENOMEM; ++ /* No available slot, submit all stripes and wait for them. */ ++ if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) { ++ ret = flush_scrub_stripes(sctx); ++ if (ret < 0) ++ return ret; + } +- path->search_commit_root = 1; +- path->skip_locking = 1; + +- ASSERT(map->stripe_len <= U32_MAX); +- nsectors = map->stripe_len >> fs_info->sectorsize_bits; +- ASSERT(nsectors <= BITS_PER_LONG); +- sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS); +- if (!sparity) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_free_path(path); +- return -ENOMEM; +- } ++ stripe = &sctx->stripes[sctx->cur_stripe]; ++ ++ /* We can queue one stripe using the remaining slot. */ ++ scrub_reset_stripe(stripe); ++ ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num, ++ logical, length, stripe); ++ /* Either >0 as no more extents or <0 for error. 
*/ ++ if (ret) ++ return ret; ++ sctx->cur_stripe++; ++ return 0; ++} + +- ASSERT(map->stripe_len <= U32_MAX); +- sparity->stripe_len = map->stripe_len; +- sparity->nsectors = nsectors; +- sparity->sctx = sctx; +- sparity->scrub_dev = sdev; +- sparity->logic_start = logic_start; +- sparity->logic_end = logic_end; +- refcount_set(&sparity->refs, 1); +- INIT_LIST_HEAD(&sparity->sectors_list); ++static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ++ struct btrfs_device *scrub_dev, ++ struct btrfs_block_group *bg, ++ struct map_lookup *map, ++ u64 full_stripe_start) ++{ ++ DECLARE_COMPLETION_ONSTACK(io_done); ++ struct btrfs_fs_info *fs_info = sctx->fs_info; ++ struct btrfs_raid_bio *rbio; ++ struct btrfs_io_context *bioc = NULL; ++ struct bio *bio; ++ struct scrub_stripe *stripe; ++ bool all_empty = true; ++ const int data_stripes = nr_data_stripes(map); ++ unsigned long extent_bitmap = 0; ++ u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT; ++ int ret; + +- ret = 0; +- for (cur_logical = logic_start; cur_logical < logic_end; +- cur_logical += map->stripe_len) { +- ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map, +- sdev, path, cur_logical); ++ ASSERT(sctx->raid56_data_stripes); ++ ++ for (int i = 0; i < data_stripes; i++) { ++ int stripe_index; ++ int rot; ++ u64 physical; ++ ++ stripe = &sctx->raid56_data_stripes[i]; ++ rot = div_u64(full_stripe_start - bg->start, ++ data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; ++ stripe_index = (i + rot) % map->num_stripes; ++ physical = map->stripes[stripe_index].physical + ++ (rot << BTRFS_STRIPE_LEN_SHIFT); ++ ++ scrub_reset_stripe(stripe); ++ set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); ++ ret = scrub_find_fill_first_stripe(bg, ++ map->stripes[stripe_index].dev, physical, 1, ++ full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT), ++ BTRFS_STRIPE_LEN, stripe); + if (ret < 0) ++ goto out; ++ /* ++ * No extent in this data stripe, need to manually mark them ++ * initialized to make later read submission happy. ++ */ ++ if (ret > 0) { ++ stripe->logical = full_stripe_start + ++ (i << BTRFS_STRIPE_LEN_SHIFT); ++ stripe->dev = map->stripes[stripe_index].dev; ++ stripe->mirror_num = 1; ++ set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); ++ } ++ } ++ ++ /* Check if all data stripes are empty. */ ++ for (int i = 0; i < data_stripes; i++) { ++ stripe = &sctx->raid56_data_stripes[i]; ++ if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) { ++ all_empty = false; + break; ++ } ++ } ++ if (all_empty) { ++ ret = 0; ++ goto out; + } + +- scrub_parity_put(sparity); +- scrub_submit(sctx); +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); ++ for (int i = 0; i < data_stripes; i++) { ++ stripe = &sctx->raid56_data_stripes[i]; ++ scrub_submit_initial_read(sctx, stripe); ++ } ++ for (int i = 0; i < data_stripes; i++) { ++ stripe = &sctx->raid56_data_stripes[i]; + +- btrfs_free_path(path); +- return ret < 0 ? ret : 0; +-} ++ wait_event(stripe->repair_wait, ++ test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); ++ } ++ /* For now, no zoned support for RAID56. */ ++ ASSERT(!btrfs_is_zoned(sctx->fs_info)); + +-static void sync_replace_for_zoned(struct scrub_ctx *sctx) +-{ +- if (!btrfs_is_zoned(sctx->fs_info)) +- return; ++ /* Writeback for the repaired sectors. 
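
The geometry in the per-data-stripe loop above deserves a spelled-out example: the full stripe's offset inside the block group, divided by the number of data stripes and shifted down by BTRFS_STRIPE_LEN_SHIFT, gives the rotation, and (i + rot) % num_stripes then names the device stripe holding data stripe i. A stand-alone model of that mapping, assuming a 4-disk RAID5 layout and 64 KiB stripes (example values only):

    #include <stdio.h>
    #include <stdint.h>

    #define STRIPE_LEN_SHIFT 16 /* assuming BTRFS_STRIPE_LEN == 64 KiB */

    /*
     * For RAID5/6, map data stripe @i of the full stripe starting at
     * @full_stripe_start (inside a chunk starting at @bg_start) to the index
     * of the device stripe that stores it.
     */
    static int data_stripe_to_device(uint64_t full_stripe_start, uint64_t bg_start,
                                     int data_stripes, int num_stripes, int i)
    {
        int rot = (int)(((full_stripe_start - bg_start) / data_stripes) >> STRIPE_LEN_SHIFT);

        return (i + rot) % num_stripes;
    }

    int main(void)
    {
        const int num_stripes = 4;  /* example: 4-disk RAID5, 3 data + 1 parity */
        const int data_stripes = 3;
        const uint64_t bg_start = 0;

        for (int fs = 0; fs < 4; fs++) {
            uint64_t start = bg_start +
                             (uint64_t)fs * data_stripes * (1ULL << STRIPE_LEN_SHIFT);

            printf("full stripe %d:", fs);
            for (int i = 0; i < data_stripes; i++)
                printf(" data%d->disk%d", i,
                       data_stripe_to_device(start, bg_start, data_stripes, num_stripes, i));
            printf("\n");
        }
        return 0;
    }

The parity stripes are not part of this mapping; they are checked and regenerated afterwards from the verified data via the scrub rbio (see the P/Q handling further down).
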
*/ ++ for (int i = 0; i < data_stripes; i++) { ++ unsigned long repaired; + +- sctx->flush_all_writes = true; +- scrub_submit(sctx); +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); ++ stripe = &sctx->raid56_data_stripes[i]; + +- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); +-} ++ bitmap_andnot(&repaired, &stripe->init_error_bitmap, ++ &stripe->error_bitmap, stripe->nr_sectors); ++ scrub_write_sectors(sctx, stripe, repaired, false); ++ } + +-static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, +- u64 physical, u64 physical_end) +-{ +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- int ret = 0; ++ /* Wait for the above writebacks to finish. */ ++ for (int i = 0; i < data_stripes; i++) { ++ stripe = &sctx->raid56_data_stripes[i]; + +- if (!btrfs_is_zoned(fs_info)) +- return 0; ++ wait_scrub_stripe_io(stripe); ++ } + +- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); ++ /* ++ * Now all data stripes are properly verified. Check if we have any ++ * unrepaired, if so abort immediately or we could further corrupt the ++ * P/Q stripes. ++ * ++ * During the loop, also populate extent_bitmap. ++ */ ++ for (int i = 0; i < data_stripes; i++) { ++ unsigned long error; + +- mutex_lock(&sctx->wr_lock); +- if (sctx->write_pointer < physical_end) { +- ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, +- physical, +- sctx->write_pointer); +- if (ret) ++ stripe = &sctx->raid56_data_stripes[i]; ++ ++ /* ++ * We should only check the errors where there is an extent. ++ * As we may hit an empty data stripe while it's missing. ++ */ ++ bitmap_and(&error, &stripe->error_bitmap, ++ &stripe->extent_sector_bitmap, stripe->nr_sectors); ++ if (!bitmap_empty(&error, stripe->nr_sectors)) { + btrfs_err(fs_info, +- "zoned: failed to recover write pointer"); ++"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", ++ full_stripe_start, i, stripe->nr_sectors, ++ &error); ++ ret = -EIO; ++ goto out; ++ } ++ bitmap_or(&extent_bitmap, &extent_bitmap, ++ &stripe->extent_sector_bitmap, stripe->nr_sectors); + } +- mutex_unlock(&sctx->wr_lock); +- btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); + ++ /* Now we can check and regenerate the P/Q stripe. */ ++ bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS); ++ bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; ++ bio->bi_private = &io_done; ++ bio->bi_end_io = raid56_scrub_wait_endio; ++ ++ btrfs_bio_counter_inc_blocked(fs_info); ++ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start, ++ &length, &bioc); ++ if (ret < 0) { ++ btrfs_put_bioc(bioc); ++ btrfs_bio_counter_dec(fs_info); ++ goto out; ++ } ++ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap, ++ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); ++ btrfs_put_bioc(bioc); ++ if (!rbio) { ++ ret = -ENOMEM; ++ btrfs_bio_counter_dec(fs_info); ++ goto out; ++ } ++ raid56_parity_submit_scrub_rbio(rbio); ++ wait_for_completion_io(&io_done); ++ ret = blk_status_to_errno(bio->bi_status); ++ bio_put(bio); ++ btrfs_bio_counter_dec(fs_info); ++ ++out: + return ret; + } + +@@ -3410,8 +1963,6 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, + * and @logical_length parameter. 
+ */ + static int scrub_simple_mirror(struct scrub_ctx *sctx, +- struct btrfs_root *extent_root, +- struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + u64 logical_start, u64 logical_length, +@@ -3421,7 +1972,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, + struct btrfs_fs_info *fs_info = sctx->fs_info; + const u64 logical_end = logical_start + logical_length; + /* An artificial limit, inherit from old scrub behavior */ +- const u32 max_length = SZ_64K; + struct btrfs_path path = { 0 }; + u64 cur_logical = logical_start; + int ret; +@@ -3433,11 +1983,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, + path.skip_locking = 1; + /* Go through each extent items inside the logical range */ + while (cur_logical < logical_end) { +- u64 extent_start; +- u64 extent_len; +- u64 extent_flags; +- u64 extent_gen; +- u64 scrub_len; ++ u64 cur_physical = physical + cur_logical - logical_start; + + /* Canceled? */ + if (atomic_read(&fs_info->scrub_cancel_req) || +@@ -3448,14 +1994,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, + /* Paused? */ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* Push queued extents */ +- sctx->flush_all_writes = true; +- scrub_submit(sctx); +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); +- wait_event(sctx->list_wait, +- atomic_read(&sctx->bios_in_flight) == 0); +- sctx->flush_all_writes = false; + scrub_blocked_if_needed(fs_info); + } + /* Block group removed? */ +@@ -3467,8 +2005,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, + } + spin_unlock(&bg->lock); + +- ret = find_first_extent_item(extent_root, &path, cur_logical, +- logical_end - cur_logical); ++ ret = queue_scrub_stripe(sctx, bg, device, mirror_num, ++ cur_logical, logical_end - cur_logical, ++ cur_physical); + if (ret > 0) { + /* No more extent, just update the accounting */ + sctx->stat.last_physical = physical + logical_length; +@@ -3477,52 +2016,11 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, + } + if (ret < 0) + break; +- get_extent_info(&path, &extent_start, &extent_len, +- &extent_flags, &extent_gen); +- /* Skip hole range which doesn't have any extent */ +- cur_logical = max(extent_start, cur_logical); + +- /* +- * Scrub len has three limits: +- * - Extent size limit +- * - Scrub range limit +- * This is especially imporatant for RAID0/RAID10 to reuse +- * this function +- * - Max scrub size limit +- */ +- scrub_len = min(min(extent_start + extent_len, +- logical_end), cur_logical + max_length) - +- cur_logical; +- +- if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { +- ret = btrfs_lookup_csums_list(csum_root, cur_logical, +- cur_logical + scrub_len - 1, +- &sctx->csum_list, 1, false); +- if (ret) +- break; +- } +- if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && +- does_range_cross_boundary(extent_start, extent_len, +- logical_start, logical_length)) { +- btrfs_err(fs_info, +-"scrub: tree block %llu spanning boundaries, ignored. 
boundary=[%llu, %llu)", +- extent_start, logical_start, logical_end); +- spin_lock(&sctx->stat_lock); +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- cur_logical += scrub_len; +- continue; +- } +- ret = scrub_extent(sctx, map, cur_logical, scrub_len, +- cur_logical - logical_start + physical, +- device, extent_flags, extent_gen, +- mirror_num); +- scrub_free_csums(sctx); +- if (ret) +- break; +- if (sctx->is_dev_replace) +- sync_replace_for_zoned(sctx); +- cur_logical += scrub_len; ++ ASSERT(sctx->cur_stripe > 0); ++ cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical ++ + BTRFS_STRIPE_LEN; ++ + /* Don't hold CPU for too long time */ + cond_resched(); + } +@@ -3536,7 +2034,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + +- return map->num_stripes / map->sub_stripes * map->stripe_len; ++ return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; + } + + /* Get the logical bytenr for the stripe */ +@@ -3552,7 +2050,8 @@ static u64 simple_stripe_get_logical(struct map_lookup *map, + * (stripe_index / sub_stripes) gives how many data stripes we need to + * skip. + */ +- return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start; ++ return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) + ++ bg->start; + } + + /* Get the mirror number for the stripe */ +@@ -3567,8 +2066,6 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) + } + + static int scrub_simple_stripe(struct scrub_ctx *sctx, +- struct btrfs_root *extent_root, +- struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + struct btrfs_device *device, +@@ -3588,15 +2085,15 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, + * just RAID1, so we can reuse scrub_simple_mirror() to scrub + * this stripe. + */ +- ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map, +- cur_logical, map->stripe_len, device, +- cur_physical, mirror_num); ++ ret = scrub_simple_mirror(sctx, bg, map, cur_logical, ++ BTRFS_STRIPE_LEN, device, cur_physical, ++ mirror_num); + if (ret) + return ret; + /* Skip to next stripe which belongs to the target device */ + cur_logical += logical_increment; + /* For physical offset, we just go to next stripe */ +- cur_physical += map->stripe_len; ++ cur_physical += BTRFS_STRIPE_LEN; + } + return ret; + } +@@ -3607,15 +2104,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, + int stripe_index) + { +- struct btrfs_path *path; + struct btrfs_fs_info *fs_info = sctx->fs_info; +- struct btrfs_root *root; +- struct btrfs_root *csum_root; +- struct blk_plug plug; + struct map_lookup *map = em->map_lookup; + const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + const u64 chunk_logical = bg->start; + int ret; ++ int ret2; + u64 physical = map->stripes[stripe_index].physical; + const u64 dev_stripe_len = btrfs_calc_stripe_length(em); + const u64 physical_end = physical + dev_stripe_len; +@@ -3626,43 +2120,37 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + /* Offset inside the chunk */ + u64 offset; + u64 stripe_logical; +- u64 stripe_end; + int stop_loop = 0; + +- path = btrfs_alloc_path(); +- if (!path) +- return -ENOMEM; +- +- /* +- * work on commit root. The related disk blocks are static as +- * long as COW is applied. 
This means, it is save to rewrite +- * them to repair disk errors without any race conditions +- */ +- path->search_commit_root = 1; +- path->skip_locking = 1; +- path->reada = READA_FORWARD; +- +- wait_event(sctx->list_wait, +- atomic_read(&sctx->bios_in_flight) == 0); + scrub_blocked_if_needed(fs_info); + +- root = btrfs_extent_root(fs_info, bg->start); +- csum_root = btrfs_csum_root(fs_info, bg->start); +- +- /* +- * collect all data csums for the stripe to avoid seeking during +- * the scrub. This might currently (crc32) end up to be about 1MB +- */ +- blk_start_plug(&plug); +- + if (sctx->is_dev_replace && + btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { + mutex_lock(&sctx->wr_lock); + sctx->write_pointer = physical; + mutex_unlock(&sctx->wr_lock); +- sctx->flush_all_writes = true; + } + ++ /* Prepare the extra data stripes used by RAID56. */ ++ if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) { ++ ASSERT(sctx->raid56_data_stripes == NULL); ++ ++ sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map), ++ sizeof(struct scrub_stripe), ++ GFP_KERNEL); ++ if (!sctx->raid56_data_stripes) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ for (int i = 0; i < nr_data_stripes(map); i++) { ++ ret = init_scrub_stripe(fs_info, ++ &sctx->raid56_data_stripes[i]); ++ if (ret < 0) ++ goto out; ++ sctx->raid56_data_stripes[i].bg = bg; ++ sctx->raid56_data_stripes[i].sctx = sctx; ++ } ++ } + /* + * There used to be a big double loop to handle all profiles using the + * same routine, which grows larger and more gross over time. +@@ -3680,17 +2168,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + * Only @physical and @mirror_num needs to calculated using + * @stripe_index. + */ +- ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, +- bg->start, bg->length, scrub_dev, +- map->stripes[stripe_index].physical, ++ ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, ++ scrub_dev, map->stripes[stripe_index].physical, + stripe_index + 1); + offset = 0; + goto out; + } + if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { +- ret = scrub_simple_stripe(sctx, root, csum_root, bg, map, +- scrub_dev, stripe_index); +- offset = map->stripe_len * (stripe_index / map->sub_stripes); ++ ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); ++ offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; + goto out; + } + +@@ -3705,7 +2191,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + + /* Initialize @offset in case we need to go to out: label */ + get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); +- increment = map->stripe_len * nr_data_stripes(map); ++ increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + + /* + * Due to the rotation, for RAID56 it's better to iterate each stripe +@@ -3718,10 +2204,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + if (ret) { + /* it is parity strip */ + stripe_logical += chunk_logical; +- stripe_end = stripe_logical + increment; +- ret = scrub_raid56_parity(sctx, map, scrub_dev, +- stripe_logical, +- stripe_end); ++ ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, ++ map, stripe_logical); + if (ret) + goto out; + goto next; +@@ -3735,14 +2219,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + * We can reuse scrub_simple_mirror() here, as the repair part + * is still based on @mirror_num. 
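
To illustrate the iteration step in the scrub_stripe() loop being reworked here (ignoring the parity rotation that get_raid56_logic_offset() handles): every 64KiB advance on one member device corresponds to a full data-width advance in logical space. A minimal sketch with assumed constants; not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch only; not part of the patch. Assumes 64KiB stripes. */
#define STRIPE_LEN (64 * 1024ULL)

int main(void)
{
        const int data_stripes = 4;                 /* e.g. a 6-device RAID6 */
        const uint64_t increment = data_stripes * STRIPE_LEN;
        uint64_t logical = 0, physical = 0;

        for (int step = 0; step < 3; step++) {
                printf("device offset %7llu -> logical offset %8llu\n",
                       (unsigned long long)physical,
                       (unsigned long long)logical);
                physical += STRIPE_LEN;             /* next stripe on this device */
                logical += increment;               /* skip the other data stripes */
        }
        return 0;
}
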
+ */ +- ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, +- logical, map->stripe_len, ++ ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, + scrub_dev, physical, 1); + if (ret < 0) + goto out; + next: + logical += increment; +- physical += map->stripe_len; ++ physical += BTRFS_STRIPE_LEN; + spin_lock(&sctx->stat_lock); + if (stop_loop) + sctx->stat.last_physical = +@@ -3754,14 +2237,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + break; + } + out: +- /* push queued extents */ +- scrub_submit(sctx); +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); +- +- blk_finish_plug(&plug); +- btrfs_free_path(path); ++ ret2 = flush_scrub_stripes(sctx); ++ if (!ret2) ++ ret = ret2; ++ if (sctx->raid56_data_stripes) { ++ for (int i = 0; i < nr_data_stripes(map); i++) ++ release_scrub_stripe(&sctx->raid56_data_stripes[i]); ++ kfree(sctx->raid56_data_stripes); ++ sctx->raid56_data_stripes = NULL; ++ } + + if (sctx->is_dev_replace && ret >= 0) { + int ret2; +@@ -4079,39 +2563,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, + + ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, + dev_extent_len); +- +- /* +- * flush, submit all pending read and write bios, afterwards +- * wait for them. +- * Note that in the dev replace case, a read request causes +- * write requests that are submitted in the read completion +- * worker. Therefore in the current situation, it is required +- * that all write requests are flushed, so that all read and +- * write requests are really completed when bios_in_flight +- * changes to 0. +- */ +- sctx->flush_all_writes = true; +- scrub_submit(sctx); +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); +- +- wait_event(sctx->list_wait, +- atomic_read(&sctx->bios_in_flight) == 0); +- +- scrub_pause_on(fs_info); +- +- /* +- * must be called before we decrease @scrub_paused. +- * make sure we don't block transaction commit while +- * we are waiting pending workers finished. 
+- */ +- wait_event(sctx->list_wait, +- atomic_read(&sctx->workers_pending) == 0); +- sctx->flush_all_writes = false; +- +- scrub_pause_off(fs_info); +- + if (sctx->is_dev_replace && + !btrfs_finish_block_group_to_copy(dev_replace->srcdev, + cache, found_key.offset)) +@@ -4168,18 +2619,62 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, + return ret; + } + ++static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, ++ struct page *page, u64 physical, u64 generation) ++{ ++ struct btrfs_fs_info *fs_info = sctx->fs_info; ++ struct bio_vec bvec; ++ struct bio bio; ++ struct btrfs_super_block *sb = page_address(page); ++ int ret; ++ ++ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ); ++ bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT; ++ __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0); ++ ret = submit_bio_wait(&bio); ++ bio_uninit(&bio); ++ ++ if (ret < 0) ++ return ret; ++ ret = btrfs_check_super_csum(fs_info, sb); ++ if (ret != 0) { ++ btrfs_err_rl(fs_info, ++ "super block at physical %llu devid %llu has bad csum", ++ physical, dev->devid); ++ return -EIO; ++ } ++ if (btrfs_super_generation(sb) != generation) { ++ btrfs_err_rl(fs_info, ++"super block at physical %llu devid %llu has bad generation %llu expect %llu", ++ physical, dev->devid, ++ btrfs_super_generation(sb), generation); ++ return -EUCLEAN; ++ } ++ ++ return btrfs_validate_super(fs_info, sb, -1); ++} ++ + static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev) + { + int i; + u64 bytenr; + u64 gen; +- int ret; ++ int ret = 0; ++ struct page *page; + struct btrfs_fs_info *fs_info = sctx->fs_info; + + if (BTRFS_FS_ERROR(fs_info)) + return -EROFS; + ++ page = alloc_page(GFP_KERNEL); ++ if (!page) { ++ spin_lock(&sctx->stat_lock); ++ sctx->stat.malloc_errors++; ++ spin_unlock(&sctx->stat_lock); ++ return -ENOMEM; ++ } ++ + /* Seed devices of a new filesystem has their own generation. 
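
For context on scrub_supers() and scrub_one_super() above: each of the (up to) three superblock copies is now read synchronously and rejected if its checksum or generation does not match the expected value. The copies sit at well-known offsets; the sketch below is not part of the patch, uses an invented helper name, and prints what btrfs_sb_offset() is commonly understood to return:

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch only; not part of the patch. */
static uint64_t sb_copy_offset(int mirror)
{
        /* Copy 0 at 64KiB, then 64MiB and 256GiB. */
        if (mirror == 0)
                return 64ULL * 1024;
        return 16384ULL << (12 * mirror);
}

int main(void)
{
        for (int i = 0; i < 3; i++)
                printf("superblock copy %d at byte %llu\n",
                       i, (unsigned long long)sb_copy_offset(i));
        return 0;
}

A stale copy, for example on a device that disappeared and came back, then shows up in stat.super_errors instead of being silently accepted.
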
*/ + if (scrub_dev->fs_devices != fs_info->fs_devices) + gen = scrub_dev->generation; +@@ -4194,14 +2689,14 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, + if (!btrfs_check_super_location(scrub_dev, bytenr)) + continue; + +- ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, +- scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, +- NULL, bytenr); +- if (ret) +- return ret; ++ ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen); ++ if (ret) { ++ spin_lock(&sctx->stat_lock); ++ sctx->stat.super_errors++; ++ spin_unlock(&sctx->stat_lock); ++ } + } +- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); +- ++ __free_page(page); + return 0; + } + +@@ -4212,20 +2707,15 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) + struct workqueue_struct *scrub_workers = fs_info->scrub_workers; + struct workqueue_struct *scrub_wr_comp = + fs_info->scrub_wr_completion_workers; +- struct workqueue_struct *scrub_parity = +- fs_info->scrub_parity_workers; + + fs_info->scrub_workers = NULL; + fs_info->scrub_wr_completion_workers = NULL; +- fs_info->scrub_parity_workers = NULL; + mutex_unlock(&fs_info->scrub_lock); + + if (scrub_workers) + destroy_workqueue(scrub_workers); + if (scrub_wr_comp) + destroy_workqueue(scrub_wr_comp); +- if (scrub_parity) +- destroy_workqueue(scrub_parity); + } + } + +@@ -4237,7 +2727,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, + { + struct workqueue_struct *scrub_workers = NULL; + struct workqueue_struct *scrub_wr_comp = NULL; +- struct workqueue_struct *scrub_parity = NULL; + unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; + int max_active = fs_info->thread_pool_size; + int ret = -ENOMEM; +@@ -4254,18 +2743,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, + if (!scrub_wr_comp) + goto fail_scrub_wr_completion_workers; + +- scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active); +- if (!scrub_parity) +- goto fail_scrub_parity_workers; +- + mutex_lock(&fs_info->scrub_lock); + if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { + ASSERT(fs_info->scrub_workers == NULL && +- fs_info->scrub_wr_completion_workers == NULL && +- fs_info->scrub_parity_workers == NULL); ++ fs_info->scrub_wr_completion_workers == NULL); + fs_info->scrub_workers = scrub_workers; + fs_info->scrub_wr_completion_workers = scrub_wr_comp; +- fs_info->scrub_parity_workers = scrub_parity; + refcount_set(&fs_info->scrub_workers_refcnt, 1); + mutex_unlock(&fs_info->scrub_lock); + return 0; +@@ -4275,8 +2758,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, + mutex_unlock(&fs_info->scrub_lock); + + ret = 0; +- destroy_workqueue(scrub_parity); +-fail_scrub_parity_workers: ++ + destroy_workqueue(scrub_wr_comp); + fail_scrub_wr_completion_workers: + destroy_workqueue(scrub_workers); +@@ -4411,12 +2893,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + ret = scrub_enumerate_chunks(sctx, dev, start, end); + memalloc_nofs_restore(nofs_flag); + +- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + atomic_dec(&fs_info->scrubs_running); + wake_up(&fs_info->scrub_pause_wait); + +- wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); +- + if (progress) + memcpy(progress, &sctx->stat, sizeof(*progress)); + +@@ -4541,28 +3020,3 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, + + return dev ? (sctx ? 
0 : -ENOTCONN) : -ENODEV; + } +- +-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, +- u64 extent_logical, u32 extent_len, +- u64 *extent_physical, +- struct btrfs_device **extent_dev, +- int *extent_mirror_num) +-{ +- u64 mapped_length; +- struct btrfs_io_context *bioc = NULL; +- int ret; +- +- mapped_length = extent_len; +- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical, +- &mapped_length, &bioc, 0); +- if (ret || !bioc || mapped_length < extent_len || +- !bioc->stripes[0].dev->bdev) { +- btrfs_put_bioc(bioc); +- return; +- } +- +- *extent_physical = bioc->stripes[0].physical; +- *extent_mirror_num = bioc->mirror_num; +- *extent_dev = bioc->stripes[0].dev; +- btrfs_put_bioc(bioc); +-} +diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c +index e5c963bb873d..af2e153543a5 100644 +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -1875,7 +1875,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, + int left_ret; + int right_ret; + u64 left_gen; +- u64 right_gen; ++ u64 right_gen = 0; + struct btrfs_inode_info info; + + ret = get_inode_info(sctx->send_root, ino, &info); +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index 3eecce86f63f..75e7fa337e66 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -537,7 +537,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + up_read(&info->groups_sem); + } + +-static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, ++static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, + u64 to_reclaim) + { + u64 bytes; +@@ -550,6 +550,18 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, + return nr; + } + ++static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info, ++ u64 to_reclaim) ++{ ++ const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1); ++ u64 nr; ++ ++ nr = div64_u64(to_reclaim, bytes); ++ if (!nr) ++ nr = 1; ++ return nr; ++} ++ + #define EXTENT_SIZE_PER_ITEM SZ_256K + + /* +@@ -727,7 +739,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, + break; + } + if (state == FLUSH_DELAYED_REFS_NR) +- nr = calc_reclaim_items_nr(fs_info, num_bytes); ++ nr = calc_delayed_refs_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); +@@ -1599,11 +1611,22 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, + struct reserve_ticket ticket; + u64 start_ns = 0; + u64 used; +- int ret = 0; ++ int ret = -ENOSPC; + bool pending_tickets; + + ASSERT(orig_bytes); +- ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); ++ /* ++ * If have a transaction handle (current->journal_info != NULL), then ++ * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor ++ * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those ++ * flushing methods can trigger transaction commits. ++ */ ++ if (current->journal_info) { ++ /* One assert per line for easier debugging. 
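
The three asserts that follow encode a simple rule: a task that already holds a transaction handle must not pick a flush mode that can itself commit a transaction, or it can deadlock against its own commit. A tiny sketch of that rule, not part of the patch and with invented enum values:

/* Illustrative sketch only; not part of the patch. Enum values invented. */
enum flush_mode { FLUSH_NONE, FLUSH_LIMIT, FLUSH_ALL, FLUSH_ALL_STEAL, FLUSH_EVICT };

static int flush_mode_may_commit(enum flush_mode mode)
{
        return mode == FLUSH_ALL || mode == FLUSH_ALL_STEAL || mode == FLUSH_EVICT;
}

/* Holding a transaction handle rules out any mode that may commit. */
static int flush_mode_allowed(int holds_trans_handle, enum flush_mode mode)
{
        return !(holds_trans_handle && flush_mode_may_commit(mode));
}
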
*/ ++ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL); ++ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL); ++ ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT); ++ } + + if (flush == BTRFS_RESERVE_FLUSH_DATA) + async_work = &fs_info->async_data_reclaim_work; +@@ -1611,7 +1634,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, + async_work = &fs_info->async_reclaim_work; + + spin_lock(&space_info->lock); +- ret = -ENOSPC; + used = btrfs_space_info_used(space_info, true); + + /* +diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h +index 2033b71b18ce..0bb9d14e60a8 100644 +--- a/fs/btrfs/space-info.h ++++ b/fs/btrfs/space-info.h +@@ -27,6 +27,7 @@ enum btrfs_reserve_flush_enum { + * - Running delayed refs + * - Running delalloc and waiting for ordered extents + * - Allocating a new chunk ++ * - Committing transaction + */ + BTRFS_RESERVE_FLUSH_EVICT, + +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index 366fb4cde145..6cb97efee976 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -1158,6 +1158,7 @@ static int btrfs_fill_super(struct super_block *sb, + inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); ++ btrfs_handle_fs_error(fs_info, err, NULL); + goto fail_close; + } + +@@ -2412,7 +2413,7 @@ static int __init btrfs_print_mod_info(void) + ", fsverity=no" + #endif + ; +- pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); ++ pr_info("Btrfs loaded%s\n", options); + return 0; + } + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index 37fc58a7f27e..25294e624851 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1262,8 +1262,13 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, + if (ret) + return ret; + ++#ifdef CONFIG_BTRFS_DEBUG ++ if (thresh != 0 && (thresh > 100)) ++ return -EINVAL; ++#else + if (thresh != 0 && (thresh <= 50 || thresh > 100)) + return -EINVAL; ++#endif + + WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh); + +diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c +index f2f2e11dac4c..ed0f36ae5346 100644 +--- a/fs/btrfs/tests/extent-map-tests.c ++++ b/fs/btrfs/tests/extent-map-tests.c +@@ -486,7 +486,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, + em->map_lookup = map; + + map->num_stripes = test->num_stripes; +- map->stripe_len = BTRFS_STRIPE_LEN; + map->type = test->raid_type; + + for (i = 0; i < map->num_stripes; i++) { +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index b8d5b1fa9a03..8b6a99b8d7f6 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -601,15 +601,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, + /* + * We want to reserve all the bytes we may need all at once, so + * we only do 1 enospc flushing cycle per transaction start. We +- * accomplish this by simply assuming we'll do 2 x num_items +- * worth of delayed refs updates in this trans handle, and +- * refill that amount for whatever is missing in the reserve. ++ * accomplish this by simply assuming we'll do num_items worth ++ * of delayed refs updates in this trans handle, and refill that ++ * amount for whatever is missing in the reserve. 
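
Relatedly, calc_delayed_refs_nr() added in the space-info.c hunk above follows the same convention as calc_reclaim_items_nr(): convert the byte target into an item count, but never return zero, so the flusher always makes forward progress. The rounding rule in isolation; not part of the patch, and the per-item cost is a placeholder:

#include <stdint.h>

/* Illustrative sketch only; not part of the patch. */
static uint64_t items_to_flush(uint64_t to_reclaim, uint64_t bytes_per_item)
{
        uint64_t nr = to_reclaim / bytes_per_item;

        return nr ? nr : 1;     /* always flush at least one item */
}

Even a target smaller than a single item's reservation therefore still flushes one item.
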
+ */ + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); + if (flush == BTRFS_RESERVE_FLUSH_ALL && +- btrfs_block_rsv_full(delayed_refs_rsv) == 0) { +- delayed_refs_bytes = num_bytes; +- num_bytes <<= 1; ++ !btrfs_block_rsv_full(delayed_refs_rsv)) { ++ delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, ++ num_items); ++ num_bytes += delayed_refs_bytes; + } + + /* +@@ -942,16 +943,6 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info) + wait_current_trans(fs_info); + } + +-static bool should_end_transaction(struct btrfs_trans_handle *trans) +-{ +- struct btrfs_fs_info *fs_info = trans->fs_info; +- +- if (btrfs_check_space_for_delayed_refs(fs_info)) +- return true; +- +- return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50); +-} +- + bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) + { + struct btrfs_transaction *cur_trans = trans->transaction; +@@ -960,7 +951,10 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) + test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) + return true; + +- return should_end_transaction(trans); ++ if (btrfs_check_space_for_delayed_refs(trans->fs_info)) ++ return true; ++ ++ return !!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50); + } + + static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) +diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c +index baad1ed7e111..e2b54793bf0c 100644 +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -849,6 +849,20 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, + stripe_len); + return -EUCLEAN; + } ++ /* ++ * We artificially limit the chunk size, so that the number of stripes ++ * inside a chunk can be fit into a U32. The current limit (256G) is ++ * way too large for real world usage anyway, and it's also much larger ++ * than our existing limit (10G). ++ * ++ * Thus it should be a good way to catch obvious bitflips. 
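
To make the new tree-checker bound above concrete: with BTRFS_STRIPE_LEN_SHIFT assumed to be 16 (64KiB stripes), the check rejects any chunk whose length would require U32_MAX or more stripes. A quick computation of the cutoff; not part of the patch:

#include <stdint.h>
#include <stdio.h>

/* Illustrative sketch only; not part of the patch. */
int main(void)
{
        const unsigned int stripe_len_shift = 16;   /* assumed 64KiB stripes */
        uint64_t limit = (uint64_t)UINT32_MAX << stripe_len_shift;

        printf("chunk length cutoff: %llu bytes (%llu stripes of 64KiB)\n",
               (unsigned long long)limit,
               (unsigned long long)(limit >> stripe_len_shift));
        return 0;
}

Anything at or above that value is treated as corruption and rejected with -EUCLEAN.
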
++ */ ++ if (unlikely(length >= ((u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT))) { ++ chunk_err(leaf, chunk, logical, ++ "chunk length too large: have %llu limit %llu", ++ length, (u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT); ++ return -EUCLEAN; ++ } + if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK))) { + chunk_err(leaf, chunk, logical, +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 200cea6e49e5..9b212e8c70cc 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -2563,6 +2563,28 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) + btrfs_put_block_group(cache); + } + ++static int clean_log_buffer(struct btrfs_trans_handle *trans, ++ struct extent_buffer *eb) ++{ ++ int ret; ++ ++ btrfs_tree_lock(eb); ++ btrfs_clear_buffer_dirty(trans, eb); ++ wait_on_extent_buffer_writeback(eb); ++ btrfs_tree_unlock(eb); ++ ++ if (trans) { ++ ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len); ++ if (ret) ++ return ret; ++ btrfs_redirty_list_add(trans->transaction, eb); ++ } else { ++ unaccount_log_buffer(eb->fs_info, eb->start); ++ } ++ ++ return 0; ++} ++ + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, +@@ -2573,7 +2595,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; +- u32 blocksize; + int ret = 0; + + while (*level > 0) { +@@ -2593,7 +2614,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + check.level = *level - 1; + check.has_first_key = true; + btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); +- blocksize = fs_info->nodesize; + + next = btrfs_find_create_tree_block(fs_info, bytenr, + btrfs_header_owner(cur), +@@ -2617,22 +2637,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + return ret; + } + +- btrfs_tree_lock(next); +- btrfs_clear_buffer_dirty(trans, next); +- wait_on_extent_buffer_writeback(next); +- btrfs_tree_unlock(next); +- +- if (trans) { +- ret = btrfs_pin_reserved_extent(trans, +- bytenr, blocksize); +- if (ret) { +- free_extent_buffer(next); +- return ret; +- } +- btrfs_redirty_list_add( +- trans->transaction, next); +- } else { +- unaccount_log_buffer(fs_info, bytenr); ++ ret = clean_log_buffer(trans, next); ++ if (ret) { ++ free_extent_buffer(next); ++ return ret; + } + } + free_extent_buffer(next); +@@ -2662,7 +2670,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int *level, + struct walk_control *wc) + { +- struct btrfs_fs_info *fs_info = root->fs_info; + int i; + int slot; + int ret; +@@ -2682,27 +2689,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + return ret; + + if (wc->free) { +- struct extent_buffer *next; +- +- next = path->nodes[*level]; +- +- btrfs_tree_lock(next); +- btrfs_clear_buffer_dirty(trans, next); +- wait_on_extent_buffer_writeback(next); +- btrfs_tree_unlock(next); +- +- if (trans) { +- ret = btrfs_pin_reserved_extent(trans, +- path->nodes[*level]->start, +- path->nodes[*level]->len); +- if (ret) +- return ret; +- btrfs_redirty_list_add(trans->transaction, +- next); +- } else { +- unaccount_log_buffer(fs_info, +- path->nodes[*level]->start); +- } ++ ret = clean_log_buffer(trans, path->nodes[*level]); ++ if (ret) ++ return ret; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; +@@ -2720,7 +2709,6 @@ 
static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + static int walk_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct walk_control *wc) + { +- struct btrfs_fs_info *fs_info = log->fs_info; + int ret = 0; + int wret; + int level; +@@ -2762,26 +2750,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, + orig_level); + if (ret) + goto out; +- if (wc->free) { +- struct extent_buffer *next; +- +- next = path->nodes[orig_level]; +- +- btrfs_tree_lock(next); +- btrfs_clear_buffer_dirty(trans, next); +- wait_on_extent_buffer_writeback(next); +- btrfs_tree_unlock(next); +- +- if (trans) { +- ret = btrfs_pin_reserved_extent(trans, +- next->start, next->len); +- if (ret) +- goto out; +- btrfs_redirty_list_add(trans->transaction, next); +- } else { +- unaccount_log_buffer(fs_info, next->start); +- } +- } ++ if (wc->free) ++ ret = clean_log_buffer(trans, path->nodes[orig_level]); + } + + out: +@@ -3648,6 +3618,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + ret = BTRFS_LOG_FORCE_COMMIT; + else + inode->last_dir_index_offset = last_index; ++ ++ if (btrfs_get_first_dir_index_to_log(inode) == 0) ++ btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); + out: + kfree(ins_data); + +@@ -4099,7 +4072,7 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, + + found_key.offset = 0; + found_key.type = 0; +- ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); ++ ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot); + if (ret < 0) + break; + +@@ -5406,6 +5379,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + u64 ino = btrfs_ino(start_inode); ++ struct btrfs_inode *curr_inode = start_inode; + int ret = 0; + + /* +@@ -5420,43 +5394,39 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + if (!path) + return -ENOMEM; + ++ /* Pairs with btrfs_add_delayed_iput below. 
*/ ++ ihold(&curr_inode->vfs_inode); ++ + while (true) { +- struct extent_buffer *leaf; +- struct btrfs_key min_key; ++ struct inode *vfs_inode; ++ struct btrfs_key key; ++ struct btrfs_key found_key; ++ u64 next_index; + bool continue_curr_inode = true; +- int nritems; +- int i; ++ int iter_ret; + +- min_key.objectid = ino; +- min_key.type = BTRFS_DIR_INDEX_KEY; +- min_key.offset = 0; ++ key.objectid = ino; ++ key.type = BTRFS_DIR_INDEX_KEY; ++ key.offset = btrfs_get_first_dir_index_to_log(curr_inode); ++ next_index = key.offset; + again: +- btrfs_release_path(path); +- ret = btrfs_search_forward(root, &min_key, path, trans->transid); +- if (ret < 0) { +- break; +- } else if (ret > 0) { +- ret = 0; +- goto next; +- } +- +- leaf = path->nodes[0]; +- nritems = btrfs_header_nritems(leaf); +- for (i = path->slots[0]; i < nritems; i++) { ++ btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) { ++ struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + int log_mode = LOG_INODE_EXISTS; + int type; + +- btrfs_item_key_to_cpu(leaf, &min_key, i); +- if (min_key.objectid != ino || +- min_key.type != BTRFS_DIR_INDEX_KEY) { ++ if (found_key.objectid != ino || ++ found_key.type != BTRFS_DIR_INDEX_KEY) { + continue_curr_inode = false; + break; + } + +- di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); ++ next_index = found_key.offset + 1; ++ ++ di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + type = btrfs_dir_ftype(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid) + continue; +@@ -5496,12 +5466,24 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + break; + } + +- if (continue_curr_inode && min_key.offset < (u64)-1) { +- min_key.offset++; ++ btrfs_release_path(path); ++ ++ if (iter_ret < 0) { ++ ret = iter_ret; ++ goto out; ++ } else if (iter_ret > 0) { ++ continue_curr_inode = false; ++ } else { ++ key = found_key; ++ } ++ ++ if (continue_curr_inode && key.offset < (u64)-1) { ++ key.offset++; + goto again; + } + +-next: ++ btrfs_set_first_dir_index_to_log(curr_inode, next_index); ++ + if (list_empty(&dir_list)) + break; + +@@ -5509,9 +5491,22 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + ino = dir_elem->ino; + list_del(&dir_elem->list); + kfree(dir_elem); ++ ++ btrfs_add_delayed_iput(curr_inode); ++ curr_inode = NULL; ++ ++ vfs_inode = btrfs_iget(fs_info->sb, ino, root); ++ if (IS_ERR(vfs_inode)) { ++ ret = PTR_ERR(vfs_inode); ++ break; ++ } ++ curr_inode = BTRFS_I(vfs_inode); + } + out: + btrfs_free_path(path); ++ if (curr_inode) ++ btrfs_add_delayed_iput(curr_inode); ++ + if (ret) { + struct btrfs_dir_list *next; + +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index c6d592870400..03f52e4a20aa 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -395,7 +395,6 @@ void btrfs_free_device(struct btrfs_device *device) + { + WARN_ON(!list_empty(&device->post_commit_list)); + rcu_string_free(device->name); +- extent_io_tree_release(&device->alloc_state); + btrfs_destroy_dev_zone_info(device); + kfree(device); + } +@@ -1150,10 +1149,10 @@ static void btrfs_close_one_device(struct btrfs_device *device) + device->last_flush_error = 0; + + /* Verify the device is back in a pristine state */ +- ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); +- ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); +- ASSERT(list_empty(&device->dev_alloc_list)); +- ASSERT(list_empty(&device->post_commit_list)); 
++ WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); ++ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); ++ WARN_ON(!list_empty(&device->dev_alloc_list)); ++ WARN_ON(!list_empty(&device->post_commit_list)); + } + + static void close_fs_devices(struct btrfs_fs_devices *fs_devices) +@@ -2618,7 +2617,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path + struct block_device *bdev; + struct super_block *sb = fs_info->sb; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; +- struct btrfs_fs_devices *seed_devices; ++ struct btrfs_fs_devices *seed_devices = NULL; + u64 orig_super_total_bytes; + u64 orig_super_num_devices; + int ret = 0; +@@ -5125,7 +5124,7 @@ static void init_alloc_chunk_ctl_policy_regular( + /* We don't want a chunk larger than 10% of writable space */ + ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), + ctl->max_chunk_size); +- ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; ++ ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT; + } + + static void init_alloc_chunk_ctl_policy_zoned( +@@ -5407,7 +5406,6 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, + j * ctl->stripe_size; + } + } +- map->stripe_len = BTRFS_STRIPE_LEN; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; + map->type = type; +@@ -5438,7 +5436,7 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, + } + write_unlock(&em_tree->lock); + +- block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); ++ block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); + if (IS_ERR(block_group)) + goto error_del_extent; + +@@ -5615,11 +5613,11 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, + + btrfs_set_stack_chunk_length(chunk, bg->length); + btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); +- btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); ++ btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN); + btrfs_set_stack_chunk_type(chunk, map->type); + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); +- btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); +- btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); ++ btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN); ++ btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN); + btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); + +@@ -5784,13 +5782,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) + */ + ret = map->num_stripes; + free_extent_map(em); +- +- down_read(&fs_info->dev_replace.rwsem); +- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && +- fs_info->dev_replace.tgtdev) +- ret++; +- up_read(&fs_info->dev_replace.rwsem); +- + return ret; + } + +@@ -5809,7 +5800,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, + if (!WARN_ON(IS_ERR(em))) { + map = em->map_lookup; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) +- len = map->stripe_len * nr_data_stripes(map); ++ len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + free_extent_map(em); + } + return len; +@@ -5895,41 +5886,16 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, + return preferred_mirror; + } + +-/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ +-static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) +-{ 
+- int i; +- int again = 1; +- +- while (again) { +- again = 0; +- for (i = 0; i < num_stripes - 1; i++) { +- /* Swap if parity is on a smaller index */ +- if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { +- swap(bioc->stripes[i], bioc->stripes[i + 1]); +- swap(bioc->raid_map[i], bioc->raid_map[i + 1]); +- again = 1; +- } +- } +- } +-} +- + static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, +- int total_stripes, +- int real_stripes) ++ u16 total_stripes) + { +- struct btrfs_io_context *bioc = kzalloc( ++ struct btrfs_io_context *bioc; ++ ++ bioc = kzalloc( + /* The size of btrfs_io_context */ + sizeof(struct btrfs_io_context) + + /* Plus the variable array for the stripes */ +- sizeof(struct btrfs_io_stripe) * (total_stripes) + +- /* Plus the variable array for the tgt dev */ +- sizeof(int) * (real_stripes) + +- /* +- * Plus the raid_map, which includes both the tgt dev +- * and the stripes. +- */ +- sizeof(u64) * (total_stripes), ++ sizeof(struct btrfs_io_stripe) * (total_stripes), + GFP_NOFS); + + if (!bioc) +@@ -5938,8 +5904,8 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ + refcount_set(&bioc->refs, 1); + + bioc->fs_info = fs_info; +- bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); +- bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); ++ bioc->replace_stripe_src = -1; ++ bioc->full_stripe_logical = (u64)-1; + + return bioc; + } +@@ -5971,16 +5937,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + struct btrfs_discard_stripe *stripes; + u64 length = *length_ret; + u64 offset; +- u64 stripe_nr; +- u64 stripe_nr_end; ++ u32 stripe_nr; ++ u32 stripe_nr_end; ++ u32 stripe_cnt; + u64 stripe_end_offset; +- u64 stripe_cnt; +- u64 stripe_len; + u64 stripe_offset; + u32 stripe_index; + u32 factor = 0; + u32 sub_stripes = 0; +- u64 stripes_per_dev = 0; ++ u32 stripes_per_dev = 0; + u32 remaining_stripes = 0; + u32 last_stripe = 0; + int ret; +@@ -5996,26 +5961,25 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EOPNOTSUPP; + goto out_free_map; +-} ++ } + + offset = logical - em->start; + length = min_t(u64, em->start + em->len - logical, length); + *length_ret = length; + +- stripe_len = map->stripe_len; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ +- stripe_nr = div64_u64(offset, stripe_len); ++ stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; + + /* stripe_offset is the offset of this block in its stripe */ +- stripe_offset = offset - stripe_nr * stripe_len; ++ stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + +- stripe_nr_end = round_up(offset + length, map->stripe_len); +- stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); ++ stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> ++ BTRFS_STRIPE_LEN_SHIFT; + stripe_cnt = stripe_nr_end - stripe_nr; +- stripe_end_offset = stripe_nr_end * map->stripe_len - ++ stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) - + (offset + length); + /* + * after this, stripe_nr is the number of stripes on this +@@ -6034,18 +5998,19 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + factor = map->num_stripes / sub_stripes; + *num_stripes = min_t(u64, map->num_stripes, + sub_stripes * stripe_cnt); +- stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); ++ stripe_index = stripe_nr % factor; ++ stripe_nr /= factor; + 
stripe_index *= sub_stripes; +- stripes_per_dev = div_u64_rem(stripe_cnt, factor, +- &remaining_stripes); +- div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); +- last_stripe *= sub_stripes; ++ ++ remaining_stripes = stripe_cnt % factor; ++ stripes_per_dev = stripe_cnt / factor; ++ last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_DUP)) { + *num_stripes = map->num_stripes; + } else { +- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, +- &stripe_index); ++ stripe_index = stripe_nr % map->num_stripes; ++ stripe_nr /= map->num_stripes; + } + + stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); +@@ -6057,15 +6022,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + for (i = 0; i < *num_stripes; i++) { + stripes[i].physical = + map->stripes[stripe_index].physical + +- stripe_offset + stripe_nr * map->stripe_len; ++ stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { +- stripes[i].length = stripes_per_dev * map->stripe_len; ++ stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT; + + if (i / sub_stripes < remaining_stripes) +- stripes[i].length += map->stripe_len; ++ stripes[i].length += BTRFS_STRIPE_LEN; + + /* + * Special for the first stripe and +@@ -6103,83 +6068,6 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + return ERR_PTR(ret); + } + +-/* +- * In dev-replace case, for repair case (that's the only case where the mirror +- * is selected explicitly when calling btrfs_map_block), blocks left of the +- * left cursor can also be read from the target drive. +- * +- * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the +- * array of stripes. +- * For READ, it also needs to be supported using the same mirror number. +- * +- * If the requested block is not left of the left cursor, EIO is returned. This +- * can happen because btrfs_num_copies() returns one more in the dev-replace +- * case. +- */ +-static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, +- u64 logical, u64 length, +- u64 srcdev_devid, int *mirror_num, +- u64 *physical) +-{ +- struct btrfs_io_context *bioc = NULL; +- int num_stripes; +- int index_srcdev = 0; +- int found = 0; +- u64 physical_of_found = 0; +- int i; +- int ret = 0; +- +- ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, +- logical, &length, &bioc, NULL, NULL, 0); +- if (ret) { +- ASSERT(bioc == NULL); +- return ret; +- } +- +- num_stripes = bioc->num_stripes; +- if (*mirror_num > num_stripes) { +- /* +- * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, +- * that means that the requested area is not left of the left +- * cursor +- */ +- btrfs_put_bioc(bioc); +- return -EIO; +- } +- +- /* +- * process the rest of the function using the mirror_num of the source +- * drive. Therefore look it up first. At the end, patch the device +- * pointer to the one of the target drive. 
+- */ +- for (i = 0; i < num_stripes; i++) { +- if (bioc->stripes[i].dev->devid != srcdev_devid) +- continue; +- +- /* +- * In case of DUP, in order to keep it simple, only add the +- * mirror with the lowest physical address +- */ +- if (found && +- physical_of_found <= bioc->stripes[i].physical) +- continue; +- +- index_srcdev = i; +- found = 1; +- physical_of_found = bioc->stripes[i].physical; +- } +- +- btrfs_put_bioc(bioc); +- +- ASSERT(found); +- if (!found) +- return -EIO; +- +- *mirror_num = index_srcdev + 1; +- *physical = physical_of_found; +- return ret; +-} +- + static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) + { + struct btrfs_block_group *cache; +@@ -6198,101 +6086,80 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) + } + + static void handle_ops_on_dev_replace(enum btrfs_map_op op, +- struct btrfs_io_context **bioc_ret, ++ struct btrfs_io_context *bioc, + struct btrfs_dev_replace *dev_replace, + u64 logical, + int *num_stripes_ret, int *max_errors_ret) + { +- struct btrfs_io_context *bioc = *bioc_ret; + u64 srcdev_devid = dev_replace->srcdev->devid; +- int tgtdev_indexes = 0; ++ /* ++ * At this stage, num_stripes is still the real number of stripes, ++ * excluding the duplicated stripes. ++ */ + int num_stripes = *num_stripes_ret; ++ int nr_extra_stripes = 0; + int max_errors = *max_errors_ret; + int i; + +- if (op == BTRFS_MAP_WRITE) { +- int index_where_to_add; ++ /* ++ * A block group which has "to_copy" set will eventually be copied by ++ * the dev-replace process. We can avoid cloning IO here. ++ */ ++ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) ++ return; + +- /* +- * A block group which have "to_copy" set will eventually +- * copied by dev-replace process. We can avoid cloning IO here. +- */ +- if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) +- return; ++ /* ++ * Duplicate the write operations while the dev-replace procedure is ++ * running. Since the copying of the old disk to the new disk takes ++ * place at run time while the filesystem is mounted writable, the ++ * regular write operations to the old disk have to be duplicated to go ++ * to the new disk as well. ++ * ++ * Note that device->missing is handled by the caller, and that the ++ * write to the old disk is already set up in the stripes array. ++ */ ++ for (i = 0; i < num_stripes; i++) { ++ struct btrfs_io_stripe *old = &bioc->stripes[i]; ++ struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes]; + +- /* +- * duplicate the write operations while the dev replace +- * procedure is running. Since the copying of the old disk to +- * the new disk takes place at run time while the filesystem is +- * mounted writable, the regular write operations to the old +- * disk have to be duplicated to go to the new disk as well. +- * +- * Note that device->missing is handled by the caller, and that +- * the write to the old disk is already set up in the stripes +- * array. 
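
The rewritten handle_ops_on_dev_replace() in the added lines nearby collapses the old two-branch logic into a single pass: every write stripe aimed at the source device gets one duplicate pointing at the replace target. A compact user-space sketch of that idea; not part of the patch, types are invented, and the real code additionally records replace_stripe_src and drops the second extra stripe for GET_READ_MIRRORS:

#include <stdint.h>

/* Illustrative sketch only; not part of the patch. */
struct io_stripe {
        int devid;
        uint64_t physical;
};

/*
 * Duplicate every write aimed at the source device onto the replace
 * target. The caller over-allocates the array by two slots, since DUP
 * can place both copies on the source device.
 */
static int dup_write_stripes(struct io_stripe *stripes, int num_stripes,
                             int srcdev_devid, int tgtdev_devid)
{
        int extra = 0;

        for (int i = 0; i < num_stripes; i++) {
                if (stripes[i].devid != srcdev_devid)
                        continue;
                stripes[num_stripes + extra].devid = tgtdev_devid;
                stripes[num_stripes + extra].physical = stripes[i].physical;
                extra++;
        }
        return num_stripes + extra;
}
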
+- */ +- index_where_to_add = num_stripes; +- for (i = 0; i < num_stripes; i++) { +- if (bioc->stripes[i].dev->devid == srcdev_devid) { +- /* write to new disk, too */ +- struct btrfs_io_stripe *new = +- bioc->stripes + index_where_to_add; +- struct btrfs_io_stripe *old = +- bioc->stripes + i; +- +- new->physical = old->physical; +- new->dev = dev_replace->tgtdev; +- bioc->tgtdev_map[i] = index_where_to_add; +- index_where_to_add++; +- max_errors++; +- tgtdev_indexes++; +- } +- } +- num_stripes = index_where_to_add; +- } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { +- int index_srcdev = 0; +- int found = 0; +- u64 physical_of_found = 0; ++ if (old->dev->devid != srcdev_devid) ++ continue; + +- /* +- * During the dev-replace procedure, the target drive can also +- * be used to read data in case it is needed to repair a corrupt +- * block elsewhere. This is possible if the requested area is +- * left of the left cursor. In this area, the target drive is a +- * full copy of the source drive. +- */ +- for (i = 0; i < num_stripes; i++) { +- if (bioc->stripes[i].dev->devid == srcdev_devid) { +- /* +- * In case of DUP, in order to keep it simple, +- * only add the mirror with the lowest physical +- * address +- */ +- if (found && +- physical_of_found <= bioc->stripes[i].physical) +- continue; +- index_srcdev = i; +- found = 1; +- physical_of_found = bioc->stripes[i].physical; +- } +- } +- if (found) { +- struct btrfs_io_stripe *tgtdev_stripe = +- bioc->stripes + num_stripes; ++ new->physical = old->physical; ++ new->dev = dev_replace->tgtdev; ++ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) ++ bioc->replace_stripe_src = i; ++ nr_extra_stripes++; ++ } ++ ++ /* We can only have at most 2 extra nr_stripes (for DUP). */ ++ ASSERT(nr_extra_stripes <= 2); ++ /* ++ * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for ++ * replace. ++ * If we have 2 extra stripes, only choose the one with smaller physical. ++ */ ++ if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { ++ struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; ++ struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; + +- tgtdev_stripe->physical = physical_of_found; +- tgtdev_stripe->dev = dev_replace->tgtdev; +- bioc->tgtdev_map[index_srcdev] = num_stripes; ++ /* Only DUP can have two extra stripes. */ ++ ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); + +- tgtdev_indexes++; +- num_stripes++; ++ /* ++ * Swap the last stripe stripes and reduce @nr_extra_stripes. ++ * The extra stripe would still be there, but won't be accessed. ++ */ ++ if (first->physical > second->physical) { ++ swap(second->physical, first->physical); ++ swap(second->dev, first->dev); ++ nr_extra_stripes--; + } + } + +- *num_stripes_ret = num_stripes; +- *max_errors_ret = max_errors; +- bioc->num_tgtdevs = tgtdev_indexes; +- *bioc_ret = bioc; ++ *num_stripes_ret = num_stripes + nr_extra_stripes; ++ *max_errors_ret = max_errors + nr_extra_stripes; ++ bioc->replace_nr_stripes = nr_extra_stripes; + } + + static bool need_full_stripe(enum btrfs_map_op op) +@@ -6301,25 +6168,35 @@ static bool need_full_stripe(enum btrfs_map_op op) + } + + static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, +- u64 offset, u64 *stripe_nr, u64 *stripe_offset, ++ u64 offset, u32 *stripe_nr, u64 *stripe_offset, + u64 *full_stripe_start) + { +- u32 stripe_len = map->stripe_len; +- + ASSERT(op != BTRFS_MAP_DISCARD); + + /* + * Stripe_nr is the stripe where this block falls. 
stripe_offset is + * the offset of this block in its stripe. + */ +- *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); ++ *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; ++ *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; + ASSERT(*stripe_offset < U32_MAX); + + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { +- unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); ++ unsigned long full_stripe_len = nr_data_stripes(map) << ++ BTRFS_STRIPE_LEN_SHIFT; + ++ /* ++ * For full stripe start, we use previously calculated ++ * @stripe_nr. Align it to nr_data_stripes, then multiply with ++ * STRIPE_LEN. ++ * ++ * By this we can avoid u64 division completely. And we have ++ * to go rounddown(), not round_down(), as nr_data_stripes is ++ * not ensured to be power of 2. ++ */ + *full_stripe_start = +- div64_u64(offset, full_stripe_len) * full_stripe_len; ++ rounddown(*stripe_nr, nr_data_stripes(map)) << ++ BTRFS_STRIPE_LEN_SHIFT; + + /* + * For writes to RAID56, allow to write a full stripe set, but +@@ -6334,16 +6211,16 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, + * a single disk). + */ + if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) +- return stripe_len - *stripe_offset; ++ return BTRFS_STRIPE_LEN - *stripe_offset; + return U64_MAX; + } + + static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, +- u32 stripe_index, u64 stripe_offset, u64 stripe_nr) ++ u32 stripe_index, u64 stripe_offset, u32 stripe_nr) + { + dst->dev = map->stripes[stripe_index].dev; + dst->physical = map->stripes[stripe_index].physical + +- stripe_offset + stripe_nr * map->stripe_len; ++ stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + } + + int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, +@@ -6356,35 +6233,35 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + struct map_lookup *map; + u64 map_offset; + u64 stripe_offset; +- u64 stripe_nr; +- u64 stripe_len; ++ u32 stripe_nr; + u32 stripe_index; + int data_stripes; + int i; + int ret = 0; + int mirror_num = (mirror_num_ret ? 
*mirror_num_ret : 0); + int num_stripes; ++ int num_copies; + int max_errors = 0; +- int tgtdev_indexes = 0; + struct btrfs_io_context *bioc = NULL; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; +- int num_alloc_stripes; +- int patch_the_first_stripe_for_dev_replace = 0; +- u64 physical_to_patch_in_first_stripe = 0; ++ u16 num_alloc_stripes; + u64 raid56_full_stripe_start = (u64)-1; + u64 max_len; + + ASSERT(bioc_ret); + ASSERT(op != BTRFS_MAP_DISCARD); + ++ num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); ++ if (mirror_num > num_copies) ++ return -EINVAL; ++ + em = btrfs_get_chunk_map(fs_info, logical, *length); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + data_stripes = nr_data_stripes(map); +- stripe_len = map->stripe_len; + + map_offset = logical - em->start; + max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, +@@ -6400,25 +6277,11 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + if (!dev_replace_is_ongoing) + up_read(&dev_replace->rwsem); + +- if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && +- !need_full_stripe(op) && dev_replace->tgtdev != NULL) { +- ret = get_extra_mirror_from_replace(fs_info, logical, *length, +- dev_replace->srcdev->devid, +- &mirror_num, +- &physical_to_patch_in_first_stripe); +- if (ret) +- goto out; +- else +- patch_the_first_stripe_for_dev_replace = 1; +- } else if (mirror_num > map->num_stripes) { +- mirror_num = 0; +- } +- + num_stripes = 1; + stripe_index = 0; + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { +- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, +- &stripe_index); ++ stripe_index = stripe_nr % map->num_stripes; ++ stripe_nr /= map->num_stripes; + if (!need_full_stripe(op)) + mirror_num = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { +@@ -6444,8 +6307,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + u32 factor = map->num_stripes / map->sub_stripes; + +- stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); +- stripe_index *= map->sub_stripes; ++ stripe_index = (stripe_nr % factor) * map->sub_stripes; ++ stripe_nr /= factor; + + if (need_full_stripe(op)) + num_stripes = map->sub_stripes; +@@ -6460,11 +6323,17 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + } + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { +- ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); + if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { +- /* push stripe_nr back to the start of the full stripe */ +- stripe_nr = div64_u64(raid56_full_stripe_start, +- stripe_len * data_stripes); ++ /* ++ * Push stripe_nr back to the start of the full stripe ++ * For those cases needing a full stripe, @stripe_nr ++ * is the full stripe number. ++ * ++ * Originally we go raid56_full_stripe_start / full_stripe_len, ++ * but that can be expensive. Here we just divide ++ * @stripe_nr with @data_stripes. ++ */ ++ stripe_nr /= data_stripes; + + /* RAID[56] write or recovery. 
Return all stripes */ + num_stripes = map->num_stripes; +@@ -6473,7 +6342,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + /* Return the length to the full stripe end */ + *length = min(logical + *length, + raid56_full_stripe_start + em->start + +- data_stripes * stripe_len) - logical; ++ (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical; + stripe_index = 0; + stripe_offset = 0; + } else { +@@ -6482,25 +6351,24 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + * Mirror #2 is RAID5 parity block. + * Mirror #3 is RAID6 Q block. + */ +- stripe_nr = div_u64_rem(stripe_nr, +- data_stripes, &stripe_index); ++ stripe_index = stripe_nr % data_stripes; ++ stripe_nr /= data_stripes; + if (mirror_num > 1) + stripe_index = data_stripes + mirror_num - 2; + + /* We distribute the parity blocks across stripes */ +- div_u64_rem(stripe_nr + stripe_index, map->num_stripes, +- &stripe_index); ++ stripe_index = (stripe_nr + stripe_index) % map->num_stripes; + if (!need_full_stripe(op) && mirror_num <= 1) + mirror_num = 1; + } + } else { + /* +- * after this, stripe_nr is the number of stripes on this ++ * After this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array + */ +- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, +- &stripe_index); ++ stripe_index = stripe_nr % map->num_stripes; ++ stripe_nr /= map->num_stripes; + mirror_num = stripe_index + 1; + } + if (stripe_index >= map->num_stripes) { +@@ -6512,13 +6380,16 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + } + + num_alloc_stripes = num_stripes; +- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { +- if (op == BTRFS_MAP_WRITE) +- num_alloc_stripes <<= 1; +- if (op == BTRFS_MAP_GET_READ_MIRRORS) +- num_alloc_stripes++; +- tgtdev_indexes = num_stripes; +- } ++ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && ++ op != BTRFS_MAP_READ) ++ /* ++ * For replace case, we need to add extra stripes for extra ++ * duplicated stripes. ++ * ++ * For both WRITE and GET_READ_MIRRORS, we may have at most ++ * 2 more stripes (DUP types, otherwise 1). 
++ */ ++ num_alloc_stripes += 2; + + /* + * If this I/O maps to a single device, try to return the device and +@@ -6529,53 +6400,53 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && + (!need_full_stripe(op) || !dev_replace_is_ongoing || + !dev_replace->tgtdev)) { +- if (patch_the_first_stripe_for_dev_replace) { +- smap->dev = dev_replace->tgtdev; +- smap->physical = physical_to_patch_in_first_stripe; +- *mirror_num_ret = map->num_stripes + 1; +- } else { +- set_io_stripe(smap, map, stripe_index, stripe_offset, +- stripe_nr); +- *mirror_num_ret = mirror_num; +- } ++ set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); ++ *mirror_num_ret = mirror_num; + *bioc_ret = NULL; + ret = 0; + goto out; + } + +- bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); ++ bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes); + if (!bioc) { + ret = -ENOMEM; + goto out; + } ++ bioc->map_type = map->type; + +- for (i = 0; i < num_stripes; i++) { +- set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset, +- stripe_nr); +- stripe_index++; +- } +- +- /* Build raid_map */ ++ /* ++ * For RAID56 full map, we need to make sure the stripes[] follows the ++ * rule that data stripes are all ordered, then followed with P and Q ++ * (if we have). ++ * ++ * It's still mostly the same as other profiles, just with extra rotation. ++ */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && + (need_full_stripe(op) || mirror_num > 1)) { +- u64 tmp; +- unsigned rot; +- +- /* Work out the disk rotation on this stripe-set */ +- div_u64_rem(stripe_nr, num_stripes, &rot); +- +- /* Fill in the logical address of each stripe */ +- tmp = stripe_nr * data_stripes; +- for (i = 0; i < data_stripes; i++) +- bioc->raid_map[(i + rot) % num_stripes] = +- em->start + (tmp + i) * map->stripe_len; +- +- bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; +- if (map->type & BTRFS_BLOCK_GROUP_RAID6) +- bioc->raid_map[(i + rot + 1) % num_stripes] = +- RAID6_Q_STRIPE; +- +- sort_parity_stripes(bioc, num_stripes); ++ /* ++ * For RAID56 @stripe_nr is already the number of full stripes ++ * before us, which is also the rotation value (needs to modulo ++ * with num_stripes). ++ * ++ * In this case, we just add @stripe_nr with @i, then do the ++ * modulo, to reduce one modulo call. ++ */ ++ bioc->full_stripe_logical = em->start + ++ ((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT); ++ for (i = 0; i < num_stripes; i++) ++ set_io_stripe(&bioc->stripes[i], map, ++ (i + stripe_nr) % num_stripes, ++ stripe_offset, stripe_nr); ++ } else { ++ /* ++ * For all other non-RAID56 profiles, just copy the target ++ * stripe into the bioc. 
++ */ ++ for (i = 0; i < num_stripes; i++) { ++ set_io_stripe(&bioc->stripes[i], map, stripe_index, ++ stripe_offset, stripe_nr); ++ stripe_index++; ++ } + } + + if (need_full_stripe(op)) +@@ -6583,27 +6454,15 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + need_full_stripe(op)) { +- handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, ++ handle_ops_on_dev_replace(op, bioc, dev_replace, logical, + &num_stripes, &max_errors); + } + + *bioc_ret = bioc; +- bioc->map_type = map->type; + bioc->num_stripes = num_stripes; + bioc->max_errors = max_errors; + bioc->mirror_num = mirror_num; + +- /* +- * this is the case that REQ_READ && dev_replace_is_ongoing && +- * mirror_num == num_stripes + 1 && dev_replace target drive is +- * available as a mirror +- */ +- if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { +- WARN_ON(num_stripes > 1); +- bioc->stripes[0].dev = dev_replace->tgtdev; +- bioc->stripes[0].physical = physical_to_patch_in_first_stripe; +- bioc->mirror_num = map->num_stripes + 1; +- } + out: + if (dev_replace_is_ongoing) { + lockdep_assert_held(&dev_replace->rwsem); +@@ -6941,7 +6800,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, + map->num_stripes = num_stripes; + map->io_width = btrfs_chunk_io_width(leaf, chunk); + map->io_align = btrfs_chunk_io_align(leaf, chunk); +- map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + map->type = type; + /* + * We can't use the sub_stripes value, as for profiles other than +@@ -8161,3 +8019,76 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) + + return true; + } ++ ++static void map_raid56_repair_block(struct btrfs_io_context *bioc, ++ struct btrfs_io_stripe *smap, ++ u64 logical) ++{ ++ int data_stripes = nr_bioc_data_stripes(bioc); ++ int i; ++ ++ for (i = 0; i < data_stripes; i++) { ++ u64 stripe_start = bioc->full_stripe_logical + ++ (i << BTRFS_STRIPE_LEN_SHIFT); ++ ++ if (logical >= stripe_start && ++ logical < stripe_start + BTRFS_STRIPE_LEN) ++ break; ++ } ++ ASSERT(i < data_stripes); ++ smap->dev = bioc->stripes[i].dev; ++ smap->physical = bioc->stripes[i].physical + ++ ((logical - bioc->full_stripe_logical) & ++ BTRFS_STRIPE_LEN_MASK); ++} ++ ++/* ++ * Map a repair write into a single device. ++ * ++ * A repair write is triggered by read time repair or scrub, which would only ++ * update the contents of a single device. ++ * Not update any other mirrors nor go through RMW path. ++ * ++ * Callers should ensure: ++ * ++ * - Call btrfs_bio_counter_inc_blocked() first ++ * - The range does not cross stripe boundary ++ * - Has a valid @mirror_num passed in. ++ */ ++int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, ++ struct btrfs_io_stripe *smap, u64 logical, ++ u32 length, int mirror_num) ++{ ++ struct btrfs_io_context *bioc = NULL; ++ u64 map_length = length; ++ int mirror_ret = mirror_num; ++ int ret; ++ ++ ASSERT(mirror_num > 0); ++ ++ ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, ++ &bioc, smap, &mirror_ret, true); ++ if (ret < 0) ++ return ret; ++ ++ /* The map range should not cross stripe boundary. */ ++ ASSERT(map_length >= length); ++ ++ /* Already mapped to single stripe. */ ++ if (!bioc) ++ goto out; ++ ++ /* Map the RAID56 multi-stripe writes to a single one. 
*/ ++ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ++ map_raid56_repair_block(bioc, smap, logical); ++ goto out; ++ } ++ ++ ASSERT(mirror_num <= bioc->num_stripes); ++ smap->dev = bioc->stripes[mirror_num - 1].dev; ++ smap->physical = bioc->stripes[mirror_num - 1].physical; ++out: ++ btrfs_put_bioc(bioc); ++ ASSERT(smap->dev); ++ return 0; ++} +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 7e51f2238f72..bf47a1a70813 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -17,7 +17,11 @@ + + extern struct mutex uuid_mutex; + +-#define BTRFS_STRIPE_LEN SZ_64K ++#define BTRFS_STRIPE_LEN SZ_64K ++#define BTRFS_STRIPE_LEN_SHIFT (16) ++#define BTRFS_STRIPE_LEN_MASK (BTRFS_STRIPE_LEN - 1) ++ ++static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); + + /* Used by sanity check for btrfs_raid_types. */ + #define const_ffs(n) (__builtin_ctzll(n) + 1) +@@ -404,17 +408,74 @@ struct btrfs_io_context { + u64 map_type; /* get from map_lookup->type */ + struct bio *orig_bio; + atomic_t error; +- int max_errors; +- int num_stripes; +- int mirror_num; +- int num_tgtdevs; +- int *tgtdev_map; ++ u16 max_errors; ++ ++ /* ++ * The total number of stripes, including the extra duplicated ++ * stripe for replace. ++ */ ++ u16 num_stripes; ++ ++ /* ++ * The mirror_num of this bioc. ++ * ++ * This is for reads which use 0 as mirror_num, thus we should return a ++ * valid mirror_num (>0) for the reader. ++ */ ++ u16 mirror_num; ++ ++ /* ++ * The following two members are for dev-replace case only. ++ * ++ * @replace_nr_stripes: Number of duplicated stripes which need to be ++ * written to replace target. ++ * Should be <= 2 (2 for DUP, otherwise <= 1). ++ * @replace_stripe_src: The array indicates where the duplicated stripes ++ * are from. ++ * ++ * The @replace_stripe_src[] array is mostly for RAID56 cases. ++ * As non-RAID56 stripes share the same contents of the mapped range, ++ * thus no need to bother where the duplicated ones are from. ++ * ++ * But for RAID56 case, all stripes contain different contents, thus ++ * we need a way to know the mapping. ++ * ++ * There is an example for the two members, using a RAID5 write: ++ * ++ * num_stripes: 4 (3 + 1 duplicated write) ++ * stripes[0]: dev = devid 1, physical = X ++ * stripes[1]: dev = devid 2, physical = Y ++ * stripes[2]: dev = devid 3, physical = Z ++ * stripes[3]: dev = devid 0, physical = Y ++ * ++ * replace_nr_stripes = 1 ++ * replace_stripe_src = 1 <- Means stripes[1] is involved in replace. ++ * The duplicated stripe index would be ++ * (@num_stripes - 1). ++ * ++ * Note, that we can still have cases replace_nr_stripes = 2 for DUP. ++ * In that case, all stripes share the same content, thus we don't ++ * need to bother @replace_stripe_src value at all. ++ */ ++ u16 replace_nr_stripes; ++ s16 replace_stripe_src; + /* +- * logical block numbers for the start of each stripe +- * The last one or two are p/q. These are sorted, +- * so raid_map[0] is the start of our full stripe ++ * Logical bytenr of the full stripe start, only for RAID56 cases. ++ * ++ * When this value is set to other than (u64)-1, the stripes[] should ++ * follow this pattern: ++ * ++ * (real_stripes = num_stripes - replace_nr_stripes) ++ * (data_stripes = (is_raid6) ? (real_stripes - 2) : (real_stripes - 1)) ++ * ++ * stripes[0]: The first data stripe ++ * stripes[1]: The second data stripe ++ * ... 
++ * stripes[data_stripes - 1]: The last data stripe ++ * stripes[data_stripes]: The P stripe ++ * stripes[data_stripes + 1]: The Q stripe (only for RAID6). + */ +- u64 *raid_map; ++ u64 full_stripe_logical; + struct btrfs_io_stripe stripes[]; + }; + +@@ -446,7 +507,6 @@ struct map_lookup { + u64 type; + int io_align; + int io_width; +- u32 stripe_len; + int num_stripes; + int sub_stripes; + int verified_stripes; /* For mount time dev extent verification */ +@@ -527,6 +587,9 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map); ++int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, ++ struct btrfs_io_stripe *smap, u64 logical, ++ u32 length, int mirror_num); + struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes); +diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c +index da7bb9187b68..8acb05e176c5 100644 +--- a/fs/btrfs/zlib.c ++++ b/fs/btrfs/zlib.c +@@ -350,8 +350,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + zlib_inflateEnd(&workspace->strm); + if (data_in) + kunmap_local(data_in); +- if (!ret) +- zero_fill_bio(cb->orig_bio); + return ret; + } + +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 45d04092f2f8..a9b32ba6b2ce 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -1640,14 +1640,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) + { + u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_inode *inode = bbio->inode; +- struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_block_group *cache; + bool ret = false; + + if (!btrfs_is_zoned(fs_info)) + return false; + +- if (!is_data_inode(&inode->vfs_inode)) ++ if (!inode || !is_data_inode(&inode->vfs_inode)) + return false; + + if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) +diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c +index e34f1ab99d56..f798da267590 100644 +--- a/fs/btrfs/zstd.c ++++ b/fs/btrfs/zstd.c +@@ -609,7 +609,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + } + } + ret = 0; +- zero_fill_bio(cb->orig_bio); + done: + if (workspace->in_buf.src) + kunmap_local(workspace->in_buf.src); +diff --git a/include/linux/bio.h b/include/linux/bio.h +index d766be7152e1..b3e7529ff55e 100644 +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -500,6 +500,7 @@ void bio_associate_blkg(struct bio *bio); + void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css); + void bio_clone_blkg_association(struct bio *dst, struct bio *src); ++void blkcg_punt_bio_submit(struct bio *bio); + #else /* CONFIG_BLK_CGROUP */ + static inline void bio_associate_blkg(struct bio *bio) { } + static inline void bio_associate_blkg_from_css(struct bio *bio, +@@ -507,6 +508,10 @@ static inline void bio_associate_blkg_from_css(struct bio *bio, + { } + static inline void bio_clone_blkg_association(struct bio *dst, + struct bio *src) { } ++static inline void blkcg_punt_bio_submit(struct bio *bio) ++{ ++ submit_bio(bio); ++} + #endif /* CONFIG_BLK_CGROUP */ + + static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) +diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h +index 99be590f952f..fb8843990d28 100644 +--- a/include/linux/blk_types.h ++++ b/include/linux/blk_types.h +@@ -404,18 +404,11 @@ enum req_flag_bits { + 
__REQ_RAHEAD, /* read ahead, can fail anytime */ + __REQ_BACKGROUND, /* background IO */ + __REQ_NOWAIT, /* Don't wait if request will block */ +- /* +- * When a shared kthread needs to issue a bio for a cgroup, doing +- * so synchronously can lead to priority inversions as the kthread +- * can be trapped waiting for that cgroup. CGROUP_PUNT flag makes +- * submit_bio() punt the actual issuing to a dedicated per-blkcg +- * work item to avoid such priority inversions. +- */ +- __REQ_CGROUP_PUNT, + __REQ_POLLED, /* caller polls for completion using bio_poll */ + __REQ_ALLOC_CACHE, /* allocate IO from cache if available */ + __REQ_SWAP, /* swap I/O */ + __REQ_DRV, /* for driver use */ ++ __REQ_FS_PRIVATE, /* for file system (submitter) use */ + + /* + * Command specific flags, keep last: +@@ -443,14 +436,13 @@ enum req_flag_bits { + #define REQ_RAHEAD (__force blk_opf_t)(1ULL << __REQ_RAHEAD) + #define REQ_BACKGROUND (__force blk_opf_t)(1ULL << __REQ_BACKGROUND) + #define REQ_NOWAIT (__force blk_opf_t)(1ULL << __REQ_NOWAIT) +-#define REQ_CGROUP_PUNT (__force blk_opf_t)(1ULL << __REQ_CGROUP_PUNT) +- +-#define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) + #define REQ_POLLED (__force blk_opf_t)(1ULL << __REQ_POLLED) + #define REQ_ALLOC_CACHE (__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE) +- +-#define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) + #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) ++#define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) ++#define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) ++ ++#define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) + + #define REQ_FAILFAST_MASK \ + (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) +diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h +index bd21af828ff6..357ae4611a45 100644 +--- a/include/linux/crc32c.h ++++ b/include/linux/crc32c.h +@@ -5,7 +5,6 @@ + #include + + extern u32 crc32c(u32 crc, const void *address, unsigned int length); +-extern const char *crc32c_impl(void); + + /* This macro exists for backwards-compatibility. */ + #define crc32c_le crc32c +diff --git a/include/linux/writeback.h b/include/linux/writeback.h +index 46020373e155..fba937999fbf 100644 +--- a/include/linux/writeback.h ++++ b/include/linux/writeback.h +@@ -70,8 +70,6 @@ struct writeback_control { + */ + unsigned no_cgroup_owner:1; + +- unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */ +- + /* To enable batching of swap writes to non-block-device backends, + * "plug" can be set point to a 'struct swap_iocb *'. 
When all swap + * writes have been submitted, if with swap_iocb is not NULL, +@@ -97,9 +95,6 @@ static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc) + { + blk_opf_t flags = 0; + +- if (wbc->punt_to_cgroup) +- flags = REQ_CGROUP_PUNT; +- + if (wbc->sync_mode == WB_SYNC_ALL) + flags |= REQ_SYNC; + else if (wbc->for_kupdate || wbc->for_background) +diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h +index 75d7d22c3a27..8ea9cea9bfeb 100644 +--- a/include/trace/events/btrfs.h ++++ b/include/trace/events/btrfs.h +@@ -2422,7 +2422,7 @@ DECLARE_EVENT_CLASS(btrfs_raid56_bio, + ), + + TP_fast_assign_btrfs(rbio->bioc->fs_info, +- __entry->full_stripe = rbio->bioc->raid_map[0]; ++ __entry->full_stripe = rbio->bioc->full_stripe_logical; + __entry->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + __entry->len = bio->bi_iter.bi_size; + __entry->opf = bio_op(bio); +diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h +index ada0a489bf2b..dbb8b96da50d 100644 +--- a/include/uapi/linux/btrfs.h ++++ b/include/uapi/linux/btrfs.h +@@ -187,6 +187,7 @@ struct btrfs_scrub_progress { + }; + + #define BTRFS_SCRUB_READONLY 1 ++#define BTRFS_SCRUB_SUPPORTED_FLAGS (BTRFS_SCRUB_READONLY) + struct btrfs_ioctl_scrub_args { + __u64 devid; /* in */ + __u64 start; /* in */ +diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c +index 5ca0d815a95d..649e687413a0 100644 +--- a/lib/libcrc32c.c ++++ b/lib/libcrc32c.c +@@ -65,12 +65,6 @@ static void __exit libcrc32c_mod_fini(void) + crypto_free_shash(tfm); + } + +-const char *crc32c_impl(void) +-{ +- return crypto_shash_driver_name(tfm); +-} +-EXPORT_SYMBOL(crc32c_impl); +- + module_init(libcrc32c_mod_init); + module_exit(libcrc32c_mod_fini); + +diff --git a/tools/objtool/check.c b/tools/objtool/check.c +index f937be1afe65..060032cfb046 100644 +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -202,6 +202,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, + "__reiserfs_panic", + "__stack_chk_fail", + "__ubsan_handle_builtin_unreachable", ++ "btrfs_assertfail", + "cpu_bringup_and_idle", + "cpu_startup_entry", + "do_exit", +-- +2.40.1 + +From 0ad50219edceae27eb649c5fb76f2b8aebe27e3f Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 17 Apr 2023 18:32:06 +0200 -Subject: [PATCH 5/8] Implement amd-pstate guided driver +Subject: [PATCH 06/10] Implement amd-pstate guided driver Signed-off-by: Peter Jung --- @@ -9873,10 +24825,10 @@ index f5f22418e64b..c10ebf8c42e6 100644 -- 2.40.1 -From bf906393dd0d9e24858f3cfd6a9a5d890817cbf6 Mon Sep 17 00:00:00 2001 +From 3162c47812c5d8dac222403897b3c8f424648c6e Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 17 Apr 2023 18:28:52 +0200 -Subject: [PATCH 6/8] ksm +Subject: [PATCH 07/10] ksm Signed-off-by: Peter Jung --- @@ -10373,10 +25325,1492 @@ index 340125d08c03..36e756355f04 100644 -- 2.40.1 -From 2f73f41267f19f290a306fde77bc648cc321f8d6 Mon Sep 17 00:00:00 2001 +From 26780b606ac659096b0e1a9a2bba12aa747cbf66 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 28 Apr 2023 20:00:54 +0200 +Subject: [PATCH 08/10] Per-VMA locks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Previous versions: +v3: https://lore.kernel.org/all/20230216051750.3125598-1-surenb@google.com/ +v2: https://lore.kernel.org/lkml/20230127194110.533103-1-surenb@google.com/ +v1: https://lore.kernel.org/all/20230109205336.3665937-1-surenb@google.com/ +RFC: https://lore.kernel.org/all/20220901173516.702122-1-surenb@google.com/ + 
+LWN article describing the feature: +https://lwn.net/Articles/906852/ + +Per-vma locks idea that was discussed during SPF [1] discussion at LSF/MM +last year [2], which concluded with suggestion that “a reader/writer +semaphore could be put into the VMA itself; that would have the effect of +using the VMA as a sort of range lock. There would still be contention at +the VMA level, but it would be an improvement.” This patchset implements +this suggested approach. + +When handling page faults we lookup the VMA that contains the faulting +page under RCU protection and try to acquire its lock. If that fails we +fall back to using mmap_lock, similar to how SPF handled this situation. + +One notable way the implementation deviates from the proposal is the way +VMAs are read-locked. During some of mm updates, multiple VMAs need to be +locked until the end of the update (e.g. vma_merge, split_vma, etc). +Tracking all the locked VMAs, avoiding recursive locks, figuring out when +it's safe to unlock previously locked VMAs would make the code more +complex. So, instead of the usual lock/unlock pattern, the proposed +solution marks a VMA as locked and provides an efficient way to: +1. Identify locked VMAs. +2. Unlock all locked VMAs in bulk. +We also postpone unlocking the locked VMAs until the end of the update, +when we do mmap_write_unlock. Potentially this keeps a VMA locked for +longer than is absolutely necessary but it results in a big reduction of +code complexity. +Read-locking a VMA is done using two sequence numbers - one in the +vm_area_struct and one in the mm_struct. VMA is considered read-locked +when these sequence numbers are equal. To read-lock a VMA we set the +sequence number in vm_area_struct to be equal to the sequence number in +mm_struct. To unlock all VMAs we increment mm_struct's seq number. This +allows for an efficient way to track locked VMAs and to drop the locks on +all VMAs at the end of the update. + +The patchset implements per-VMA locking only for anonymous pages which +are not in swap and avoids userfaultfs as their implementation is more +complex. Additional support for file-back page faults, swapped and user +pages can be added incrementally. + +Performance benchmarks show similar although slightly smaller benefits as +with SPF patchset (~75% of SPF benefits). Still, with lower complexity +this approach might be more desirable. + +Since RFC was posted in September 2022, two separate Google teams outside +of Android evaluated the patchset and confirmed positive results. Here are +the known usecases when per-VMA locks show benefits: + +Android: +Apps with high number of threads (~100) launch times improve by up to 20%. +Each thread mmaps several areas upon startup (Stack and Thread-local +storage (TLS), thread signal stack, indirect ref table), which requires +taking mmap_lock in write mode. Page faults take mmap_lock in read mode. +During app launch, both thread creation and page faults establishing the +active workinget are happening in parallel and that causes lock contention +between mm writers and readers even if updates and page faults are +happening in different VMAs. Per-vma locks prevent this contention by +providing more granular lock. + +Google Fibers: +We have several dynamically sized thread pools that spawn new threads +under increased load and reduce their number when idling. For example, +Google's in-process scheduling/threading framework, UMCG/Fibers, is backed +by such a thread pool. 
When idling, only a small number of idle worker +threads are available; when a spike of incoming requests arrive, each +request is handled in its own "fiber", which is a work item posted onto a +UMCG worker thread; quite often these spikes lead to a number of new +threads spawning. Each new thread needs to allocate and register an RSEQ +section on its TLS, then register itself with the kernel as a UMCG worker +thread, and only after that it can be considered by the in-process +UMCG/Fiber scheduler as available to do useful work. In short, during an +incoming workload spike new threads have to be spawned, and they perform +several syscalls (RSEQ registration, UMCG worker registration, memory +allocations) before they can actually start doing useful work. Removing +any bottlenecks on this thread startup path will greatly improve our +services' latencies when faced with request/workload spikes. +At high scale, mmap_lock contention during thread creation and stack page +faults leads to user-visible multi-second serving latencies in a similar +pattern to Android app startup. Per-VMA locking patchset has been run +successfully in limited experiments with user-facing production workloads. +In these experiments, we observed that the peak thread creation rate was +high enough that thread creation is no longer a bottleneck. + +TCP zerocopy receive: +From the point of view of TCP zerocopy receive, the per-vma lock patch is +massively beneficial. +In today's implementation, a process with N threads where N - 1 are +performing zerocopy receive and 1 thread is performing madvise() with the +write lock taken (e.g. needs to change vm_flags) will result in all N -1 +receive threads blocking until the madvise is done. Conversely, on a busy +process receiving a lot of data, an madvise operation that does need to +take the mmap lock in write mode will need to wait for all of the receives +to be done - a lose:lose proposition. Per-VMA locking _removes_ by +definition this source of contention entirely. +There are other benefits for receive as well, chiefly a reduction in +cacheline bouncing across receiving threads for locking/unlocking the +single mmap lock. On an RPC style synthetic workload with 4KB RPCs: +1a) The find+lock+unlock VMA path in the base case, without the per-vma +lock patchset, is about 0.7% of cycles as measured by perf. +1b) mmap_read_lock + mmap_read_unlock in the base case is about 0.5% +cycles overall - most of this is within the TCP read hotpath (a small +fraction is 'other' usage in the system). +2a) The find+lock+unlock VMA path, with the per-vma patchset and a trivial +patch written to take advantage of it in TCP, is about 0.4% of cycles +(down from 0.7% above) +2b) mmap_read_lock + mmap_read_unlock in the per-vma patchset is < 0.1% +cycles and is out of the TCP read hotpath entirely (down from 0.5% before, +the remaining usage is the 'other' usage in the system). +So, in addition to entirely removing an onerous source of contention, it +also reduces the CPU cycles of TCP receive zerocopy by about 0.5%+ +(compared to overall cycles in perf) for the 'small' RPC scenario. 
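
[Editor's note] To make the sequence-number read-locking described earlier in this cover letter concrete, the helpers this patch adds further down (the include/linux/mm.h and include/linux/mmap_lock.h hunks) reduce to roughly the following; this is a condensed sketch, with the lockdep/VM_BUG_ON assertions and the overflow-handling commentary trimmed:

/*
 * Read side, called from the page fault path.  A "false" return means
 * "fall back to mmap_lock"; a false locked result only costs performance,
 * a false unlocked result must never happen.
 */
static inline bool vma_start_read(struct vm_area_struct *vma)
{
	/* VMA already write-locked during the current mmap_lock update? */
	if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
		return false;

	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
		return false;

	/* Re-check under vm_lock: a writer may have just marked this VMA. */
	if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
		up_read(&vma->vm_lock->lock);
		return false;
	}
	return true;
}

/* Write side, called with mmap_lock held for write: mark one VMA locked. */
static inline void vma_start_write(struct vm_area_struct *vma)
{
	int mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq);

	if (vma->vm_lock_seq == mm_lock_seq)
		return;		/* already marked in this update cycle */

	down_write(&vma->vm_lock->lock);
	vma->vm_lock_seq = mm_lock_seq;
	up_write(&vma->vm_lock->lock);
}

/* Called from mmap_write_unlock(): drop all VMA write locks in bulk. */
static inline void vma_end_write_all(struct mm_struct *mm)
{
	/* No races: the exclusive mmap_lock is still held here. */
	WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1);
}

vma_end_read() is then just up_read(&vma->vm_lock->lock) under rcu_read_lock(), and because releasing the write side is a single counter increment, mmap_write_unlock() drops every VMA locked during the update at once.
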
+ +The patchset structure is: +0001-0008: Enable maple-tree RCU mode +0009-0031: Main per-vma locks patchset +0032-0033: Performance optimizations + +Changes since v3: +- Changed patch [3] to move vma_prepare before vma_adjust_trans_huge +- Dropped patch [4] from the set as unnecessary, per Hyeonggon Yoo +- Changed patch [5] to do VMA locking inside vma_prepare, per Liam Howlett +- Dropped patch [6] from the set as unnecessary, per Liam Howlett + +[1] https://lore.kernel.org/all/20220128131006.67712-1-michel@lespinasse.org/ +[2] https://lwn.net/Articles/893906/ +[3] https://lore.kernel.org/all/20230216051750.3125598-15-surenb@google.com/ +[4] https://lore.kernel.org/all/20230216051750.3125598-17-surenb@google.com/ +[5] https://lore.kernel.org/all/20230216051750.3125598-18-surenb@google.com/ +[6] https://lore.kernel.org/all/20230216051750.3125598-22-surenb@google.com/ + +The patchset applies cleanly over mm-unstable branch. + +Laurent Dufour (1): + powerc/mm: try VMA lock-based page fault handling first + +Liam Howlett (4): + maple_tree: Be more cautious about dead nodes + maple_tree: Detect dead nodes in mas_start() + maple_tree: Fix freeing of nodes in rcu mode + maple_tree: remove extra smp_wmb() from mas_dead_leaves() + +Liam R. Howlett (4): + maple_tree: Fix write memory barrier of nodes once dead for RCU mode + maple_tree: Add smp_rmb() to dead node detection + maple_tree: Add RCU lock checking to rcu callback functions + mm: Enable maple tree RCU mode by default. + +Michel Lespinasse (1): + mm: rcu safe VMA freeing + +Suren Baghdasaryan (23): + mm: introduce CONFIG_PER_VMA_LOCK + mm: move mmap_lock assert function definitions + mm: add per-VMA lock and helper functions to control it + mm: mark VMA as being written when changing vm_flags + mm/mmap: move vma_prepare before vma_adjust_trans_huge + mm/khugepaged: write-lock VMA while collapsing a huge page + mm/mmap: write-lock VMAs in vma_prepare before modifying them + mm/mremap: write-lock VMA while remapping it to a new address range + mm: write-lock VMAs before removing them from VMA tree + mm: conditionally write-lock VMA in free_pgtables + kernel/fork: assert no VMA readers during its destruction + mm/mmap: prevent pagefault handler from racing with mmu_notifier + registration + mm: introduce vma detached flag + mm: introduce lock_vma_under_rcu to be used from arch-specific code + mm: fall back to mmap_lock if vma->anon_vma is not yet set + mm: add FAULT_FLAG_VMA_LOCK flag + mm: prevent do_swap_page from handling page faults under VMA lock + mm: prevent userfaults to be handled under per-vma lock + mm: introduce per-VMA lock statistics + x86/mm: try VMA lock-based page fault handling first + arm64/mm: try VMA lock-based page fault handling first + mm/mmap: free vm_area_struct without call_rcu in exit_mmap + mm: separate vma->lock from vm_area_struct + +Signed-off-by: Peter Jung +--- + arch/arm64/Kconfig | 1 + + arch/arm64/mm/fault.c | 36 +++++++ + arch/powerpc/mm/fault.c | 37 +++++++ + arch/powerpc/platforms/powernv/Kconfig | 1 + + arch/powerpc/platforms/pseries/Kconfig | 1 + + arch/s390/Kconfig | 1 + + arch/s390/mm/fault.c | 24 +++++ + arch/x86/Kconfig | 1 + + arch/x86/mm/fault.c | 36 +++++++ + include/linux/mm.h | 127 +++++++++++++++++++++++-- + include/linux/mm_types.h | 30 +++++- + include/linux/mmap_lock.h | 37 ++++--- + include/linux/vm_event_item.h | 6 ++ + include/linux/vmstat.h | 6 ++ + kernel/fork.c | 96 ++++++++++++++++--- + mm/Kconfig | 12 +++ + mm/Kconfig.debug | 6 ++ + mm/init-mm.c | 3 + + mm/internal.h | 2 +- + 
mm/khugepaged.c | 8 ++ + mm/memory.c | 72 +++++++++++++- + mm/mmap.c | 48 +++++++--- + mm/mremap.c | 1 + + mm/rmap.c | 31 +++--- + mm/vmstat.c | 6 ++ + 25 files changed, 567 insertions(+), 62 deletions(-) + +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 1023e896d46b..6f104c829731 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -95,6 +95,7 @@ config ARM64 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PAGE_TABLE_CHECK ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c +index f4cb0f85ccf4..9e0db5c387e3 100644 +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -535,6 +535,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + unsigned long vm_flags; + unsigned int mm_flags = FAULT_FLAG_DEFAULT; + unsigned long addr = untagged_addr(far); ++#ifdef CONFIG_PER_VMA_LOCK ++ struct vm_area_struct *vma; ++#endif + + if (kprobe_page_fault(regs, esr)) + return 0; +@@ -585,6 +588,36 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(mm_flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ ++ vma = lock_vma_under_rcu(mm, addr); ++ if (!vma) ++ goto lock_mmap; ++ ++ if (!(vma->vm_flags & vm_flags)) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, addr & PAGE_MASK, ++ mm_flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ if (!user_mode(regs)) ++ goto no_context; ++ return 0; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ + /* + * As per x86, we may deadlock here. However, since the kernel only + * validly references user space from well defined areas of the code, +@@ -628,6 +661,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + } + mmap_read_unlock(mm); + ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + /* + * Handle the "normal" (no error) case first. + */ +diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c +index af46aa88422b..531177a4ee08 100644 +--- a/arch/powerpc/mm/fault.c ++++ b/arch/powerpc/mm/fault.c +@@ -474,6 +474,40 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, + if (is_exec) + flags |= FAULT_FLAG_INSTRUCTION; + ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; ++ ++ if (unlikely(access_pkey_error(is_write, is_exec, ++ (error_code & DSISR_KEYFAULT), vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ ++ if (unlikely(access_error(is_write, is_exec, vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ ++ if (fault_signal_pending(fault, regs)) ++ return user_mode(regs) ? 
0 : SIGBUS; ++ ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an +@@ -550,6 +584,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, + + mmap_read_unlock(current->mm); + ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + if (unlikely(fault & VM_FAULT_ERROR)) + return mm_fault_error(regs, address, fault); + +diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig +index ae248a161b43..70a46acc70d6 100644 +--- a/arch/powerpc/platforms/powernv/Kconfig ++++ b/arch/powerpc/platforms/powernv/Kconfig +@@ -16,6 +16,7 @@ config PPC_POWERNV + select PPC_DOORBELL + select MMU_NOTIFIER + select FORCE_SMP ++ select ARCH_SUPPORTS_PER_VMA_LOCK + default y + + config OPAL_PRD +diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig +index 21b22bf16ce6..4ebf2ef2845d 100644 +--- a/arch/powerpc/platforms/pseries/Kconfig ++++ b/arch/powerpc/platforms/pseries/Kconfig +@@ -22,6 +22,7 @@ config PPC_PSERIES + select HOTPLUG_CPU + select FORCE_SMP + select SWIOTLB ++ select ARCH_SUPPORTS_PER_VMA_LOCK + default y + + config PARAVIRT +diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig +index 9809c74e1240..548b5b587003 100644 +--- a/arch/s390/Kconfig ++++ b/arch/s390/Kconfig +@@ -120,6 +120,7 @@ config S390 + select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_HUGETLBFS + select ARCH_SUPPORTS_NUMA_BALANCING ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_WANTS_DYNAMIC_TASK_STRUCT +diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c +index a2632fd97d00..b65144c392b0 100644 +--- a/arch/s390/mm/fault.c ++++ b/arch/s390/mm/fault.c +@@ -407,6 +407,30 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) + access = VM_WRITE; + if (access == VM_WRITE) + flags |= FAULT_FLAG_WRITE; ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; ++ if (!(vma->vm_flags & access)) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto out; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ fault = VM_FAULT_SIGNAL; ++ goto out; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ + mmap_read_lock(mm); + + gmap = NULL; +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index a825bf031f49..df21fba77db1 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -27,6 +27,7 @@ config X86_64 + # Options that are inherently 64-bit kernel only: + select ARCH_HAS_GIGANTIC_PAGE + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_USE_CMPXCHG_LOCKREF + select HAVE_ARCH_SOFT_DIRTY + select MODULES_USE_ELF_RELA +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index a498ae1fbe66..e4399983c50c 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -19,6 +19,7 @@ + #include /* faulthandler_disabled() */ + #include /* efi_crash_gracefully_on_page_fault()*/ + #include ++#include /* find_and_lock_vma() */ + + #include /* 
boot_cpu_has, ... */ + #include /* dotraplinkage, ... */ +@@ -1333,6 +1334,38 @@ void do_user_addr_fault(struct pt_regs *regs, + } + #endif + ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; ++ ++ if (unlikely(access_error(error_code, vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ if (!user_mode(regs)) ++ kernelmode_fixup_or_oops(regs, error_code, address, ++ SIGBUS, BUS_ADRERR, ++ ARCH_DEFAULT_PKEY); ++ return; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + /* + * Kernel-mode access to the user address space should only occur + * on well-defined single instructions listed in the exception +@@ -1433,6 +1466,9 @@ void do_user_addr_fault(struct pt_regs *regs, + } + + mmap_read_unlock(mm); ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + if (likely(!(fault & VM_FAULT_ERROR))) + return; + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 1f79667824eb..c4c9de7d1916 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -256,6 +256,8 @@ void setup_initial_init_mm(void *start_code, void *end_code, + struct vm_area_struct *vm_area_alloc(struct mm_struct *); + struct vm_area_struct *vm_area_dup(struct vm_area_struct *); + void vm_area_free(struct vm_area_struct *); ++/* Use only if VMA has no other users */ ++void __vm_area_free(struct vm_area_struct *vma); + + #ifndef CONFIG_MMU + extern struct rb_root nommu_region_tree; +@@ -478,7 +480,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) + { FAULT_FLAG_USER, "USER" }, \ + { FAULT_FLAG_REMOTE, "REMOTE" }, \ + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ +- { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } ++ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ ++ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } + + /* + * vm_fault is filled by the pagefault handler and passed to the vma's +@@ -623,6 +626,117 @@ struct vm_operations_struct { + unsigned long addr); + }; + ++#ifdef CONFIG_PER_VMA_LOCK ++/* ++ * Try to read-lock a vma. The function is allowed to occasionally yield false ++ * locked result to avoid performance overhead, in which case we fall back to ++ * using mmap_lock. The function should never yield false unlocked result. ++ */ ++static inline bool vma_start_read(struct vm_area_struct *vma) ++{ ++ /* Check before locking. A race might cause false locked result. */ ++ if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) ++ return false; ++ ++ if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) ++ return false; ++ ++ /* ++ * Overflow might produce false locked result. ++ * False unlocked result is impossible because we modify and check ++ * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq ++ * modification invalidates all existing locks. 
++ */ ++ if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { ++ up_read(&vma->vm_lock->lock); ++ return false; ++ } ++ return true; ++} ++ ++static inline void vma_end_read(struct vm_area_struct *vma) ++{ ++ rcu_read_lock(); /* keeps vma alive till the end of up_read */ ++ up_read(&vma->vm_lock->lock); ++ rcu_read_unlock(); ++} ++ ++static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) ++{ ++ mmap_assert_write_locked(vma->vm_mm); ++ ++ /* ++ * current task is holding mmap_write_lock, both vma->vm_lock_seq and ++ * mm->mm_lock_seq can't be concurrently modified. ++ */ ++ *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); ++ return (vma->vm_lock_seq == *mm_lock_seq); ++} ++ ++static inline void vma_start_write(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ if (__is_vma_write_locked(vma, &mm_lock_seq)) ++ return; ++ ++ down_write(&vma->vm_lock->lock); ++ vma->vm_lock_seq = mm_lock_seq; ++ up_write(&vma->vm_lock->lock); ++} ++ ++static inline bool vma_try_start_write(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ if (__is_vma_write_locked(vma, &mm_lock_seq)) ++ return true; ++ ++ if (!down_write_trylock(&vma->vm_lock->lock)) ++ return false; ++ ++ vma->vm_lock_seq = mm_lock_seq; ++ up_write(&vma->vm_lock->lock); ++ return true; ++} ++ ++static inline void vma_assert_write_locked(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); ++} ++ ++static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) ++{ ++ /* When detaching vma should be write-locked */ ++ if (detached) ++ vma_assert_write_locked(vma); ++ vma->detached = detached; ++} ++ ++struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, ++ unsigned long address); ++ ++#else /* CONFIG_PER_VMA_LOCK */ ++ ++static inline void vma_init_lock(struct vm_area_struct *vma) {} ++static inline bool vma_start_read(struct vm_area_struct *vma) ++ { return false; } ++static inline void vma_end_read(struct vm_area_struct *vma) {} ++static inline void vma_start_write(struct vm_area_struct *vma) {} ++static inline bool vma_try_start_write(struct vm_area_struct *vma) ++ { return true; } ++static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} ++static inline void vma_mark_detached(struct vm_area_struct *vma, ++ bool detached) {} ++ ++#endif /* CONFIG_PER_VMA_LOCK */ ++ ++/* ++ * WARNING: vma_init does not initialize vma->vm_lock. ++ * Use vm_area_alloc()/vm_area_free() if vma needs locking. 
++ */ + static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) + { + static const struct vm_operations_struct dummy_vm_ops = {}; +@@ -631,6 +745,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) + vma->vm_mm = mm; + vma->vm_ops = &dummy_vm_ops; + INIT_LIST_HEAD(&vma->anon_vma_chain); ++ vma_mark_detached(vma, false); + } + + /* Use when VMA is not part of the VMA tree and needs no locking */ +@@ -644,28 +759,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma, + static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + vm_flags_init(vma, flags); + } + + static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); + } + + static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + ACCESS_PRIVATE(vma, __vm_flags) |= flags; + } + + static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; + } + +@@ -686,7 +801,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma, + static inline void vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + __vm_flags_mod(vma, set, clear); + } + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index a57e6ae78e65..ac4b5df9ba56 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -471,6 +471,10 @@ struct anon_vma_name { + char name[]; + }; + ++struct vma_lock { ++ struct rw_semaphore lock; ++}; ++ + /* + * This struct describes a virtual memory area. There is one of these + * per VM-area/task. A VM area is any part of the process virtual memory +@@ -480,9 +484,16 @@ struct anon_vma_name { + struct vm_area_struct { + /* The first cache line has the info for VMA tree walking. */ + +- unsigned long vm_start; /* Our start address within vm_mm. */ +- unsigned long vm_end; /* The first byte after our end address +- within vm_mm. */ ++ union { ++ struct { ++ /* VMA covers [vm_start; vm_end) addresses within mm */ ++ unsigned long vm_start; ++ unsigned long vm_end; ++ }; ++#ifdef CONFIG_PER_VMA_LOCK ++ struct rcu_head vm_rcu; /* Used for deferred freeing. */ ++#endif ++ }; + + struct mm_struct *vm_mm; /* The address space we belong to. */ + +@@ -501,6 +512,14 @@ struct vm_area_struct { + vm_flags_t __private __vm_flags; + }; + ++#ifdef CONFIG_PER_VMA_LOCK ++ int vm_lock_seq; ++ struct vma_lock *vm_lock; ++ ++ /* Flag to indicate areas detached from the mm->mm_mt tree */ ++ bool detached; ++#endif ++ + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. +@@ -637,6 +656,9 @@ struct mm_struct { + * init_mm.mmlist, and are protected + * by mmlist_lock + */ ++#ifdef CONFIG_PER_VMA_LOCK ++ int mm_lock_seq; ++#endif + + + unsigned long hiwater_rss; /* High-watermark of RSS usage */ +@@ -1037,6 +1059,7 @@ typedef struct { + * mapped after the fault. + * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. + * We should only access orig_pte if this flag set. 
++ * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two +@@ -1074,6 +1097,7 @@ enum fault_flag { + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, + FAULT_FLAG_UNSHARE = 1 << 10, + FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, ++ FAULT_FLAG_VMA_LOCK = 1 << 12, + }; + + typedef unsigned int __bitwise zap_flags_t; +diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h +index 96e113e23d04..aab8f1b28d26 100644 +--- a/include/linux/mmap_lock.h ++++ b/include/linux/mmap_lock.h +@@ -60,6 +60,29 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) + + #endif /* CONFIG_TRACING */ + ++static inline void mmap_assert_locked(struct mm_struct *mm) ++{ ++ lockdep_assert_held(&mm->mmap_lock); ++ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); ++} ++ ++static inline void mmap_assert_write_locked(struct mm_struct *mm) ++{ ++ lockdep_assert_held_write(&mm->mmap_lock); ++ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); ++} ++ ++#ifdef CONFIG_PER_VMA_LOCK ++static inline void vma_end_write_all(struct mm_struct *mm) ++{ ++ mmap_assert_write_locked(mm); ++ /* No races during update due to exclusive mmap_lock being held */ ++ WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1); ++} ++#else ++static inline void vma_end_write_all(struct mm_struct *mm) {} ++#endif ++ + static inline void mmap_init_lock(struct mm_struct *mm) + { + init_rwsem(&mm->mmap_lock); +@@ -102,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm) + static inline void mmap_write_unlock(struct mm_struct *mm) + { + __mmap_lock_trace_released(mm, true); ++ vma_end_write_all(mm); + up_write(&mm->mmap_lock); + } + + static inline void mmap_write_downgrade(struct mm_struct *mm) + { + __mmap_lock_trace_acquire_returned(mm, false, true); ++ vma_end_write_all(mm); + downgrade_write(&mm->mmap_lock); + } + +@@ -150,18 +175,6 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) + up_read_non_owner(&mm->mmap_lock); + } + +-static inline void mmap_assert_locked(struct mm_struct *mm) +-{ +- lockdep_assert_held(&mm->mmap_lock); +- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); +-} +- +-static inline void mmap_assert_write_locked(struct mm_struct *mm) +-{ +- lockdep_assert_held_write(&mm->mmap_lock); +- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); +-} +- + static inline int mmap_lock_is_contended(struct mm_struct *mm) + { + return rwsem_is_contended(&mm->mmap_lock); +diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h +index 7f5d1caf5890..8abfa1240040 100644 +--- a/include/linux/vm_event_item.h ++++ b/include/linux/vm_event_item.h +@@ -149,6 +149,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, + #ifdef CONFIG_X86 + DIRECT_MAP_LEVEL2_SPLIT, + DIRECT_MAP_LEVEL3_SPLIT, ++#endif ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++ VMA_LOCK_SUCCESS, ++ VMA_LOCK_ABORT, ++ VMA_LOCK_RETRY, ++ VMA_LOCK_MISS, + #endif + NR_VM_EVENT_ITEMS + }; +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index 19cf5b6892ce..fed855bae6d8 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -125,6 +125,12 @@ static inline void vm_events_fold_cpu(int cpu) + #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) + #endif + ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++#define count_vm_vma_lock_event(x) count_vm_event(x) ++#else ++#define count_vm_vma_lock_event(x) do {} while (0) ++#endif ++ + #define 
__count_zid_vm_events(item, zid, delta) \ + __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 349945168239..ebd353730887 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -455,13 +455,49 @@ static struct kmem_cache *vm_area_cachep; + /* SLAB cache for mm_struct structures (tsk->mm) */ + static struct kmem_cache *mm_cachep; + ++#ifdef CONFIG_PER_VMA_LOCK ++ ++/* SLAB cache for vm_area_struct.lock */ ++static struct kmem_cache *vma_lock_cachep; ++ ++static bool vma_lock_alloc(struct vm_area_struct *vma) ++{ ++ vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); ++ if (!vma->vm_lock) ++ return false; ++ ++ init_rwsem(&vma->vm_lock->lock); ++ vma->vm_lock_seq = -1; ++ ++ return true; ++} ++ ++static inline void vma_lock_free(struct vm_area_struct *vma) ++{ ++ kmem_cache_free(vma_lock_cachep, vma->vm_lock); ++} ++ ++#else /* CONFIG_PER_VMA_LOCK */ ++ ++static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } ++static inline void vma_lock_free(struct vm_area_struct *vma) {} ++ ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) + { + struct vm_area_struct *vma; + + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); +- if (vma) +- vma_init(vma, mm); ++ if (!vma) ++ return NULL; ++ ++ vma_init(vma, mm); ++ if (!vma_lock_alloc(vma)) { ++ kmem_cache_free(vm_area_cachep, vma); ++ return NULL; ++ } ++ + return vma; + } + +@@ -469,26 +505,54 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) + { + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + +- if (new) { +- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); +- ASSERT_EXCLUSIVE_WRITER(orig->vm_file); +- /* +- * orig->shared.rb may be modified concurrently, but the clone +- * will be reinitialized. +- */ +- data_race(memcpy(new, orig, sizeof(*new))); +- INIT_LIST_HEAD(&new->anon_vma_chain); +- dup_anon_vma_name(orig, new); ++ if (!new) ++ return NULL; ++ ++ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ++ ASSERT_EXCLUSIVE_WRITER(orig->vm_file); ++ /* ++ * orig->shared.rb may be modified concurrently, but the clone ++ * will be reinitialized. ++ */ ++ data_race(memcpy(new, orig, sizeof(*new))); ++ if (!vma_lock_alloc(new)) { ++ kmem_cache_free(vm_area_cachep, new); ++ return NULL; + } ++ INIT_LIST_HEAD(&new->anon_vma_chain); ++ dup_anon_vma_name(orig, new); ++ + return new; + } + +-void vm_area_free(struct vm_area_struct *vma) ++void __vm_area_free(struct vm_area_struct *vma) + { + free_anon_vma_name(vma); ++ vma_lock_free(vma); + kmem_cache_free(vm_area_cachep, vma); + } + ++#ifdef CONFIG_PER_VMA_LOCK ++static void vm_area_free_rcu_cb(struct rcu_head *head) ++{ ++ struct vm_area_struct *vma = container_of(head, struct vm_area_struct, ++ vm_rcu); ++ ++ /* The vma should not be locked while being destroyed. 
*/ ++ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); ++ __vm_area_free(vma); ++} ++#endif ++ ++void vm_area_free(struct vm_area_struct *vma) ++{ ++#ifdef CONFIG_PER_VMA_LOCK ++ call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); ++#else ++ __vm_area_free(vma); ++#endif ++} ++ + static void account_kernel_stack(struct task_struct *tsk, int account) + { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { +@@ -1132,6 +1196,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + seqcount_init(&mm->write_protect_seq); + mmap_init_lock(mm); + INIT_LIST_HEAD(&mm->mmlist); ++#ifdef CONFIG_PER_VMA_LOCK ++ mm->mm_lock_seq = 0; ++#endif + mm_pgtables_bytes_init(mm); + mm->map_count = 0; + mm->locked_vm = 0; +@@ -3074,6 +3141,9 @@ void __init proc_caches_init(void) + NULL); + + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); ++#ifdef CONFIG_PER_VMA_LOCK ++ vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); ++#endif + mmap_init(); + nsproxy_cache_init(); + } +diff --git a/mm/Kconfig b/mm/Kconfig +index cf2e47030fe8..459af2123189 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1202,6 +1202,18 @@ config LRU_GEN_STATS + This option has a per-memcg and per-node memory overhead. + # } + ++config ARCH_SUPPORTS_PER_VMA_LOCK ++ def_bool n ++ ++config PER_VMA_LOCK ++ def_bool y ++ depends on ARCH_SUPPORTS_PER_VMA_LOCK && MMU && SMP ++ help ++ Allow per-vma locking during page fault handling. ++ ++ This feature allows locking each virtual memory area separately when ++ handling page faults instead of taking mmap_lock. ++ + source "mm/damon/Kconfig" + + endmenu +diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug +index c3547a373c9c..4965a7333a3f 100644 +--- a/mm/Kconfig.debug ++++ b/mm/Kconfig.debug +@@ -279,3 +279,9 @@ config DEBUG_KMEMLEAK_AUTO_SCAN + + If unsure, say Y. + ++config PER_VMA_LOCK_STATS ++ bool "Statistics for per-vma locks" ++ depends on PER_VMA_LOCK ++ default y ++ help ++ Statistics for per-vma locks. 
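
[Editor's note] The arch hunks above (arm64, powerpc, s390, x86) all instantiate the same fault-path pattern; a condensed sketch follows, with the arch-specific permission check reduced to a hypothetical access_not_permitted() placeholder and the signal-pending handling omitted:

#ifdef CONFIG_PER_VMA_LOCK
	if (!(flags & FAULT_FLAG_USER))
		goto lock_mmap;

	vma = lock_vma_under_rcu(mm, address);
	if (!vma)
		goto lock_mmap;		/* not found or not supported: VMA_LOCK_ABORT */

	if (access_not_permitted(vma)) {	/* placeholder for the arch check */
		vma_end_read(vma);
		goto lock_mmap;
	}

	fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs);
	vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto done;		/* handled without touching mmap_lock */
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);
	/* e.g. a swap fault bails out with VM_FAULT_RETRY; redo under mmap_lock */
lock_mmap:
#endif /* CONFIG_PER_VMA_LOCK */
	/* ... existing mmap_read_lock() slow path ... */

With CONFIG_PER_VMA_LOCK_STATS=y the VMA_LOCK_* event counters should be visible in /proc/vmstat (the mm/vmstat.c change listed in the diffstat above adds the corresponding strings), which makes it easy to observe how often the lock-free path succeeds versus falls back.
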
+diff --git a/mm/init-mm.c b/mm/init-mm.c +index c9327abb771c..33269314e060 100644 +--- a/mm/init-mm.c ++++ b/mm/init-mm.c +@@ -37,6 +37,9 @@ struct mm_struct init_mm = { + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), ++#ifdef CONFIG_PER_VMA_LOCK ++ .mm_lock_seq = 0, ++#endif + .user_ns = &init_user_ns, + .cpu_bitmap = CPU_BITS_NONE, + #ifdef CONFIG_IOMMU_SVA +diff --git a/mm/internal.h b/mm/internal.h +index 7920a8b7982e..0c455d6e4e3e 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -105,7 +105,7 @@ void folio_activate(struct folio *folio); + + void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long floor, +- unsigned long ceiling); ++ unsigned long ceiling, bool mm_wr_locked); + void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); + + struct zap_details; +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 0ec69b96b497..37a52a0ec9da 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1053,6 +1053,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + if (result != SCAN_SUCCEED) + goto out_up_write; + ++ vma_start_write(vma); + anon_vma_lock_write(vma->anon_vma); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, +@@ -1516,6 +1517,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + goto drop_hpage; + } + ++ /* Lock the vma before taking i_mmap and page table locks */ ++ vma_start_write(vma); ++ + /* + * We need to lock the mapping so that from here on, only GUP-fast and + * hardware page walks can access the parts of the page tables that +@@ -1693,6 +1697,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + result = SCAN_PTE_MAPPED_HUGEPAGE; + if ((cc->is_khugepaged || is_target) && + mmap_write_trylock(mm)) { ++ /* trylock for the same lock inversion as above */ ++ if (!vma_try_start_write(vma)) ++ goto unlock_next; ++ + /* + * Re-check whether we have an ->anon_vma, because + * collapse_and_free_pmd() requires that either no +diff --git a/mm/memory.c b/mm/memory.c +index 01a23ad48a04..c76183ced67a 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -348,7 +348,7 @@ void free_pgd_range(struct mmu_gather *tlb, + + void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *vma, unsigned long floor, +- unsigned long ceiling) ++ unsigned long ceiling, bool mm_wr_locked) + { + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); + +@@ -366,6 +366,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables + */ ++ if (mm_wr_locked) ++ vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma(vma); + +@@ -380,6 +382,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + && !is_vm_hugetlb_page(next)) { + vma = next; + next = mas_find(&mas, ceiling - 1); ++ if (mm_wr_locked) ++ vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma(vma); + } +@@ -3698,6 +3702,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) + if (!pte_unmap_same(vmf)) + goto out; + ++ if (vmf->flags & FAULT_FLAG_VMA_LOCK) { ++ ret = VM_FAULT_RETRY; ++ goto out; ++ } ++ + entry = pte_to_swp_entry(vmf->orig_pte); + if (unlikely(non_swap_entry(entry))) { + if (is_migration_entry(entry)) { +@@ -5230,6 +5239,67 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, 
+ } + EXPORT_SYMBOL_GPL(handle_mm_fault); + ++#ifdef CONFIG_PER_VMA_LOCK ++/* ++ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be ++ * stable and not isolated. If the VMA is not found or is being modified the ++ * function returns NULL. ++ */ ++struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, ++ unsigned long address) ++{ ++ MA_STATE(mas, &mm->mm_mt, address, address); ++ struct vm_area_struct *vma; ++ ++ rcu_read_lock(); ++retry: ++ vma = mas_walk(&mas); ++ if (!vma) ++ goto inval; ++ ++ /* Only anonymous vmas are supported for now */ ++ if (!vma_is_anonymous(vma)) ++ goto inval; ++ ++ /* find_mergeable_anon_vma uses adjacent vmas which are not locked */ ++ if (!vma->anon_vma) ++ goto inval; ++ ++ if (!vma_start_read(vma)) ++ goto inval; ++ ++ /* ++ * Due to the possibility of userfault handler dropping mmap_lock, avoid ++ * it for now and fall back to page fault handling under mmap_lock. ++ */ ++ if (userfaultfd_armed(vma)) { ++ vma_end_read(vma); ++ goto inval; ++ } ++ ++ /* Check since vm_start/vm_end might change before we lock the VMA */ ++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { ++ vma_end_read(vma); ++ goto inval; ++ } ++ ++ /* Check if the VMA got isolated after we found it */ ++ if (vma->detached) { ++ vma_end_read(vma); ++ count_vm_vma_lock_event(VMA_LOCK_MISS); ++ /* The area was replaced with another one */ ++ goto retry; ++ } ++ ++ rcu_read_unlock(); ++ return vma; ++inval: ++ rcu_read_unlock(); ++ count_vm_vma_lock_event(VMA_LOCK_ABORT); ++ return NULL; ++} ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + #ifndef __PAGETABLE_P4D_FOLDED + /* + * Allocate p4d page table. +diff --git a/mm/mmap.c b/mm/mmap.c +index d5475fbf5729..cbac45aa39ae 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -133,7 +133,7 @@ void unlink_file_vma(struct vm_area_struct *vma) + /* + * Close a vm structure and free it. 
+ */
+-static void remove_vma(struct vm_area_struct *vma)
++static void remove_vma(struct vm_area_struct *vma, bool unreachable)
+ {
+ might_sleep();
+ if (vma->vm_ops && vma->vm_ops->close)
+@@ -141,7 +141,10 @@ static void remove_vma(struct vm_area_struct *vma)
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ mpol_put(vma_policy(vma));
+- vm_area_free(vma);
++ if (unreachable)
++ __vm_area_free(vma);
++ else
++ vm_area_free(vma);
+ }
+ 
+ static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
+@@ -502,6 +505,15 @@ static inline void init_vma_prep(struct vma_prepare *vp,
+ */
+ static inline void vma_prepare(struct vma_prepare *vp)
+ {
++ vma_start_write(vp->vma);
++ if (vp->adj_next)
++ vma_start_write(vp->adj_next);
++ /* vp->insert is always a newly created VMA, no need for locking */
++ if (vp->remove)
++ vma_start_write(vp->remove);
++ if (vp->remove2)
++ vma_start_write(vp->remove2);
++
+ if (vp->file) {
+ uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
+ 
+@@ -590,6 +602,7 @@ static inline void vma_complete(struct vma_prepare *vp,
+ 
+ if (vp->remove) {
+ again:
++ vma_mark_detached(vp->remove, true);
+ if (vp->file) {
+ uprobe_munmap(vp->remove, vp->remove->vm_start,
+ vp->remove->vm_end);
+@@ -683,12 +696,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ if (vma_iter_prealloc(vmi))
+ goto nomem;
+ 
++ vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, start, end, 0);
+ /* VMA iterator points to previous, so set to start if necessary */
+ if (vma_iter_addr(vmi) != start)
+ vma_iter_set(vmi, start);
+ 
+- vma_prepare(&vp);
+ vma->vm_start = start;
+ vma->vm_end = end;
+ vma->vm_pgoff = pgoff;
+@@ -723,8 +736,8 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ return -ENOMEM;
+ 
+ init_vma_prep(&vp, vma);
+- vma_adjust_trans_huge(vma, start, end, 0);
+ vma_prepare(&vp);
++ vma_adjust_trans_huge(vma, start, end, 0);
+ 
+ if (vma->vm_start < start)
+ vma_iter_clear(vmi, vma->vm_start, start);
+@@ -994,12 +1007,12 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm,
+ if (vma_iter_prealloc(vmi))
+ return NULL;
+ 
+- vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next);
+ init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
+ VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
+ vp.anon_vma != adjust->anon_vma);
+ 
+ vma_prepare(&vp);
++ vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next);
+ if (vma_start < vma->vm_start || vma_end > vma->vm_end)
+ vma_expanded = true;
+ 
+@@ -2157,7 +2170,7 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
+ if (vma->vm_flags & VM_ACCOUNT)
+ nr_accounted += nrpages;
+ vm_stat_account(mm, vma->vm_flags, -nrpages);
+- remove_vma(vma);
++ remove_vma(vma, false);
+ }
+ vm_unacct_memory(nr_accounted);
+ validate_mm(mm);
+@@ -2180,7 +2193,8 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
+ update_hiwater_rss(mm);
+ unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked);
+ free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+- next ? next->vm_start : USER_PGTABLES_CEILING);
++ next ? next->vm_start : USER_PGTABLES_CEILING,
++ mm_wr_locked);
+ tlb_finish_mmu(&tlb);
+ }
+ 
+@@ -2236,10 +2250,10 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ if (new->vm_ops && new->vm_ops->open)
+ new->vm_ops->open(new);
+ 
+- vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+ init_vma_prep(&vp, vma);
+ vp.insert = new;
+ vma_prepare(&vp);
++ vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+ 
+ if (new_below) {
+ vma->vm_start = addr;
+@@ -2283,10 +2297,12 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ static inline int munmap_sidetree(struct vm_area_struct *vma,
+ struct ma_state *mas_detach)
+ {
++ vma_start_write(vma);
+ mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1);
+ if (mas_store_gfp(mas_detach, vma, GFP_KERNEL))
+ return -ENOMEM;
+ 
++ vma_mark_detached(vma, true);
+ if (vma->vm_flags & VM_LOCKED)
+ vma->vm_mm->locked_vm -= vma_pages(vma);
+ 
+@@ -2942,9 +2958,9 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ if (vma_iter_prealloc(vmi))
+ goto unacct_fail;
+ 
+- vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
+ init_vma_prep(&vp, vma);
+ vma_prepare(&vp);
++ vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
+ vma->vm_end = addr + len;
+ vm_flags_set(vma, VM_SOFTDIRTY);
+ vma_iter_store(vmi, vma);
+@@ -3077,7 +3093,7 @@ void exit_mmap(struct mm_struct *mm)
+ mmap_write_lock(mm);
+ mt_clear_in_rcu(&mm->mm_mt);
+ free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
+- USER_PGTABLES_CEILING);
++ USER_PGTABLES_CEILING, true);
+ tlb_finish_mmu(&tlb);
+ 
+ /*
+@@ -3088,7 +3104,7 @@ void exit_mmap(struct mm_struct *mm)
+ do {
+ if (vma->vm_flags & VM_ACCOUNT)
+ nr_accounted += vma_pages(vma);
+- remove_vma(vma);
++ remove_vma(vma, true);
+ count++;
+ cond_resched();
+ } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL);
+@@ -3211,6 +3227,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
+ get_file(new_vma->vm_file);
+ if (new_vma->vm_ops && new_vma->vm_ops->open)
+ new_vma->vm_ops->open(new_vma);
++ vma_start_write(new_vma);
+ if (vma_link(mm, new_vma))
+ goto out_vma_link;
+ *need_rmap_locks = false;
+@@ -3505,6 +3522,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
+ * of mm/rmap.c:
+ * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
+ * hugetlb mapping);
++ * - all vmas marked locked
+ * - all i_mmap_rwsem locks;
+ * - all anon_vma->rwseml
+ *
+@@ -3527,6 +3545,13 @@ int mm_take_all_locks(struct mm_struct *mm)
+ 
+ mutex_lock(&mm_all_locks_mutex);
+ 
++ mas_for_each(&mas, vma, ULONG_MAX) {
++ if (signal_pending(current))
++ goto out_unlock;
++ vma_start_write(vma);
++ }
++
++ mas_set(&mas, 0);
+ mas_for_each(&mas, vma, ULONG_MAX) {
+ if (signal_pending(current))
+ goto out_unlock;
+@@ -3616,6 +3641,7 @@ void mm_drop_all_locks(struct mm_struct *mm)
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_unlock_mapping(vma->vm_file->f_mapping);
+ }
++ vma_end_write_all(mm);
+ 
+ mutex_unlock(&mm_all_locks_mutex);
+ }
+diff --git a/mm/mremap.c b/mm/mremap.c
+index 411a85682b58..dd541e59edda 100644
+--- a/mm/mremap.c
++++ b/mm/mremap.c
+@@ -623,6 +623,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
+ return -ENOMEM;
+ }
+ 
++ vma_start_write(vma);
+ new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+ new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
+ &need_rmap_locks);
+diff --git a/mm/rmap.c b/mm/rmap.c
+index 8632e02661ac..cfdaa56cad3e 100644
+--- a/mm/rmap.c
++++ b/mm/rmap.c +@@ -25,21 +25,22 @@ + * mapping->invalidate_lock (in filemap_fault) + * page->flags PG_locked (lock_page) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) +- * mapping->i_mmap_rwsem +- * anon_vma->rwsem +- * mm->page_table_lock or pte_lock +- * swap_lock (in swap_duplicate, swap_info_get) +- * mmlist_lock (in mmput, drain_mmlist and others) +- * mapping->private_lock (in block_dirty_folio) +- * folio_lock_memcg move_lock (in block_dirty_folio) +- * i_pages lock (widely used) +- * lruvec->lru_lock (in folio_lruvec_lock_irq) +- * inode->i_lock (in set_page_dirty's __mark_inode_dirty) +- * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) +- * sb_lock (within inode_lock in fs/fs-writeback.c) +- * i_pages lock (widely used, in set_page_dirty, +- * in arch-dependent flush_dcache_mmap_lock, +- * within bdi.wb->list_lock in __sync_single_inode) ++ * vma_start_write ++ * mapping->i_mmap_rwsem ++ * anon_vma->rwsem ++ * mm->page_table_lock or pte_lock ++ * swap_lock (in swap_duplicate, swap_info_get) ++ * mmlist_lock (in mmput, drain_mmlist and others) ++ * mapping->private_lock (in block_dirty_folio) ++ * folio_lock_memcg move_lock (in block_dirty_folio) ++ * i_pages lock (widely used) ++ * lruvec->lru_lock (in folio_lruvec_lock_irq) ++ * inode->i_lock (in set_page_dirty's __mark_inode_dirty) ++ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) ++ * sb_lock (within inode_lock in fs/fs-writeback.c) ++ * i_pages lock (widely used, in set_page_dirty, ++ * in arch-dependent flush_dcache_mmap_lock, ++ * within bdi.wb->list_lock in __sync_single_inode) + * + * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) + * ->tasklist_lock +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 1ea6a5ce1c41..4f1089a1860e 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1399,6 +1399,12 @@ const char * const vmstat_text[] = { + "direct_map_level2_splits", + "direct_map_level3_splits", + #endif ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++ "vma_lock_success", ++ "vma_lock_abort", ++ "vma_lock_retry", ++ "vma_lock_miss", ++#endif + #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ + }; + #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ +-- +2.40.1 + +From 56fd0f1397471be0786d1f696598173b9ebb9a35 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 25 Apr 2023 17:19:06 +0200 -Subject: [PATCH 7/8] sched +Subject: [PATCH 09/10] sched Signed-off-by: Peter Jung --- @@ -11142,10 +27576,10 @@ index 3e8df6d31c1e..7331d436ebc4 100644 -- 2.40.1 -From 27d4dbfc6971caf5627a8248adef49f8d15340b4 Mon Sep 17 00:00:00 2001 +From fed8faa97161f725528a30330a22a3ba5b8e9965 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sat, 22 Apr 2023 11:46:46 +0200 -Subject: [PATCH 8/8] zstd: import 1.5.5 +Subject: [PATCH 10/10] zstd: import 1.5.5 Signed-off-by: Peter Jung ---