From ee05cd3fcec3b8605c7cb56f8d5c373c34497213 Mon Sep 17 00:00:00 2001 From: ferrreo Date: Fri, 28 Apr 2023 19:51:43 +0100 Subject: [PATCH] Update cachy patchset --- patches/0001-cachy-all.patch | 16604 ++++++++++++++++++++++++++++++++- 1 file changed, 16519 insertions(+), 85 deletions(-) diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index b307560..f7a6979 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,7 +1,7 @@ -From a2522409b71cfd3a4f7fc95effca4c322adaf7b0 Mon Sep 17 00:00:00 2001 +From 0ca55b20120a052c587868cb3199edaa41634a3b Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 6 Mar 2023 18:43:03 +0100 -Subject: [PATCH 1/8] bbr2 +Subject: [PATCH 01/10] bbr2 Signed-off-by: Peter Jung --- @@ -3283,10 +3283,10 @@ index cb79127f45c3..70e4de876a7f 100644 -- 2.40.1 -From 0d9e557b60746641c464bab65aae86fd78cb9024 Mon Sep 17 00:00:00 2001 +From 0927bc0b168ee599f356a757df60102be68472dc Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 17 Apr 2023 18:21:50 +0200 -Subject: [PATCH 2/8] bfq +Subject: [PATCH 02/10] bfq Signed-off-by: Peter Jung --- @@ -3329,79 +3329,81 @@ index d9ed3108c17a..66146bbcd4af 100644 -- 2.40.1 -From 7b6e9ae435973f69a18f51d226879b128fa6026f Mon Sep 17 00:00:00 2001 +From 978269efc945dfd3e330da87db88188fab9b92c1 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Wed, 26 Apr 2023 22:04:07 +0200 -Subject: [PATCH 3/8] cachy +Date: Fri, 28 Apr 2023 19:58:48 +0200 +Subject: [PATCH 03/10] cachy Signed-off-by: Peter Jung --- - .gitignore | 1 + - .../admin-guide/kernel-parameters.txt | 12 + - Documentation/dontdiff | 1 + - Makefile | 8 +- - arch/arc/configs/axs101_defconfig | 1 + - arch/arc/configs/axs103_defconfig | 1 + - arch/arc/configs/axs103_smp_defconfig | 1 + - arch/arc/configs/haps_hs_defconfig | 1 + - arch/arc/configs/haps_hs_smp_defconfig | 1 + - arch/arc/configs/hsdk_defconfig | 1 + - arch/arc/configs/nsim_700_defconfig | 1 + - arch/arc/configs/nsimosci_defconfig | 1 + - arch/arc/configs/nsimosci_hs_defconfig | 1 + - arch/arc/configs/nsimosci_hs_smp_defconfig | 1 + - arch/arc/configs/tb10x_defconfig | 1 + - arch/arc/configs/vdk_hs38_defconfig | 1 + - arch/arc/configs/vdk_hs38_smp_defconfig | 1 + - arch/x86/Kconfig.cpu | 416 ++++++++++- - arch/x86/Makefile | 45 +- - arch/x86/Makefile.postlink | 41 ++ - arch/x86/boot/compressed/.gitignore | 1 - - arch/x86/boot/compressed/Makefile | 10 +- - arch/x86/include/asm/pci.h | 6 + - arch/x86/include/asm/vermagic.h | 72 ++ - arch/x86/pci/common.c | 7 +- - drivers/Makefile | 15 +- - drivers/ata/ahci.c | 23 +- - drivers/cpufreq/Kconfig.x86 | 2 - - drivers/cpufreq/intel_pstate.c | 2 + - drivers/i2c/busses/Kconfig | 9 + - drivers/i2c/busses/Makefile | 1 + - drivers/i2c/busses/i2c-nct6775.c | 647 ++++++++++++++++++ - drivers/i2c/busses/i2c-piix4.c | 4 +- - drivers/md/dm-crypt.c | 5 + - drivers/pci/controller/Makefile | 6 + - drivers/pci/controller/intel-nvme-remap.c | 462 +++++++++++++ - drivers/pci/quirks.c | 101 +++ - drivers/platform/x86/Kconfig | 14 + - drivers/platform/x86/Makefile | 3 + - drivers/platform/x86/steamdeck.c | 523 ++++++++++++++ - include/linux/pagemap.h | 2 +- - include/linux/user_namespace.h | 4 + - include/net/netns/ipv4.h | 1 + - include/trace/events/tcp.h | 7 + - init/Kconfig | 39 ++ - kernel/Kconfig.hz | 24 + - kernel/fork.c | 14 + - kernel/module/Kconfig | 25 + - kernel/sched/fair.c | 20 +- - kernel/sysctl.c | 12 + - kernel/user_namespace.c | 7 + - mm/Kconfig | 2 +- - mm/compaction.c | 4 + - mm/page-writeback.c | 8 + - mm/swap.c | 5 + - 
mm/vmpressure.c | 4 + - mm/vmscan.c | 8 + - net/ipv4/sysctl_net_ipv4.c | 7 + - net/ipv4/tcp_input.c | 36 + - net/ipv4/tcp_ipv4.c | 2 + - scripts/Makefile.lib | 13 +- - scripts/Makefile.modinst | 7 +- - 62 files changed, 2637 insertions(+), 64 deletions(-) + .gitignore | 1 + + .../admin-guide/kernel-parameters.txt | 12 + + Documentation/dontdiff | 1 + + Makefile | 8 +- + arch/arc/configs/axs101_defconfig | 1 + + arch/arc/configs/axs103_defconfig | 1 + + arch/arc/configs/axs103_smp_defconfig | 1 + + arch/arc/configs/haps_hs_defconfig | 1 + + arch/arc/configs/haps_hs_smp_defconfig | 1 + + arch/arc/configs/hsdk_defconfig | 1 + + arch/arc/configs/nsim_700_defconfig | 1 + + arch/arc/configs/nsimosci_defconfig | 1 + + arch/arc/configs/nsimosci_hs_defconfig | 1 + + arch/arc/configs/nsimosci_hs_smp_defconfig | 1 + + arch/arc/configs/tb10x_defconfig | 1 + + arch/arc/configs/vdk_hs38_defconfig | 1 + + arch/arc/configs/vdk_hs38_smp_defconfig | 1 + + arch/x86/Kconfig.cpu | 416 ++- + arch/x86/Makefile | 45 +- + arch/x86/Makefile.postlink | 41 + + arch/x86/boot/compressed/.gitignore | 1 - + arch/x86/boot/compressed/Makefile | 10 +- + arch/x86/include/asm/pci.h | 6 + + arch/x86/include/asm/vermagic.h | 72 + + arch/x86/pci/common.c | 7 +- + drivers/Makefile | 15 +- + drivers/ata/ahci.c | 23 +- + drivers/cpufreq/Kconfig.x86 | 2 - + drivers/cpufreq/intel_pstate.c | 2 + + drivers/i2c/busses/Kconfig | 9 + + drivers/i2c/busses/Makefile | 1 + + drivers/i2c/busses/i2c-nct6775.c | 647 ++++ + drivers/i2c/busses/i2c-piix4.c | 4 +- + drivers/md/dm-crypt.c | 5 + + drivers/pci/controller/Makefile | 6 + + drivers/pci/controller/intel-nvme-remap.c | 462 +++ + drivers/pci/quirks.c | 101 + + drivers/platform/x86/Kconfig | 24 + + drivers/platform/x86/Makefile | 4 + + drivers/platform/x86/legion-laptop.c | 2783 +++++++++++++++++ + drivers/platform/x86/steamdeck.c | 523 ++++ + include/linux/pagemap.h | 2 +- + include/linux/user_namespace.h | 4 + + include/net/netns/ipv4.h | 1 + + include/trace/events/tcp.h | 7 + + init/Kconfig | 39 + + kernel/Kconfig.hz | 24 + + kernel/fork.c | 14 + + kernel/module/Kconfig | 25 + + kernel/sched/fair.c | 20 +- + kernel/sysctl.c | 12 + + kernel/user_namespace.c | 7 + + mm/Kconfig | 2 +- + mm/compaction.c | 4 + + mm/page-writeback.c | 8 + + mm/swap.c | 5 + + mm/vmpressure.c | 4 + + mm/vmscan.c | 8 + + net/ipv4/sysctl_net_ipv4.c | 7 + + net/ipv4/tcp_input.c | 36 + + net/ipv4/tcp_ipv4.c | 2 + + scripts/Makefile.lib | 13 +- + scripts/Makefile.modinst | 7 +- + 63 files changed, 5431 insertions(+), 64 deletions(-) create mode 100644 arch/x86/Makefile.postlink create mode 100644 drivers/i2c/busses/i2c-nct6775.c create mode 100644 drivers/pci/controller/intel-nvme-remap.c + create mode 100644 drivers/platform/x86/legion-laptop.c create mode 100644 drivers/platform/x86/steamdeck.c diff --git a/.gitignore b/.gitignore @@ -5889,10 +5891,27 @@ index 44cab813bf95..25edf55de985 100644 }; diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig -index 4a01b315e0a9..e9ddf76b8b57 100644 +index 4a01b315e0a9..e4a6c31a80df 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig -@@ -1099,6 +1099,20 @@ config WINMATE_FM07_KEYS +@@ -641,6 +641,16 @@ config THINKPAD_LMI + To compile this driver as a module, choose M here: the module will + be called think-lmi. 
+ ++config LEGION_LAPTOP ++ tristate "Lenovo Legion Laptop Extras" ++ depends on ACPI ++ depends on ACPI_WMI || ACPI_WMI = n ++ depends on HWMON || HWMON = n ++ select ACPI_PLATFORM_PROFILE ++ help ++ This is a driver for Lenovo Legion laptops and contains drivers for ++ hotkey, fan control, and power mode. ++ + source "drivers/platform/x86/intel/Kconfig" + + config MSI_LAPTOP +@@ -1099,6 +1109,20 @@ config WINMATE_FM07_KEYS buttons below the display. This module adds an input device that delivers key events when these buttons are pressed. @@ -5914,16 +5933,2813 @@ index 4a01b315e0a9..e9ddf76b8b57 100644 config P2SB diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile -index 1d3d1b02541b..75b30a3face9 100644 +index 1d3d1b02541b..fde9a683103e 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile -@@ -134,3 +134,6 @@ obj-$(CONFIG_SIEMENS_SIMATIC_IPC) += simatic-ipc.o +@@ -66,6 +66,7 @@ obj-$(CONFIG_IDEAPAD_LAPTOP) += ideapad-laptop.o + obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o + obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o + obj-$(CONFIG_THINKPAD_LMI) += think-lmi.o ++obj-$(CONFIG_LEGION_LAPTOP) += legion-laptop.o + + # Intel + obj-y += intel/ +@@ -134,3 +135,6 @@ obj-$(CONFIG_SIEMENS_SIMATIC_IPC) += simatic-ipc.o # Winmate obj-$(CONFIG_WINMATE_FM07_KEYS) += winmate-fm07-keys.o + +# Steam Deck +obj-$(CONFIG_STEAMDECK) += steamdeck.o +diff --git a/drivers/platform/x86/legion-laptop.c b/drivers/platform/x86/legion-laptop.c +new file mode 100644 +index 000000000000..d1268d239cc5 +--- /dev/null ++++ b/drivers/platform/x86/legion-laptop.c +@@ -0,0 +1,2783 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * legion-laptop.c - Extra Lenovo Legion laptop support, in ++ * particular for fan curve control and power mode. ++ * ++ * Copyright (C) 2022 johnfan ++ * ++ * ++ * This driver might work on other Lenovo Legion models. If you ++ * want to try it you can pass force=1 as argument ++ * to the module which will force it to load even when the DMI ++ * data doesn't match the model AND FIRMWARE. ++ * ++ * Support for other hardware of this model is already partially ++ * provided by the module ideapd-laptop. ++ * ++ * The development page for this driver is located at ++ * https://github.com/johnfanv2/LenovoLegionLinux ++ * ++ * This driver exports the files: ++ * - /sys/kernel/debug/legion/fancurve (ro) ++ * The fan curve in the form stored in the firmware in an ++ * human readable table. ++ * ++ * - /sys/module/legion_laptop/drivers/platform\:legion/PNP0C09\:00/powermode (rw) ++ * 0: balanced mode (white) ++ * 1: performance mode (red) ++ * 2: quiet mode (blue) ++ * ?: custom mode (pink) ++ * ++ * NOTE: Writing to this will load the default fan curve from ++ * the firmware for this mode, so the fan curve might ++ * have to be reconfigured if needed. ++ * ++ * It implements the usual hwmon interface to monitor fan speed and temmperature ++ * and allows to set the fan curve inside the firware. ++ * ++ * - /sys/class/hwmon/X/fan1_input or /sys/class/hwmon/X/fan2_input (ro) ++ * Current fan speed of fan1/fan2. ++ * - /sys/class/hwmon/X/temp1_input (ro) ++ * - /sys/class/hwmon/X/temp2_input (ro) ++ * - /sys/class/hwmon/X/temp3_input (ro) ++ * Temperature (Celsius) of CPU, GPU, and IC used for fan control. 
++ * - /sys/class/hwmon/X/pwmY_auto_pointZ_pwm (rw) ++ * PWM (0-255) of the fan at the Y-level in the fan curve ++ * - /sys/class/hwmon/X/pwmY_auto_pointZ_temp (rw) ++ * upper temperature of tempZ (CPU, GPU, or IC) at the Y-level in the fan curve ++ * - /sys/class/hwmon/X/pwmY_auto_pointZ_temp_hyst (rw) ++ * hysteris (CPU, GPU, or IC) at the Y-level in the fan curve. The lower ++ * temperatue of the level is the upper temperature minus the hysteris ++ * ++ * ++ * Credits for reverse engineering the firmware to: ++ * - David Woodhouse: heavily inspired by lenovo_laptop.c ++ * - Luke Cama: Windows version "LegionFanControl" ++ * - SmokelessCPU: reverse engineering of custom registers in EC ++ * and commincation method with EC via ports ++ * - 0x1F9F1: additional reverse engineering for complete fan curve ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("johnfan"); ++MODULE_DESCRIPTION("Lenovo Legion laptop extras"); ++ ++static bool force; ++module_param(force, bool, 0440); ++MODULE_PARM_DESC( ++ force, ++ "Force loading this module even if model or BIOS does not match."); ++ ++static bool ec_readonly; ++module_param(ec_readonly, bool, 0440); ++MODULE_PARM_DESC( ++ ec_readonly, ++ "Only read from embedded controller but do not write or change settings."); ++ ++#define LEGIONFEATURES \ ++ "fancurve powermode platformprofile platformprofilenotify minifancurve" ++ ++//Size of fancurve stored in embedded controller ++#define MAXFANCURVESIZE 10 ++ ++#define LEGION_DRVR_SHORTNAME "legion" ++#define LEGION_HWMON_NAME LEGION_DRVR_SHORTNAME "_hwmon" ++ ++/* =============================== */ ++/* Embedded Controller Description */ ++/* =============================== */ ++ ++/* The configuration and registers to access the embedded controller ++ * depending on different the version of the software on the ++ * embedded controller or and the BIOS/UEFI firmware. ++ * ++ * To control fan curve in the embedded controller (EC) one has to ++ * write to its "RAM". There are different possibilities: ++ * - EC RAM is memory mapped (write to it with ioremap) ++ * - access EC RAM via ported mapped IO (outb/inb) ++ * - access EC RAM via ACPI methods. It is only possible to write ++ * to part of it (first 0xFF bytes?) ++ * ++ * In later models the firmware directly exposes ACPI methods to ++ * set the fan curve direclty, without writing to EC RAM. This ++ * is done inside the ACPI method. ++ */ ++ ++/** ++ * Offsets for interseting values inside the EC RAM (0 = start of ++ * EC RAM. These might change depending on the software inside of ++ * the EC, which can be updated by a BIOS update from Lenovo. 
++ */ ++// TODO: same order as in initialization ++struct ec_register_offsets { ++ // Super I/O Configuration Registers ++ // 7.15 General Control (GCTRL) ++ // General Control (GCTRL) ++ // (see EC Interface Registers and 6.2 Plug and Play Configuration (PNPCFG)) in datasheet ++ // note: these are in two places saved ++ // in EC Interface Registers and in super io configuraion registers ++ // Chip ID ++ u16 ECHIPID1; ++ u16 ECHIPID2; ++ // Chip Version ++ u16 ECHIPVER; ++ u16 ECDEBUG; ++ ++ // Lenovo Custom OEM extension ++ // Firmware of ITE can be extended by ++ // custom program using its own "variables" ++ // These are the offsets to these "variables" ++ u16 EXT_FAN_CUR_POINT; ++ u16 EXT_FAN_POINTS_SIZE; ++ u16 EXT_FAN1_BASE; ++ u16 EXT_FAN2_BASE; ++ u16 EXT_FAN_ACC_BASE; ++ u16 EXT_FAN_DEC_BASE; ++ u16 EXT_CPU_TEMP; ++ u16 EXT_CPU_TEMP_HYST; ++ u16 EXT_GPU_TEMP; ++ u16 EXT_GPU_TEMP_HYST; ++ u16 EXT_VRM_TEMP; ++ u16 EXT_VRM_TEMP_HYST; ++ u16 EXT_FAN1_RPM_LSB; ++ u16 EXT_FAN1_RPM_MSB; ++ u16 EXT_FAN2_RPM_LSB; ++ u16 EXT_FAN2_RPM_MSB; ++ u16 EXT_FAN1_TARGET_RPM; ++ u16 EXT_FAN2_TARGET_RPM; ++ u16 EXT_POWERMODE; ++ u16 EXT_MINIFANCURVE_ON_COOL; ++ // values ++ // 0x04: enable mini fan curve if very long on cool level ++ // - this might be due to potential temp failure ++ // - or just because really so cool ++ // 0xA0: disable it ++ u16 EXT_LOCKFANCONTROLLER; ++ u16 EXT_MAXIMUMFANSPEED; ++ u16 EXT_WHITE_KEYBOARD_BACKLIGHT; ++ u16 EXT_IC_TEMP_INPUT; ++ u16 EXT_CPU_TEMP_INPUT; ++ u16 EXT_GPU_TEMP_INPUT; ++}; ++ ++struct model_config { ++ const struct ec_register_offsets *registers; ++ bool check_embedded_controller_id; ++ u16 embedded_controller_id; ++ ++ // first addr in EC we access/scan ++ phys_addr_t memoryio_physical_ec_start; ++ size_t memoryio_size; ++ ++ // TODO: maybe use bitfield ++ bool has_minifancurve; ++}; ++ ++/* =================================== */ ++/* Coinfiguration for different models */ ++/* =================================== */ ++ ++// Idea by SmokelesssCPU (modified) ++// - all default names and register addresses are supported by datasheet ++// - register addresses for custom firmware by SmokelesssCPU ++static const struct ec_register_offsets ec_register_offsets_v0 = { ++ .ECHIPID1 = 0x2000, ++ .ECHIPID2 = 0x2001, ++ .ECHIPVER = 0x2002, ++ .ECDEBUG = 0x2003, ++ .EXT_FAN_CUR_POINT = 0xC534, ++ .EXT_FAN_POINTS_SIZE = 0xC535, ++ .EXT_FAN1_BASE = 0xC540, ++ .EXT_FAN2_BASE = 0xC550, ++ .EXT_FAN_ACC_BASE = 0xC560, ++ .EXT_FAN_DEC_BASE = 0xC570, ++ .EXT_CPU_TEMP = 0xC580, ++ .EXT_CPU_TEMP_HYST = 0xC590, ++ .EXT_GPU_TEMP = 0xC5A0, ++ .EXT_GPU_TEMP_HYST = 0xC5B0, ++ .EXT_VRM_TEMP = 0xC5C0, ++ .EXT_VRM_TEMP_HYST = 0xC5D0, ++ .EXT_FAN1_RPM_LSB = 0xC5E0, ++ .EXT_FAN1_RPM_MSB = 0xC5E1, ++ .EXT_FAN2_RPM_LSB = 0xC5E2, ++ .EXT_FAN2_RPM_MSB = 0xC5E3, ++ .EXT_MINIFANCURVE_ON_COOL = 0xC536, ++ .EXT_LOCKFANCONTROLLER = 0xc4AB, ++ .EXT_CPU_TEMP_INPUT = 0xc538, ++ .EXT_GPU_TEMP_INPUT = 0xc539, ++ .EXT_IC_TEMP_INPUT = 0xC5E8, ++ .EXT_POWERMODE = 0xc420, ++ .EXT_FAN1_TARGET_RPM = 0xc600, ++ .EXT_FAN2_TARGET_RPM = 0xc601, ++ .EXT_MAXIMUMFANSPEED = 0xBD, ++ .EXT_WHITE_KEYBOARD_BACKLIGHT = (0x3B + 0xC400) ++}; ++ ++static const struct model_config model_v0 = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = true ++}; ++ ++static const struct model_config model_kfcn = { ++ .registers = &ec_register_offsets_v0, ++ 
.check_embedded_controller_id = true, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = false ++}; ++ ++static const struct model_config model_hacn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = false, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, ++ .memoryio_size = 0x300, ++ .has_minifancurve = false ++}; ++ ++ ++static const struct model_config model_k9cn = { ++ .registers = &ec_register_offsets_v0, ++ .check_embedded_controller_id = false, ++ .embedded_controller_id = 0x8227, ++ .memoryio_physical_ec_start = 0xC400, // or replace 0xC400 by 0x0400 ? ++ .memoryio_size = 0x300, ++ .has_minifancurve = false ++}; ++ ++ ++ ++static const struct dmi_system_id denylist[] = { {} }; ++ ++static const struct dmi_system_id optimistic_allowlist[] = { ++ { ++ // modelyear: 2021 ++ // generation: 6 ++ // name: Legion 5, Legion 5 pro, Legion 7 ++ // Family: Legion 5 15ACH6H, ... ++ .ident = "GKCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "GKCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2020 ++ .ident = "EUCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "EUCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2020 ++ .ident = "EFCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "EFCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2020 ++ .ident = "FSCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "FSCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2021 ++ .ident = "HHCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "HHCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2022 ++ .ident = "H1CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "H1CN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2022 ++ .ident = "J2CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "J2CN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2022 ++ .ident = "JUCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "JUCN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2022 ++ .ident = "KFCN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "KFCN"), ++ }, ++ .driver_data = (void *)&model_kfcn ++ }, ++ { ++ // modelyear: 2021 ++ .ident = "HACN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "HACN"), ++ }, ++ .driver_data = (void *)&model_hacn ++ }, ++ { ++ // modelyear: 2021 ++ .ident = "G9CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "G9CN"), ++ }, ++ .driver_data = (void *)&model_v0 ++ }, ++ { ++ // modelyear: 2022 ++ .ident = "K9CN", ++ .matches = { ++ DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), ++ DMI_MATCH(DMI_BIOS_VERSION, "K9CN"), ++ }, ++ .driver_data = (void *)&model_k9cn ++ }, ++ {} ++}; ++ ++/* ================================= */ ++/* ACPI access */ ++/* ================================= */ ++ ++// function from ideapad-laptop.c ++static int eval_int(acpi_handle handle, const char *name, unsigned long *res) ++{ ++ unsigned long 
long result; ++ acpi_status status; ++ ++ status = acpi_evaluate_integer(handle, (char *)name, NULL, &result); ++ if (ACPI_FAILURE(status)) ++ return -EIO; ++ ++ *res = result; ++ ++ return 0; ++} ++ ++// function from ideapad-laptop.c ++static int exec_simple_method(acpi_handle handle, const char *name, ++ unsigned long arg) ++{ ++ acpi_status status = ++ acpi_execute_simple_method(handle, (char *)name, arg); ++ ++ return ACPI_FAILURE(status) ? -EIO : 0; ++} ++ ++// function from ideapad-laptop.c ++static int exec_sbmc(acpi_handle handle, unsigned long arg) ++{ ++ // \_SB.PCI0.LPC0.EC0.VPC0.SBMC ++ return exec_simple_method(handle, "SBMC", arg); ++} ++ ++static int eval_qcho(acpi_handle handle, unsigned long *res) ++{ ++ // \_SB.PCI0.LPC0.EC0.QCHO ++ return eval_int(handle, "QCHO", res); ++} ++ ++/* ================================= */ ++/* EC RAM Access with port-mapped IO */ ++/* ================================= */ ++ ++/* ++ * See datasheet of e.g. IT8502E/F/G, e.g. ++ * 6.2 Plug and Play Configuration (PNPCFG) ++ * ++ * Depending on configured BARDSEL register ++ * the ports ++ * ECRAM_PORTIO_ADDR_PORT and ++ * ECRAM_PORTIO_DATA_PORT ++ * are configured. ++ * ++ * By performing IO on these ports one can ++ * read/write to registers in the EC. ++ * ++ * "To access a register of PNPCFG, write target index to ++ * address port and access this PNPCFG register via ++ * data port" [datasheet, 6.2 Plug and Play Configuration] ++ */ ++ ++// IO ports used to write to communicate with embedded controller ++// Start of used ports ++#define ECRAM_PORTIO_START_PORT 0x4E ++// Number of used ports ++#define ECRAM_PORTIO_PORTS_SIZE 2 ++// Port used to specify address in EC RAM to read/write ++// 0x4E/0x4F is the usual port for IO super controler ++// 0x2E/0x2F also common (ITE can also be configure to use these) ++#define ECRAM_PORTIO_ADDR_PORT 0x4E ++// Port to send/receive the value to write/read ++#define ECRAM_PORTIO_DATA_PORT 0x4F ++// Name used to request ports ++#define ECRAM_PORTIO_NAME "legion" ++ ++struct ecram_portio { ++ /* protects read/write to EC RAM performed ++ * as a certain sequence of outb, inb ++ * commands on the IO ports. There can ++ * be at most one. ++ */ ++ struct mutex io_port_mutex; ++}; ++ ++ssize_t ecram_portio_init(struct ecram_portio *ec_portio) ++{ ++ if (!request_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE, ++ ECRAM_PORTIO_NAME)) { ++ pr_info("Cannot init ecram_portio the %x ports starting at %x\n", ++ ECRAM_PORTIO_PORTS_SIZE, ECRAM_PORTIO_START_PORT); ++ return -ENODEV; ++ } ++ //pr_info("Reserved %x ports starting at %x\n", ECRAM_PORTIO_PORTS_SIZE, ECRAM_PORTIO_START_PORT); ++ mutex_init(&ec_portio->io_port_mutex); ++ return 0; ++} ++ ++void ecram_portio_exit(struct ecram_portio *ec_portio) ++{ ++ release_region(ECRAM_PORTIO_START_PORT, ECRAM_PORTIO_PORTS_SIZE); ++} ++ ++/* Read a byte from the EC RAM. ++ * ++ * Return status because of commong signature for alle ++ * methods to access EC RAM. 
++ */ ++ssize_t ecram_portio_read(struct ecram_portio *ec_portio, u16 offset, u8 *value) ++{ ++ mutex_lock(&ec_portio->io_port_mutex); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x11, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ // TODO: no explicit cast between types seems to be sometimes ++ // done and sometimes not ++ outb((u8)((offset >> 8) & 0xFF), ECRAM_PORTIO_DATA_PORT); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x10, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ outb((u8)(offset & 0xFF), ECRAM_PORTIO_DATA_PORT); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x12, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ *value = inb(ECRAM_PORTIO_DATA_PORT); ++ ++ mutex_unlock(&ec_portio->io_port_mutex); ++ return 0; ++} ++ ++/* Write a byte to the EC RAM. ++ * ++ * Return status because of commong signature for alle ++ * methods to access EC RAM. ++ */ ++ssize_t ecram_portio_write(struct ecram_portio *ec_portio, u16 offset, u8 value) ++{ ++ mutex_lock(&ec_portio->io_port_mutex); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x11, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ // TODO: no explicit cast between types seems to be sometimes ++ // done and sometimes not ++ outb((u8)((offset >> 8) & 0xFF), ECRAM_PORTIO_DATA_PORT); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x10, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ outb((u8)(offset & 0xFF), ECRAM_PORTIO_DATA_PORT); ++ ++ outb(0x2E, ECRAM_PORTIO_ADDR_PORT); ++ outb(0x12, ECRAM_PORTIO_DATA_PORT); ++ outb(0x2F, ECRAM_PORTIO_ADDR_PORT); ++ outb(value, ECRAM_PORTIO_DATA_PORT); ++ ++ mutex_unlock(&ec_portio->io_port_mutex); ++ return 0; ++} ++ ++/* =================================== */ ++/* EC RAM Access */ ++/* =================================== */ ++ ++struct ecram { ++ struct ecram_portio portio; ++}; ++ ++ssize_t ecram_init(struct ecram *ecram, phys_addr_t memoryio_ec_physical_start, ++ size_t region_size) ++{ ++ ssize_t err; ++ ++ err = ecram_portio_init(&ecram->portio); ++ if (err) { ++ pr_info("Failed ecram_portio_init\n"); ++ goto err_ecram_portio_init; ++ } ++ ++ return 0; ++ ++err_ecram_portio_init: ++ return err; ++} ++ ++void ecram_exit(struct ecram *ecram) ++{ ++ pr_info("Unloading legion ecram\n"); ++ ecram_portio_exit(&ecram->portio); ++ pr_info("Unloading legion ecram done\n"); ++} ++ ++/** ++ * ecram_offset address on the EC ++ */ ++static u8 ecram_read(struct ecram *ecram, u16 ecram_offset) ++{ ++ u8 value; ++ int err; ++ ++ err = ecram_portio_read(&ecram->portio, ecram_offset, &value); ++ if (err) ++ pr_info("Error reading EC RAM at 0x%x\n", ecram_offset); ++ return value; ++} ++ ++static void ecram_write(struct ecram *ecram, u16 ecram_offset, u8 value) ++{ ++ int err; ++ ++ if (ec_readonly) { ++ pr_info("Skipping writing EC RAM at 0x%x because readonly.\n", ++ ecram_offset); ++ return; ++ } ++ err = ecram_portio_write(&ecram->portio, ecram_offset, value); ++ if (err) ++ pr_info("Error writing EC RAM at 0x%x\n", ecram_offset); ++} ++ ++/* =============================== */ ++/* Reads from EC */ ++/* =============================== */ ++ ++u16 read_ec_id(struct ecram *ecram, const struct model_config *model) ++{ ++ u8 id1 = ecram_read(ecram, model->registers->ECHIPID1); ++ u8 id2 = ecram_read(ecram, model->registers->ECHIPID2); ++ ++ return (id1 << 8) + id2; ++} ++ ++u16 read_ec_version(struct ecram *ecram, const struct model_config *model) ++{ ++ u8 vers = ecram_read(ecram, 
model->registers->ECHIPVER); ++ u8 debug = ecram_read(ecram, model->registers->ECDEBUG); ++ ++ return (vers << 8) + debug; ++} ++ ++/* ============================= */ ++/* Data model for sensor values */ ++/* ============================ */ ++ ++struct sensor_values { ++ u16 fan1_rpm; // current speed in rpm of fan 1 ++ u16 fan2_rpm; // current speed in rpm of fan2 ++ u16 fan1_target_rpm; // target speed in rpm of fan 1 ++ u16 fan2_target_rpm; // target speed in rpm of fan 2 ++ u8 cpu_temp_celsius; // cpu temperature in celcius ++ u8 gpu_temp_celsius; // gpu temperature in celcius ++ u8 ic_temp_celsius; // ic temperature in celcius ++}; ++ ++enum SENSOR_ATTR { ++ SENSOR_CPU_TEMP_ID = 1, ++ SENSOR_GPU_TEMP_ID = 2, ++ SENSOR_IC_TEMP_ID = 3, ++ SENSOR_FAN1_RPM_ID = 4, ++ SENSOR_FAN2_RPM_ID = 5, ++ SENSOR_FAN1_TARGET_RPM_ID = 6, ++ SENSOR_FAN2_TARGET_RPM_ID = 7 ++}; ++ ++static int read_sensor_values(struct ecram *ecram, ++ const struct model_config *model, ++ struct sensor_values *values) ++{ ++ values->fan1_target_rpm = ++ 100 * ecram_read(ecram, model->registers->EXT_FAN1_TARGET_RPM); ++ values->fan2_target_rpm = ++ 100 * ecram_read(ecram, model->registers->EXT_FAN2_TARGET_RPM); ++ ++ values->fan1_rpm = ++ ecram_read(ecram, model->registers->EXT_FAN1_RPM_LSB) + ++ (((int)ecram_read(ecram, model->registers->EXT_FAN1_RPM_MSB)) ++ << 8); ++ values->fan2_rpm = ++ ecram_read(ecram, model->registers->EXT_FAN2_RPM_LSB) + ++ (((int)ecram_read(ecram, model->registers->EXT_FAN2_RPM_MSB)) ++ << 8); ++ ++ values->cpu_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_CPU_TEMP_INPUT); ++ values->gpu_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_GPU_TEMP_INPUT); ++ values->ic_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_IC_TEMP_INPUT); ++ ++ values->cpu_temp_celsius = ecram_read(ecram, 0xC5E6); ++ values->gpu_temp_celsius = ecram_read(ecram, 0xC5E7); ++ values->ic_temp_celsius = ecram_read(ecram, 0xC5E8); ++ ++ return 0; ++} ++ ++/* =============================== */ ++/* Behaviour changing functions */ ++/* =============================== */ ++ ++int read_powermode(struct ecram *ecram, const struct model_config *model) ++{ ++ return ecram_read(ecram, model->registers->EXT_POWERMODE); ++} ++ ++ssize_t write_powermode(struct ecram *ecram, const struct model_config *model, ++ u8 value) ++{ ++ if (!(value >= 0 && value <= 2)) { ++ pr_info("Unexpected power mode value ignored: %d\n", value); ++ return -ENOMEM; ++ } ++ ecram_write(ecram, model->registers->EXT_POWERMODE, value); ++ return 0; ++} ++ ++/** ++ * Shortly toggle powermode to a different mode ++ * and switch back, e.g. to reset fan curve. ++ */ ++void toggle_powermode(struct ecram *ecram, const struct model_config *model) ++{ ++ int old_powermode = read_powermode(ecram, model); ++ int next_powermode = old_powermode == 0 ? 1 : 0; ++ ++ write_powermode(ecram, model, next_powermode); ++ mdelay(1500); ++ write_powermode(ecram, model, old_powermode); ++} ++ ++#define lockfancontroller_ON 8 ++#define lockfancontroller_OFF 0 ++ ++ssize_t write_lockfancontroller(struct ecram *ecram, ++ const struct model_config *model, bool state) ++{ ++ u8 val = state ? 
lockfancontroller_ON : lockfancontroller_OFF; ++ ++ ecram_write(ecram, model->registers->EXT_LOCKFANCONTROLLER, val); ++ return 0; ++} ++ ++int read_lockfancontroller(struct ecram *ecram, ++ const struct model_config *model, bool *state) ++{ ++ int value = ecram_read(ecram, model->registers->EXT_LOCKFANCONTROLLER); ++ ++ switch (value) { ++ case lockfancontroller_ON: ++ *state = true; ++ break; ++ case lockfancontroller_OFF: ++ *state = false; ++ break; ++ default: ++ pr_info("Unexpected value in lockfanspeed register:%d\n", ++ value); ++ return -1; ++ } ++ return 0; ++} ++ ++#define MAXIMUMFANSPEED_ON 0x40 ++#define MAXIMUMFANSPEED_OFF 0x00 ++ ++int read_maximumfanspeed(struct ecram *ecram, const struct model_config *model, ++ bool *state) ++{ ++ int value = ecram_read(ecram, model->registers->EXT_MAXIMUMFANSPEED); ++ ++ switch (value) { ++ case MAXIMUMFANSPEED_ON: ++ *state = true; ++ break; ++ case MAXIMUMFANSPEED_OFF: ++ *state = false; ++ break; ++ default: ++ pr_info("Unexpected value in maximumfanspeed register:%d\n", ++ value); ++ return -1; ++ } ++ return 0; ++} ++ ++ssize_t write_maximumfanspeed(struct ecram *ecram, ++ const struct model_config *model, bool state) ++{ ++ u8 val = state ? MAXIMUMFANSPEED_ON : MAXIMUMFANSPEED_OFF; ++ ++ ecram_write(ecram, model->registers->EXT_MAXIMUMFANSPEED, val); ++ return 0; ++} ++ ++#define MINIFANCUVE_ON_COOL_ON 0x04 ++#define MINIFANCUVE_ON_COOL_OFF 0xA0 ++ ++int read_minifancurve(struct ecram *ecram, const struct model_config *model, ++ bool *state) ++{ ++ int value = ++ ecram_read(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL); ++ ++ switch (value) { ++ case MINIFANCUVE_ON_COOL_ON: ++ *state = true; ++ break; ++ case MINIFANCUVE_ON_COOL_OFF: ++ *state = false; ++ break; ++ default: ++ pr_info("Unexpected value in MINIFANCURVE register:%d\n", ++ value); ++ return -1; ++ } ++ return 0; ++} ++ ++ssize_t write_minifancurve(struct ecram *ecram, ++ const struct model_config *model, bool state) ++{ ++ u8 val = state ? MINIFANCUVE_ON_COOL_ON : MINIFANCUVE_ON_COOL_OFF; ++ ++ ecram_write(ecram, model->registers->EXT_MINIFANCURVE_ON_COOL, val); ++ return 0; ++} ++ ++#define KEYBOARD_BACKLIGHT_OFF 18 ++#define KEYBOARD_BACKLIGHT_ON1 21 ++#define KEYBOARD_BACKLIGHT_ON2 23 ++ ++int read_keyboard_backlight(struct ecram *ecram, ++ const struct model_config *model, int *state) ++{ ++ int value = ecram_read(ecram, ++ model->registers->EXT_WHITE_KEYBOARD_BACKLIGHT); ++ ++ //switch (value) { ++ //case MINIFANCUVE_ON_COOL_ON: ++ // *state = true; ++ // break; ++ //case MINIFANCUVE_ON_COOL_OFF: ++ // *state = false; ++ // break; ++ //default: ++ // pr_info("Unexpected value in MINIFANCURVE register:%d\n", ++ // value); ++ // return -1; ++ //} ++ *state = value; ++ return 0; ++} ++ ++int write_keyboard_backlight(struct ecram *ecram, ++ const struct model_config *model, int state) ++{ ++ u8 val = state > 0 ? KEYBOARD_BACKLIGHT_ON1 : KEYBOARD_BACKLIGHT_OFF; ++ ++ ecram_write(ecram, model->registers->EXT_WHITE_KEYBOARD_BACKLIGHT, val); ++ return 0; ++} ++ ++#define FCT_RAPID_CHARGE_ON 0x07 ++#define FCT_RAPID_CHARGE_OFF 0x08 ++#define RAPID_CHARGE_ON 0x0 ++#define RAPID_CHARGE_OFF 0x1 ++ ++int read_rapidcharge(acpi_handle acpihandle, int *state) ++{ ++ unsigned long result; ++ int err; ++ ++ err = eval_qcho(acpihandle, &result); ++ if (err) ++ return err; ++ ++ *state = result; ++ return 0; ++} ++ ++int write_rapidcharge(acpi_handle acpihandle, bool state) ++{ ++ unsigned long fct_nr = state > 0 ? 
FCT_RAPID_CHARGE_ON : ++ FCT_RAPID_CHARGE_OFF; ++ return exec_sbmc(acpihandle, fct_nr); ++} ++ ++/* ============================= */ ++/* Data model for fan curve */ ++/* ============================ */ ++ ++struct fancurve_point { ++ // rpm1 devided by 100 ++ u8 rpm1_raw; ++ // rpm2 devided by 100 ++ u8 rpm2_raw; ++ // >=2 , <=5 (lower is faster); must be increasing by level ++ u8 accel; ++ // >=2 , <=5 (lower is faster); must be increasing by level ++ u8 decel; ++ ++ // min must be lower or equal than max ++ // last level max must be 127 ++ // <=127 cpu max temp for this level; must be increasing by level ++ u8 cpu_max_temp_celsius; ++ // <=127 cpu min temp for this level; must be increasing by level ++ u8 cpu_min_temp_celsius; ++ // <=127 gpu min temp for this level; must be increasing by level ++ u8 gpu_max_temp_celsius; ++ // <=127 gpu max temp for this level; must be increasing by level ++ u8 gpu_min_temp_celsius; ++ // <=127 ic max temp for this level; must be increasing by level ++ u8 ic_max_temp_celsius; ++ // <=127 ic max temp for this level; must be increasing by level ++ u8 ic_min_temp_celsius; ++}; ++ ++enum FANCURVE_ATTR { ++ FANCURVE_ATTR_PWM1 = 1, ++ FANCURVE_ATTR_PWM2 = 2, ++ FANCURVE_ATTR_CPU_TEMP = 3, ++ FANCURVE_ATTR_CPU_HYST = 4, ++ FANCURVE_ATTR_GPU_TEMP = 5, ++ FANCURVE_ATTR_GPU_HYST = 6, ++ FANCURVE_ATTR_IC_TEMP = 7, ++ FANCURVE_ATTR_IC_HYST = 8, ++ FANCURVE_ATTR_ACCEL = 9, ++ FANCURVE_ATTR_DECEL = 10, ++ FANCURVE_SIZE = 11, ++ FANCURVE_MINIFANCURVE_ON_COOL = 12 ++}; ++ ++// used for clearing table entries ++static const struct fancurve_point fancurve_point_zero = { 0, 0, 0, 0, 0, ++ 0, 0, 0, 0, 0 }; ++ ++struct fancurve { ++ struct fancurve_point points[MAXFANCURVESIZE]; ++ // number of points used; must be <= MAXFANCURVESIZE ++ size_t size; ++ // the point that at which fans are run currently ++ size_t current_point_i; ++}; ++ ++// calculate derived values ++ ++int fancurve_get_cpu_deltahyst(struct fancurve_point *point) ++{ ++ return ((int)point->cpu_max_temp_celsius) - ++ ((int)point->cpu_min_temp_celsius); ++} ++ ++int fancurve_get_gpu_deltahyst(struct fancurve_point *point) ++{ ++ return ((int)point->gpu_max_temp_celsius) - ++ ((int)point->gpu_min_temp_celsius); ++} ++ ++int fancurve_get_ic_deltahyst(struct fancurve_point *point) ++{ ++ return ((int)point->ic_max_temp_celsius) - ++ ((int)point->ic_min_temp_celsius); ++} ++ ++// validation functions ++ ++bool fancurve_is_valid_min_temp(int min_temp) ++{ ++ return min_temp >= 0 && min_temp <= 127; ++} ++ ++bool fancurve_is_valid_max_temp(int max_temp) ++{ ++ return max_temp >= 0 && max_temp <= 127; ++} ++ ++// setters with validation ++// - make hwmon implementation easier ++// - keep fancurve valid, otherwise EC will not properly control fan ++ ++bool fancurve_set_rpm1(struct fancurve *fancurve, int point_id, int rpm) ++{ ++ bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500); ++ ++ if (valid) ++ fancurve->points[point_id].rpm1_raw = rpm / 100; ++ return valid; ++} ++ ++bool fancurve_set_rpm2(struct fancurve *fancurve, int point_id, int rpm) ++{ ++ bool valid = point_id == 0 ? rpm == 0 : (rpm >= 0 && rpm <= 4500); ++ ++ if (valid) ++ fancurve->points[point_id].rpm2_raw = rpm / 100; ++ return valid; ++} ++ ++// TODO: remove { ... 
} from single line if body ++ ++bool fancurve_set_accel(struct fancurve *fancurve, int point_id, int accel) ++{ ++ bool valid = accel >= 2 && accel <= 5; ++ ++ if (valid) ++ fancurve->points[point_id].accel = accel; ++ return valid; ++} ++ ++bool fancurve_set_decel(struct fancurve *fancurve, int point_id, int decel) ++{ ++ bool valid = decel >= 2 && decel <= 5; ++ ++ if (valid) ++ fancurve->points[point_id].decel = decel; ++ return valid; ++} ++ ++bool fancurve_set_cpu_temp_max(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].cpu_max_temp_celsius = value; ++ ++ return valid; ++} ++ ++bool fancurve_set_gpu_temp_max(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].gpu_max_temp_celsius = value; ++ return valid; ++} ++ ++bool fancurve_set_ic_temp_max(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].ic_max_temp_celsius = value; ++ return valid; ++} ++ ++bool fancurve_set_cpu_temp_min(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].cpu_min_temp_celsius = value; ++ return valid; ++} ++ ++bool fancurve_set_gpu_temp_min(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].gpu_min_temp_celsius = value; ++ return valid; ++} ++ ++bool fancurve_set_ic_temp_min(struct fancurve *fancurve, int point_id, ++ int value) ++{ ++ bool valid = fancurve_is_valid_max_temp(value); ++ ++ if (valid) ++ fancurve->points[point_id].ic_min_temp_celsius = value; ++ return valid; ++} ++ ++bool fancurve_set_size(struct fancurve *fancurve, int size, bool init_values) ++{ ++ bool valid = size >= 1 && size <= MAXFANCURVESIZE; ++ ++ if (!valid) ++ return false; ++ if (init_values && size < fancurve->size) { ++ // fancurve size is decreased, but last etnry alwasy needs 127 temperatures ++ // Note: size >=1 ++ fancurve->points[size - 1].cpu_max_temp_celsius = 127; ++ fancurve->points[size - 1].ic_max_temp_celsius = 127; ++ fancurve->points[size - 1].gpu_max_temp_celsius = 127; ++ } ++ if (init_values && size > fancurve->size) { ++ // fancurve increased, so new entries need valid values ++ int i; ++ int last = fancurve->size > 0 ? fancurve->size - 1 : 0; ++ ++ for (i = fancurve->size; i < size; ++i) ++ fancurve->points[i] = fancurve->points[last]; ++ } ++ return true; ++} ++ ++/* Read the fan curve from the EC. ++ * ++ * In newer models (>=2022) there is an ACPI/WMI to read fan curve as ++ * a whole. So read/write fan table as a whole to use ++ * same interface for both cases. ++ * ++ * It reads all points from EC memory, even if stored fancurve is smaller, so ++ * it can contain 0 entries. 
++ */ ++static int read_fancurve(struct ecram *ecram, const struct model_config *model, ++ struct fancurve *fancurve) ++{ ++ size_t i = 0; ++ ++ for (i = 0; i < MAXFANCURVESIZE; ++i) { ++ struct fancurve_point *point = &fancurve->points[i]; ++ ++ point->rpm1_raw = ++ ecram_read(ecram, model->registers->EXT_FAN1_BASE + i); ++ point->rpm2_raw = ++ ecram_read(ecram, model->registers->EXT_FAN2_BASE + i); ++ ++ point->accel = ecram_read( ++ ecram, model->registers->EXT_FAN_ACC_BASE + i); ++ point->decel = ecram_read( ++ ecram, model->registers->EXT_FAN_DEC_BASE + i); ++ point->cpu_max_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_CPU_TEMP + i); ++ point->cpu_min_temp_celsius = ecram_read( ++ ecram, model->registers->EXT_CPU_TEMP_HYST + i); ++ point->gpu_max_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_GPU_TEMP + i); ++ point->gpu_min_temp_celsius = ecram_read( ++ ecram, model->registers->EXT_GPU_TEMP_HYST + i); ++ point->ic_max_temp_celsius = ++ ecram_read(ecram, model->registers->EXT_VRM_TEMP + i); ++ point->ic_min_temp_celsius = ecram_read( ++ ecram, model->registers->EXT_VRM_TEMP_HYST + i); ++ } ++ ++ // Do not trust that hardware; It might suddendly report ++ // a larger size, so clamp it. ++ fancurve->size = ++ ecram_read(ecram, model->registers->EXT_FAN_POINTS_SIZE); ++ fancurve->size = ++ min(fancurve->size, (typeof(fancurve->size))(MAXFANCURVESIZE)); ++ fancurve->current_point_i = ++ ecram_read(ecram, model->registers->EXT_FAN_CUR_POINT); ++ fancurve->current_point_i = ++ min(fancurve->current_point_i, fancurve->size); ++ return 0; ++} ++ ++static int write_fancurve(struct ecram *ecram, const struct model_config *model, ++ const struct fancurve *fancurve, bool write_size) ++{ ++ size_t i; ++ // Reset fan update counters (try to avoid any race conditions) ++ ecram_write(ecram, 0xC5FE, 0); ++ ecram_write(ecram, 0xC5FF, 0); ++ for (i = 0; i < MAXFANCURVESIZE; ++i) { ++ // Entries for points larger than fancurve size should be cleared ++ // to 0 ++ const struct fancurve_point *point = ++ i < fancurve->size ? 
&fancurve->points[i] : ++ &fancurve_point_zero; ++ ++ ecram_write(ecram, model->registers->EXT_FAN1_BASE + i, ++ point->rpm1_raw); ++ ecram_write(ecram, model->registers->EXT_FAN2_BASE + i, ++ point->rpm2_raw); ++ ++ ecram_write(ecram, model->registers->EXT_FAN_ACC_BASE + i, ++ point->accel); ++ ecram_write(ecram, model->registers->EXT_FAN_DEC_BASE + i, ++ point->decel); ++ ++ ecram_write(ecram, model->registers->EXT_CPU_TEMP + i, ++ point->cpu_max_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_CPU_TEMP_HYST + i, ++ point->cpu_min_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_GPU_TEMP + i, ++ point->gpu_max_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_GPU_TEMP_HYST + i, ++ point->gpu_min_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_VRM_TEMP + i, ++ point->ic_max_temp_celsius); ++ ecram_write(ecram, model->registers->EXT_VRM_TEMP_HYST + i, ++ point->ic_min_temp_celsius); ++ } ++ ++ if (write_size) { ++ ecram_write(ecram, model->registers->EXT_FAN_POINTS_SIZE, ++ fancurve->size); ++ } ++ ++ // Reset current fan level to 0, so algorithm in EC ++ // selects fan curve point again and resetting hysterisis ++ // effects ++ ecram_write(ecram, model->registers->EXT_FAN_CUR_POINT, 0); ++ ++ // Reset internal fan levels ++ ecram_write(ecram, 0xC634, 0); // CPU ++ ecram_write(ecram, 0xC635, 0); // GPU ++ ecram_write(ecram, 0xC636, 0); // SENSOR ++ ++ return 0; ++} ++ ++static ssize_t fancurve_print_seqfile(const struct fancurve *fancurve, ++ struct seq_file *s) ++{ ++ int i; ++ ++ seq_printf( ++ s, ++ "rpm1|rpm2|acceleration|deceleration|cpu_min_temp|cpu_max_temp|gpu_min_temp|gpu_max_temp|ic_min_temp|ic_max_temp\n"); ++ for (i = 0; i < fancurve->size; ++i) { ++ const struct fancurve_point *point = &fancurve->points[i]; ++ ++ seq_printf( ++ s, "%d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\t %d\n", ++ point->rpm1_raw * 100, point->rpm2_raw * 100, ++ point->accel, point->decel, point->cpu_min_temp_celsius, ++ point->cpu_max_temp_celsius, ++ point->gpu_min_temp_celsius, ++ point->gpu_max_temp_celsius, point->ic_min_temp_celsius, ++ point->ic_max_temp_celsius); ++ } ++ return 0; ++} ++ ++/* ============================= */ ++/* Global and shared data between */ ++/* all calls to this module */ ++/* ============================ */ ++// Implemented like ideapad-laptop.c but currenlty still ++// wihtout dynamic memory allocation (instaed global _priv) ++ ++struct legion_private { ++ struct platform_device *platform_device; ++ // TODO: remove or keep? init? ++ // struct acpi_device *adev; ++ ++ // Method to access ECRAM ++ struct ecram ecram; ++ // Configuration with registers an ECRAM access method ++ const struct model_config *conf; ++ ++ // TODO: maybe refactor an keep only local to each function ++ // last known fan curve ++ struct fancurve fancurve; ++ // configured fan curve from user space ++ struct fancurve fancurve_configured; ++ ++ // update lock, when partial values of fancurve are changed ++ struct mutex fancurve_mutex; ++ ++ //interfaces ++ struct dentry *debugfs_dir; ++ struct device *hwmon_dev; ++ struct platform_profile_handler platform_profile_handler; ++ ++ // TODO: remove? 
++ bool loaded; ++}; ++ ++// shared between different drivers: WMI, platform and proteced by mutex ++static struct legion_private *legion_shared; ++static struct legion_private _priv; ++static DEFINE_MUTEX(legion_shared_mutex); ++ ++static int legion_shared_init(struct legion_private *priv) ++{ ++ int ret; ++ ++ mutex_lock(&legion_shared_mutex); ++ ++ if (!legion_shared) { ++ legion_shared = priv; ++ mutex_init(&legion_shared->fancurve_mutex); ++ ret = 0; ++ } else { ++ pr_warn("Found multiple platform devices\n"); ++ ret = -EINVAL; ++ } ++ ++ priv->loaded = true; ++ mutex_unlock(&legion_shared_mutex); ++ ++ return ret; ++} ++ ++static void legion_shared_exit(struct legion_private *priv) ++{ ++ pr_info("Unloading legion shared\n"); ++ mutex_lock(&legion_shared_mutex); ++ ++ if (legion_shared == priv) ++ legion_shared = NULL; ++ ++ mutex_unlock(&legion_shared_mutex); ++ pr_info("Unloading legion shared done\n"); ++} ++ ++/* ============================= */ ++/* debugfs interface */ ++/* ============================ */ ++ ++static int debugfs_ecmemory_show(struct seq_file *s, void *unused) ++{ ++ struct legion_private *priv = s->private; ++ size_t offset; ++ ++ for (offset = 0; offset < priv->conf->memoryio_size; ++offset) { ++ char value = ecram_read(&priv->ecram, ++ priv->conf->memoryio_physical_ec_start + ++ offset); ++ ++ seq_write(s, &value, 1); ++ } ++ return 0; ++} ++ ++DEFINE_SHOW_ATTRIBUTE(debugfs_ecmemory); ++ ++static int debugfs_fancurve_show(struct seq_file *s, void *unused) ++{ ++ struct legion_private *priv = s->private; ++ bool is_minifancurve; ++ bool is_lockfancontroller; ++ bool is_maximumfanspeed; ++ int err; ++ ++ seq_printf(s, "EC Chip ID: %x\n", read_ec_id(&priv->ecram, priv->conf)); ++ seq_printf(s, "EC Chip Version: %x\n", ++ read_ec_version(&priv->ecram, priv->conf)); ++ seq_printf(s, "legion_laptop features: %s\n", LEGIONFEATURES); ++ seq_printf(s, "legion_laptop ec_readonly: %d\n", ec_readonly); ++ read_fancurve(&priv->ecram, priv->conf, &priv->fancurve); ++ ++ seq_printf(s, "minifancurve feature enabled: %d\n", ++ priv->conf->has_minifancurve); ++ err = read_minifancurve(&priv->ecram, priv->conf, &is_minifancurve); ++ seq_printf(s, "minifancurve on cool: %s\n", ++ err ? "error" : (is_minifancurve ? "true" : "false")); ++ err = read_lockfancontroller(&priv->ecram, priv->conf, ++ &is_lockfancontroller); ++ seq_printf(s, "lock fan controller: %s\n", ++ err ? "error" : (is_lockfancontroller ? "true" : "false")); ++ err = read_maximumfanspeed(&priv->ecram, priv->conf, ++ &is_maximumfanspeed); ++ seq_printf(s, "enable maximumfanspeed: %s\n", ++ err ? "error" : (is_maximumfanspeed ? 
"true" : "false")); ++ seq_printf(s, "enable maximumfanspeed status: %d\n", err); ++ ++ seq_printf(s, "fan curve current point id: %ld\n", ++ priv->fancurve.current_point_i); ++ seq_printf(s, "fan curve points size: %ld\n", priv->fancurve.size); ++ ++ seq_puts(s, "Current fan curve in hardware (embedded controller):\n"); ++ fancurve_print_seqfile(&priv->fancurve, s); ++ seq_puts(s, "=====================\n"); ++ return 0; ++} ++ ++DEFINE_SHOW_ATTRIBUTE(debugfs_fancurve); ++ ++static void legion_debugfs_init(struct legion_private *priv) ++{ ++ struct dentry *dir; ++ ++ // TODO: remove this note ++ // Note: as other kernel modules, do not catch errors here ++ // because if kernel is build without debugfs this ++ // will return an error but module still has to ++ // work, just without debugfs ++ // TODO: what permissions; some modules do 400 ++ // other do 444 ++ dir = debugfs_create_dir(LEGION_DRVR_SHORTNAME, NULL); ++ debugfs_create_file("fancurve", 0444, dir, priv, ++ &debugfs_fancurve_fops); ++ debugfs_create_file("ecmemory", 0444, dir, priv, ++ &debugfs_ecmemory_fops); ++ ++ priv->debugfs_dir = dir; ++} ++ ++static void legion_debugfs_exit(struct legion_private *priv) ++{ ++ pr_info("Unloading legion dubugfs\n"); ++ // The following is does nothing if pointer is NULL ++ debugfs_remove_recursive(priv->debugfs_dir); ++ priv->debugfs_dir = NULL; ++ pr_info("Unloading legion dubugfs done\n"); ++} ++ ++/* ============================= */ ++/* sysfs interface */ ++/* ============================ */ ++ ++static ssize_t powermode_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int power_mode = read_powermode(&priv->ecram, priv->conf); ++ ++ return sysfs_emit(buf, "%d\n", power_mode); ++} ++ ++static ssize_t powermode_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, ++ size_t count) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int powermode; ++ int err; ++ ++ err = kstrtouint(buf, 0, &powermode); ++ if (err) ++ return err; ++ ++ err = write_powermode(&priv->ecram, priv->conf, powermode); ++ if (err) ++ return -EINVAL; ++ ++ // TODO: better? 
++ // we have to wait a bit before change is done in hardware and ++ // readback done after notifying returns correct value, otherwise ++ // the notified reader will read old value ++ msleep(500); ++ platform_profile_notify(); ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(powermode); ++ ++static ssize_t lockfancontroller_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ bool is_lockfancontroller; ++ int err; ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = read_lockfancontroller(&priv->ecram, priv->conf, ++ &is_lockfancontroller); ++ mutex_unlock(&priv->fancurve_mutex); ++ if (err) ++ return -EINVAL; ++ ++ return sysfs_emit(buf, "%d\n", is_lockfancontroller); ++} ++ ++static ssize_t lockfancontroller_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ bool is_lockfancontroller; ++ int err; ++ ++ err = kstrtobool(buf, &is_lockfancontroller); ++ if (err) ++ return err; ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = write_lockfancontroller(&priv->ecram, priv->conf, ++ is_lockfancontroller); ++ mutex_unlock(&priv->fancurve_mutex); ++ if (err) ++ return -EINVAL; ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(lockfancontroller); ++ ++static ssize_t keyboard_backlight_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int state; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ read_keyboard_backlight(&priv->ecram, priv->conf, &state); ++ return sysfs_emit(buf, "%d\n", state); ++} ++ ++static ssize_t keyboard_backlight_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int state; ++ int err; ++ ++ err = kstrtouint(buf, 0, &state); ++ if (err) ++ return err; ++ ++ err = write_keyboard_backlight(&priv->ecram, priv->conf, state); ++ if (err) ++ return -EINVAL; ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(keyboard_backlight); ++ ++static struct attribute *legion_sysfs_attributes[] = { ++ &dev_attr_powermode.attr, &dev_attr_lockfancontroller.attr, ++ &dev_attr_keyboard_backlight.attr, NULL ++}; ++ ++static const struct attribute_group legion_attribute_group = { ++ .attrs = legion_sysfs_attributes ++}; ++ ++static int legion_sysfs_init(struct legion_private *priv) ++{ ++ return device_add_group(&priv->platform_device->dev, ++ &legion_attribute_group); ++} ++ ++static void legion_sysfs_exit(struct legion_private *priv) ++{ ++ pr_info("Unloading legion sysfs\n"); ++ device_remove_group(&priv->platform_device->dev, ++ &legion_attribute_group); ++ pr_info("Unloading legion sysfs done\n"); ++} ++ ++/* ============================= */ ++/* WMI + ACPI */ ++/* ============================ */ ++// heavily based on ideapad_laptop.c ++ ++// TODO: proper names if meaning of all events is clear ++enum LEGION_WMI_EVENT { ++ LEGION_WMI_EVENT_GAMEZONE = 1, ++ LEGION_EVENT_A, ++ LEGION_EVENT_B, ++ LEGION_EVENT_C, ++ LEGION_EVENT_D, ++ LEGION_EVENT_E, ++ LEGION_EVENT_F, ++ LEGION_EVENT_G ++}; ++ ++struct legion_wmi_private { ++ enum LEGION_WMI_EVENT event; ++}; ++ ++//static void legion_wmi_notify2(u32 value, void *context) ++// { ++// pr_info("WMI notify\n" ); ++// } ++ ++static void legion_wmi_notify(struct wmi_device *wdev, union acpi_object *data) ++{ ++ struct legion_wmi_private *wpriv; ++ struct legion_private *priv; ++ ++ mutex_lock(&legion_shared_mutex); ++ priv = 
legion_shared; ++ if ((!priv) && (priv->loaded)) { ++ pr_info("Received WMI event while not initialized!\n"); ++ goto unlock; ++ } ++ ++ wpriv = dev_get_drvdata(&wdev->dev); ++ switch (wpriv->event) { ++ case LEGION_EVENT_A: ++ pr_info("Fan event: legion type: %d; acpi type: %d (%d=integer)", ++ wpriv->event, data->type, ACPI_TYPE_INTEGER); ++ // TODO: here it is too early (first unlock mutext, then wait a bit) ++ //platform_profile_notify(); ++ break; ++ default: ++ pr_info("Event: legion type: %d; acpi type: %d (%d=integer)", ++ wpriv->event, data->type, ACPI_TYPE_INTEGER); ++ break; ++ } ++ ++unlock: ++ mutex_unlock(&legion_shared_mutex); ++ // todo; fix that! ++ // problem: we get a event just before the powermode change (from the key?), ++ // so if we notify to early, it will read the old power mode/platform profile ++ msleep(500); ++ platform_profile_notify(); ++} ++ ++static int legion_wmi_probe(struct wmi_device *wdev, const void *context) ++{ ++ struct legion_wmi_private *wpriv; ++ ++ wpriv = devm_kzalloc(&wdev->dev, sizeof(*wpriv), GFP_KERNEL); ++ if (!wpriv) ++ return -ENOMEM; ++ ++ *wpriv = *(const struct legion_wmi_private *)context; ++ ++ dev_set_drvdata(&wdev->dev, wpriv); ++ dev_info(&wdev->dev, "Register after probing for WMI.\n"); ++ return 0; ++} ++ ++static const struct legion_wmi_private legion_wmi_context_gamezone = { ++ .event = LEGION_WMI_EVENT_GAMEZONE ++}; ++static const struct legion_wmi_private legion_wmi_context_a = { ++ .event = LEGION_EVENT_A ++}; ++static const struct legion_wmi_private legion_wmi_context_b = { ++ .event = LEGION_EVENT_B ++}; ++static const struct legion_wmi_private legion_wmi_context_c = { ++ .event = LEGION_EVENT_C ++}; ++static const struct legion_wmi_private legion_wmi_context_d = { ++ .event = LEGION_EVENT_D ++}; ++static const struct legion_wmi_private legion_wmi_context_e = { ++ .event = LEGION_EVENT_E ++}; ++static const struct legion_wmi_private legion_wmi_context_f = { ++ .event = LEGION_EVENT_F ++}; ++ ++// check if really a method ++#define LEGION_WMI_GAMEZONE_GUID "887B54E3-DDDC-4B2C-8B88-68A26A8835D0" ++ ++#define LEGION_WMI_GUID_FAN_EVENT "D320289E-8FEA-41E0-86F9-611D83151B5F" ++#define LEGION_WMI_GUID_FAN2_EVENT "bc72a435-e8c1-4275-b3e2-d8b8074aba59" ++#define LEGION_WMI_GUID_GAMEZONE_KEY_EVENT \ ++ "10afc6d9-ea8b-4590-a2e7-1cd3c84bb4b1" ++#define LEGION_WMI_GUID_GAMEZONE_GPU_EVENT \ ++ "bfd42481-aee3-4502-a107-afb68425c5f8" ++#define LEGION_WMI_GUID_GAMEZONE_OC_EVENT "d062906b-12d4-4510-999d-4831ee80e985" ++#define LEGION_WMI_GUID_GAMEZONE_TEMP_EVENT \ ++ "bfd42481-aee3-4501-a107-afb68425c5f8" ++//#define LEGION_WMI_GUID_GAMEZONE_DATA_EVENT "887b54e3-dddc-4b2c-8b88-68a26a8835d0" ++ ++static const struct wmi_device_id legion_wmi_ids[] = { ++ { LEGION_WMI_GAMEZONE_GUID, &legion_wmi_context_gamezone }, ++ { LEGION_WMI_GUID_FAN_EVENT, &legion_wmi_context_a }, ++ { LEGION_WMI_GUID_FAN2_EVENT, &legion_wmi_context_b }, ++ { LEGION_WMI_GUID_GAMEZONE_KEY_EVENT, &legion_wmi_context_c }, ++ { LEGION_WMI_GUID_GAMEZONE_GPU_EVENT, &legion_wmi_context_d }, ++ { LEGION_WMI_GUID_GAMEZONE_OC_EVENT, &legion_wmi_context_e }, ++ { LEGION_WMI_GUID_GAMEZONE_TEMP_EVENT, &legion_wmi_context_f }, ++ { "8FC0DE0C-B4E4-43FD-B0F3-8871711C1294", ++ &legion_wmi_context_gamezone }, /* Legion 5 */ ++ {}, ++}; ++MODULE_DEVICE_TABLE(wmi, legion_wmi_ids); ++ ++static struct wmi_driver legion_wmi_driver = { ++ .driver = { ++ .name = "legion_wmi", ++ }, ++ .id_table = legion_wmi_ids, ++ .probe = legion_wmi_probe, ++ .notify = legion_wmi_notify, ++}; ++ 
++//acpi_status status = wmi_install_notify_handler(LEGION_WMI_GAMEZONE_GUID, ++// legion_wmi_notify2, NULL); ++//if (ACPI_FAILURE(status)) { ++// return -ENODEV; ++//} ++//return 0; ++ ++static int legion_wmi_init(void) ++{ ++ return wmi_driver_register(&legion_wmi_driver); ++} ++ ++static void legion_wmi_exit(void) ++{ ++ // TODO: remove this ++ pr_info("Unloading legion WMI\n"); ++ ++ //wmi_remove_notify_handler(LEGION_WMI_GAMEZONE_GUID); ++ wmi_driver_unregister(&legion_wmi_driver); ++ pr_info("Unloading legion WMI done\n"); ++} ++ ++/* ============================= */ ++/* Platform profile */ ++/* ============================ */ ++ ++enum LEGION_POWERMODE { ++ LEGION_POWERMODE_BALANCED = 0, ++ LEGION_POWERMODE_PERFORMANCE = 1, ++ LEGION_POWERMODE_QUIET = 2, ++}; ++ ++static int legion_platform_profile_get(struct platform_profile_handler *pprof, ++ enum platform_profile_option *profile) ++{ ++ int powermode; ++ struct legion_private *priv; ++ ++ priv = container_of(pprof, struct legion_private, ++ platform_profile_handler); ++ powermode = read_powermode(&priv->ecram, priv->conf); ++ ++ switch (powermode) { ++ case LEGION_POWERMODE_BALANCED: ++ *profile = PLATFORM_PROFILE_BALANCED; ++ break; ++ case LEGION_POWERMODE_PERFORMANCE: ++ *profile = PLATFORM_PROFILE_PERFORMANCE; ++ break; ++ case LEGION_POWERMODE_QUIET: ++ *profile = PLATFORM_PROFILE_QUIET; ++ break; ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++static int legion_platform_profile_set(struct platform_profile_handler *pprof, ++ enum platform_profile_option profile) ++{ ++ int powermode; ++ struct legion_private *priv; ++ ++ priv = container_of(pprof, struct legion_private, ++ platform_profile_handler); ++ ++ switch (profile) { ++ case PLATFORM_PROFILE_BALANCED: ++ powermode = LEGION_POWERMODE_BALANCED; ++ break; ++ case PLATFORM_PROFILE_PERFORMANCE: ++ powermode = LEGION_POWERMODE_PERFORMANCE; ++ break; ++ case PLATFORM_PROFILE_QUIET: ++ powermode = LEGION_POWERMODE_QUIET; ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return write_powermode(&priv->ecram, priv->conf, powermode); ++} ++ ++static int legion_platform_profile_init(struct legion_private *priv) ++{ ++ int err; ++ ++ priv->platform_profile_handler.profile_get = ++ legion_platform_profile_get; ++ priv->platform_profile_handler.profile_set = ++ legion_platform_profile_set; ++ ++ set_bit(PLATFORM_PROFILE_QUIET, priv->platform_profile_handler.choices); ++ set_bit(PLATFORM_PROFILE_BALANCED, ++ priv->platform_profile_handler.choices); ++ set_bit(PLATFORM_PROFILE_PERFORMANCE, ++ priv->platform_profile_handler.choices); ++ ++ err = platform_profile_register(&priv->platform_profile_handler); ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++static void legion_platform_profile_exit(struct legion_private *priv) ++{ ++ pr_info("Unloading legion platform profile\n"); ++ platform_profile_remove(); ++ pr_info("Unloading legion platform profile done\n"); ++} ++ ++/* ============================= */ ++/* hwom interface */ ++/* ============================ */ ++ ++// hw-mon interface ++ ++// todo: register_group or register_info? ++ ++// TODO: use one common function (like here) or one function per attribute? 
++static ssize_t sensor_label_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int sensor_id = (to_sensor_dev_attr(attr))->index; ++ const char *label; ++ ++ switch (sensor_id) { ++ case SENSOR_CPU_TEMP_ID: ++ label = "CPU Temperature\n"; ++ break; ++ case SENSOR_GPU_TEMP_ID: ++ label = "GPU Temperature\n"; ++ break; ++ case SENSOR_IC_TEMP_ID: ++ label = "IC Temperature\n"; ++ break; ++ case SENSOR_FAN1_RPM_ID: ++ label = "Fan 1\n"; ++ break; ++ case SENSOR_FAN2_RPM_ID: ++ label = "Fan 2\n"; ++ break; ++ case SENSOR_FAN1_TARGET_RPM_ID: ++ label = "Fan 1 Target\n"; ++ break; ++ case SENSOR_FAN2_TARGET_RPM_ID: ++ label = "Fan 2 Target\n"; ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return sprintf(buf, label); ++} ++ ++// TODO: use one common function (like here) or one function per attribute? ++static ssize_t sensor_show(struct device *dev, struct device_attribute *devattr, ++ char *buf) ++{ ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int sensor_id = (to_sensor_dev_attr(devattr))->index; ++ struct sensor_values values; ++ int outval; ++ ++ read_sensor_values(&priv->ecram, priv->conf, &values); ++ ++ switch (sensor_id) { ++ case SENSOR_CPU_TEMP_ID: ++ outval = 1000 * values.cpu_temp_celsius; ++ break; ++ case SENSOR_GPU_TEMP_ID: ++ outval = 1000 * values.gpu_temp_celsius; ++ break; ++ case SENSOR_IC_TEMP_ID: ++ outval = 1000 * values.ic_temp_celsius; ++ break; ++ case SENSOR_FAN1_RPM_ID: ++ outval = values.fan1_rpm; ++ break; ++ case SENSOR_FAN2_RPM_ID: ++ outval = values.fan2_rpm; ++ break; ++ case SENSOR_FAN1_TARGET_RPM_ID: ++ outval = values.fan1_target_rpm; ++ break; ++ case SENSOR_FAN2_TARGET_RPM_ID: ++ outval = values.fan2_target_rpm; ++ break; ++ default: ++ pr_info("Error reading sensor value with id %d\n", sensor_id); ++ return -EOPNOTSUPP; ++ } ++ ++ return sprintf(buf, "%d\n", outval); ++} ++ ++static SENSOR_DEVICE_ATTR_RO(temp1_input, sensor, SENSOR_CPU_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(temp1_label, sensor_label, SENSOR_CPU_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(temp2_input, sensor, SENSOR_GPU_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(temp2_label, sensor_label, SENSOR_GPU_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(temp3_input, sensor, SENSOR_IC_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(temp3_label, sensor_label, SENSOR_IC_TEMP_ID); ++static SENSOR_DEVICE_ATTR_RO(fan1_input, sensor, SENSOR_FAN1_RPM_ID); ++static SENSOR_DEVICE_ATTR_RO(fan1_label, sensor_label, SENSOR_FAN1_RPM_ID); ++static SENSOR_DEVICE_ATTR_RO(fan2_input, sensor, SENSOR_FAN2_RPM_ID); ++static SENSOR_DEVICE_ATTR_RO(fan2_label, sensor_label, SENSOR_FAN2_RPM_ID); ++static SENSOR_DEVICE_ATTR_RO(fan1_target, sensor, SENSOR_FAN1_TARGET_RPM_ID); ++static SENSOR_DEVICE_ATTR_RO(fan2_target, sensor, SENSOR_FAN2_TARGET_RPM_ID); ++ ++static struct attribute *sensor_hwmon_attributes[] = { ++ &sensor_dev_attr_temp1_input.dev_attr.attr, ++ &sensor_dev_attr_temp1_label.dev_attr.attr, ++ &sensor_dev_attr_temp2_input.dev_attr.attr, ++ &sensor_dev_attr_temp2_label.dev_attr.attr, ++ &sensor_dev_attr_temp3_input.dev_attr.attr, ++ &sensor_dev_attr_temp3_label.dev_attr.attr, ++ &sensor_dev_attr_fan1_input.dev_attr.attr, ++ &sensor_dev_attr_fan1_label.dev_attr.attr, ++ &sensor_dev_attr_fan2_input.dev_attr.attr, ++ &sensor_dev_attr_fan2_label.dev_attr.attr, ++ &sensor_dev_attr_fan1_target.dev_attr.attr, ++ &sensor_dev_attr_fan2_target.dev_attr.attr, ++ NULL ++}; ++ ++static ssize_t autopoint_show(struct device *dev, ++ struct device_attribute *devattr, char *buf) ++{ ++ struct 
fancurve fancurve; ++ int err; ++ int value; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int fancurve_attr_id = to_sensor_dev_attr_2(devattr)->nr; ++ int point_id = to_sensor_dev_attr_2(devattr)->index; ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = read_fancurve(&priv->ecram, priv->conf, &fancurve); ++ mutex_unlock(&priv->fancurve_mutex); ++ ++ if (err) { ++ pr_info("Reading fancurve failed\n"); ++ return -EOPNOTSUPP; ++ } ++ if (!(point_id >= 0 && point_id < MAXFANCURVESIZE)) { ++ pr_info("Reading fancurve failed due to wrong point id: %d\n", ++ point_id); ++ return -EOPNOTSUPP; ++ } ++ ++ switch (fancurve_attr_id) { ++ case FANCURVE_ATTR_PWM1: ++ value = fancurve.points[point_id].rpm1_raw * 100; ++ break; ++ case FANCURVE_ATTR_PWM2: ++ value = fancurve.points[point_id].rpm2_raw * 100; ++ break; ++ case FANCURVE_ATTR_CPU_TEMP: ++ value = fancurve.points[point_id].cpu_max_temp_celsius; ++ break; ++ case FANCURVE_ATTR_CPU_HYST: ++ value = fancurve.points[point_id].cpu_min_temp_celsius; ++ break; ++ case FANCURVE_ATTR_GPU_TEMP: ++ value = fancurve.points[point_id].gpu_max_temp_celsius; ++ break; ++ case FANCURVE_ATTR_GPU_HYST: ++ value = fancurve.points[point_id].gpu_min_temp_celsius; ++ break; ++ case FANCURVE_ATTR_IC_TEMP: ++ value = fancurve.points[point_id].ic_max_temp_celsius; ++ break; ++ case FANCURVE_ATTR_IC_HYST: ++ value = fancurve.points[point_id].ic_min_temp_celsius; ++ break; ++ case FANCURVE_ATTR_ACCEL: ++ value = fancurve.points[point_id].accel; ++ break; ++ case FANCURVE_ATTR_DECEL: ++ value = fancurve.points[point_id].decel; ++ break; ++ case FANCURVE_SIZE: ++ value = fancurve.size; ++ break; ++ default: ++ pr_info("Reading fancurve failed due to wrong attribute id: %d\n", ++ fancurve_attr_id); ++ return -EOPNOTSUPP; ++ } ++ ++ return sprintf(buf, "%d\n", value); ++} ++ ++static ssize_t autopoint_store(struct device *dev, ++ struct device_attribute *devattr, ++ const char *buf, size_t count) ++{ ++ struct fancurve fancurve; ++ int err; ++ int value; ++ bool valid; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ int fancurve_attr_id = to_sensor_dev_attr_2(devattr)->nr; ++ int point_id = to_sensor_dev_attr_2(devattr)->index; ++ ++ if (!(point_id >= 0 && point_id < MAXFANCURVESIZE)) { ++ pr_info("Reading fancurve failed due to wrong point id: %d\n", ++ point_id); ++ err = -EOPNOTSUPP; ++ goto error; ++ } ++ ++ err = kstrtoint(buf, 0, &value); ++ if (err) { ++ pr_info("Parse for hwmon store is not succesful: error:%d; point_id: %d; fancurve_attr_id: %d\\n", ++ err, point_id, fancurve_attr_id); ++ goto error; ++ } ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = read_fancurve(&priv->ecram, priv->conf, &fancurve); ++ ++ if (err) { ++ pr_info("Reading fancurve failed\n"); ++ err = -EOPNOTSUPP; ++ goto error_mutex; ++ } ++ ++ switch (fancurve_attr_id) { ++ case FANCURVE_ATTR_PWM1: ++ valid = fancurve_set_rpm1(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_PWM2: ++ valid = fancurve_set_rpm2(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_CPU_TEMP: ++ valid = fancurve_set_cpu_temp_max(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_CPU_HYST: ++ valid = fancurve_set_cpu_temp_min(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_GPU_TEMP: ++ valid = fancurve_set_gpu_temp_max(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_GPU_HYST: ++ valid = fancurve_set_gpu_temp_min(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_IC_TEMP: ++ valid = fancurve_set_ic_temp_max(&fancurve, point_id, 
value); ++ break; ++ case FANCURVE_ATTR_IC_HYST: ++ valid = fancurve_set_ic_temp_min(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_ACCEL: ++ valid = fancurve_set_accel(&fancurve, point_id, value); ++ break; ++ case FANCURVE_ATTR_DECEL: ++ valid = fancurve_set_decel(&fancurve, point_id, value); ++ break; ++ case FANCURVE_SIZE: ++ valid = fancurve_set_size(&fancurve, value, true); ++ break; ++ default: ++ pr_info("Writing fancurve failed due to wrong attribute id: %d\n", ++ fancurve_attr_id); ++ err = -EOPNOTSUPP; ++ goto error_mutex; ++ } ++ ++ if (!valid) { ++ pr_info("Ignoring invalid fancurve value %d for attribute %d at point %d\n", ++ value, fancurve_attr_id, point_id); ++ err = -EOPNOTSUPP; ++ goto error_mutex; ++ } ++ ++ err = write_fancurve(&priv->ecram, priv->conf, &fancurve, false); ++ if (err) { ++ pr_info("Writing fancurve failed for accessing hwmon at point_id: %d\n", ++ point_id); ++ err = -EOPNOTSUPP; ++ goto error_mutex; ++ } ++ ++ mutex_unlock(&priv->fancurve_mutex); ++ return count; ++ ++error_mutex: ++ mutex_unlock(&priv->fancurve_mutex); ++error: ++ return count; ++} ++ ++// rpm1 ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_pwm, autopoint, ++ FANCURVE_ATTR_PWM1, 9); ++// rpm2 ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_pwm, autopoint, ++ FANCURVE_ATTR_PWM2, 9); ++// CPU temp ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 4); ++static 
SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_temp, autopoint, ++ FANCURVE_ATTR_CPU_TEMP, 9); ++// CPU temp hyst ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_temp_hyst, autopoint, ++ FANCURVE_ATTR_CPU_HYST, 9); ++// GPU temp ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_temp, autopoint, ++ FANCURVE_ATTR_GPU_TEMP, 9); ++// GPU temp hyst ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point1_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point2_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point3_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point4_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point5_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point6_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point7_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point8_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point9_temp_hyst, autopoint, ++ FANCURVE_ATTR_GPU_HYST, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm2_auto_point10_temp_hyst, autopoint, ++ 
FANCURVE_ATTR_GPU_HYST, 9); ++// IC temp ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point1_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point2_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point3_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point4_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point5_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point6_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point7_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point8_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point9_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point10_temp, autopoint, ++ FANCURVE_ATTR_IC_TEMP, 9); ++// IC temp hyst ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point1_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point2_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point3_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point4_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point5_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point6_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point7_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point8_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point9_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm3_auto_point10_temp_hyst, autopoint, ++ FANCURVE_ATTR_IC_HYST, 9); ++// accel ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 4); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_accel, autopoint, ++ FANCURVE_ATTR_ACCEL, 9); ++// decel ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point1_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 0); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point2_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 1); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point3_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 2); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point4_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 3); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point5_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 4); ++static 
SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point6_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 5); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point7_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 6); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point8_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 7); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point9_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 8); ++static SENSOR_DEVICE_ATTR_2_RW(pwm1_auto_point10_decel, autopoint, ++ FANCURVE_ATTR_DECEL, 9); ++//size ++static SENSOR_DEVICE_ATTR_2_RW(auto_points_size, autopoint, FANCURVE_SIZE, 0); ++ ++static ssize_t minifancurve_show(struct device *dev, ++ struct device_attribute *devattr, char *buf) ++{ ++ bool value; ++ int err; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = read_minifancurve(&priv->ecram, priv->conf, &value); ++ if (err) { ++ err = -1; ++ pr_info("Reading minifancurve not succesful\n"); ++ goto error_unlock; ++ } ++ mutex_unlock(&priv->fancurve_mutex); ++ return sprintf(buf, "%d\n", value); ++ ++error_unlock: ++ mutex_unlock(&priv->fancurve_mutex); ++ return -1; ++} ++ ++static ssize_t minifancurve_store(struct device *dev, ++ struct device_attribute *devattr, ++ const char *buf, size_t count) ++{ ++ int value; ++ int err; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ err = kstrtoint(buf, 0, &value); ++ if (err) { ++ err = -1; ++ pr_info("Parse for hwmon store is not succesful: error:%d\n", ++ err); ++ goto error; ++ } ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = write_minifancurve(&priv->ecram, priv->conf, value); ++ if (err) { ++ err = -1; ++ pr_info("Writing minifancurve not succesful\n"); ++ goto error_unlock; ++ } ++ mutex_unlock(&priv->fancurve_mutex); ++ return count; ++ ++error_unlock: ++ mutex_unlock(&priv->fancurve_mutex); ++error: ++ return err; ++} ++ ++static SENSOR_DEVICE_ATTR_RW(minifancurve, minifancurve, 0); ++ ++static ssize_t pwm1_mode_show(struct device *dev, ++ struct device_attribute *devattr, char *buf) ++{ ++ bool value; ++ int err; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = read_maximumfanspeed(&priv->ecram, priv->conf, &value); ++ if (err) { ++ err = -1; ++ pr_info("Reading pwm1_mode/maximumfanspeed not succesful\n"); ++ goto error_unlock; ++ } ++ mutex_unlock(&priv->fancurve_mutex); ++ return sprintf(buf, "%d\n", value ? 
0 : 2); ++ ++error_unlock: ++ mutex_unlock(&priv->fancurve_mutex); ++ return -1; ++} ++ ++static ssize_t pwm1_mode_store(struct device *dev, ++ struct device_attribute *devattr, ++ const char *buf, size_t count) ++{ ++ int value; ++ int is_maximumfanspeed; ++ int err; ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ err = kstrtoint(buf, 0, &value); ++ if (err) { ++ err = -1; ++ pr_info("Parse for hwmon store is not succesful: error:%d\n", ++ err); ++ goto error; ++ } ++ is_maximumfanspeed = value == 0; ++ ++ mutex_lock(&priv->fancurve_mutex); ++ err = write_maximumfanspeed(&priv->ecram, priv->conf, ++ is_maximumfanspeed); ++ if (err) { ++ err = -1; ++ pr_info("Writing pwm1_mode/maximumfanspeed not succesful\n"); ++ goto error_unlock; ++ } ++ mutex_unlock(&priv->fancurve_mutex); ++ return count; ++ ++error_unlock: ++ mutex_unlock(&priv->fancurve_mutex); ++error: ++ return err; ++} ++ ++static SENSOR_DEVICE_ATTR_RW(pwm1_mode, pwm1_mode, 0); ++ ++static struct attribute *fancurve_hwmon_attributes[] = { ++ &sensor_dev_attr_pwm1_auto_point1_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point2_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point3_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point4_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point5_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point6_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point7_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point8_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point9_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point10_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point1_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point2_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point3_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point4_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point5_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point6_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point7_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point8_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point9_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point10_pwm.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point1_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point2_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point3_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point4_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point5_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point6_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point7_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point8_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point9_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point10_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point1_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point2_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point3_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point4_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point5_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point6_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point7_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point8_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point9_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point10_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point1_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point2_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point3_temp.dev_attr.attr, ++ 
&sensor_dev_attr_pwm2_auto_point4_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point5_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point6_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point7_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point8_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point9_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point10_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point1_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point2_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point3_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point4_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point5_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point6_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point7_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point8_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point9_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm2_auto_point10_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point1_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point2_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point3_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point4_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point5_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point6_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point7_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point8_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point9_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point10_temp.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point1_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point2_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point3_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point4_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point5_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point6_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point7_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point8_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point9_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm3_auto_point10_temp_hyst.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point1_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point2_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point3_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point4_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point5_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point6_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point7_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point8_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point9_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point10_accel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point1_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point2_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point3_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point4_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point5_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point6_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point7_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point8_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point9_decel.dev_attr.attr, ++ &sensor_dev_attr_pwm1_auto_point10_decel.dev_attr.attr, ++ // ++ &sensor_dev_attr_auto_points_size.dev_attr.attr, ++ &sensor_dev_attr_minifancurve.dev_attr.attr, ++ &sensor_dev_attr_pwm1_mode.dev_attr.attr, NULL ++}; ++ ++static umode_t 
legion_is_visible(struct kobject *kobj, struct attribute *attr, ++ int idx) ++{ ++ bool supported = true; ++ struct device *dev = kobj_to_dev(kobj); ++ struct legion_private *priv = dev_get_drvdata(dev); ++ ++ if (attr == &sensor_dev_attr_minifancurve.dev_attr.attr) ++ supported = priv->conf->has_minifancurve; ++ ++ return supported ? attr->mode : 0; ++} ++ ++static const struct attribute_group legion_hwmon_sensor_group = { ++ .attrs = sensor_hwmon_attributes, ++ .is_visible = NULL ++}; ++ ++static const struct attribute_group legion_hwmon_fancurve_group = { ++ .attrs = fancurve_hwmon_attributes, ++ .is_visible = legion_is_visible, ++}; ++ ++static const struct attribute_group *legion_hwmon_groups[] = { ++ &legion_hwmon_sensor_group, &legion_hwmon_fancurve_group, NULL ++}; ++ ++ssize_t legion_hwmon_init(struct legion_private *priv) ++{ ++ //TODO: use hwmon_device_register_with_groups or ++ // hwmon_device_register_with_info (latter means all hwmon functions have to be ++ // changed) ++ // some laptop driver do it in one way, some in the other ++ // TODO: Use devm_hwmon_device_register_with_groups ? ++ // some laptop drivers use this, some ++ struct device *hwmon_dev = hwmon_device_register_with_groups( ++ &priv->platform_device->dev, "legion_hwmon", priv, ++ legion_hwmon_groups); ++ if (IS_ERR_OR_NULL(hwmon_dev)) { ++ pr_err("hwmon_device_register failed!\n"); ++ return PTR_ERR(hwmon_dev); ++ } ++ dev_set_drvdata(hwmon_dev, priv); ++ priv->hwmon_dev = hwmon_dev; ++ return 0; ++} ++ ++void legion_hwmon_exit(struct legion_private *priv) ++{ ++ pr_info("Unloading legion hwon\n"); ++ if (priv->hwmon_dev) { ++ hwmon_device_unregister(priv->hwmon_dev); ++ priv->hwmon_dev = NULL; ++ } ++ pr_info("Unloading legion hwon done\n"); ++} ++ ++/* ============================= */ ++/* Platform driver */ ++/* ============================ */ ++ ++int legion_add(struct platform_device *pdev) ++{ ++ struct legion_private *priv; ++ const struct dmi_system_id *dmi_sys; ++ int err; ++ u16 ec_read_id; ++ bool is_denied = true; ++ bool is_allowed = false; ++ bool do_load_by_list = false; ++ bool do_load = false; ++ //struct legion_private *priv = dev_get_drvdata(&pdev->dev); ++ dev_info(&pdev->dev, "legion_laptop platform driver probing\n"); ++ ++ dev_info(&pdev->dev, "Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n", ++ dmi_get_system_info(DMI_SYS_VENDOR), ++ dmi_get_system_info(DMI_PRODUCT_NAME), ++ dmi_get_system_info(DMI_BIOS_VERSION)); ++ ++ // TODO: allocate? ++ priv = &_priv; ++ priv->platform_device = pdev; ++ err = legion_shared_init(priv); ++ if (err) { ++ dev_info(&pdev->dev, "legion_laptop is forced to load.\n"); ++ goto err_legion_shared_init; ++ } ++ dev_set_drvdata(&pdev->dev, priv); ++ ++ // TODO: remove ++ pr_info("Read identifying information: DMI_SYS_VENDOR: %s; DMI_PRODUCT_NAME: %s; DMI_BIOS_VERSION:%s\n", ++ dmi_get_system_info(DMI_SYS_VENDOR), ++ dmi_get_system_info(DMI_PRODUCT_NAME), ++ dmi_get_system_info(DMI_BIOS_VERSION)); ++ ++ dmi_sys = dmi_first_match(optimistic_allowlist); ++ is_allowed = dmi_sys != NULL; ++ is_denied = dmi_check_system(denylist); ++ do_load_by_list = is_allowed && !is_denied; ++ do_load = do_load_by_list || force; ++ ++ dev_info( ++ &pdev->dev, ++ "is_denied: %d; is_allowed: %d; do_load_by_list: %d; do_load: %d\n", ++ is_denied, is_allowed, do_load_by_list, do_load); ++ ++ if (!(do_load)) { ++ dev_info( ++ &pdev->dev, ++ "Module not useable for this laptop because it is not in allowlist. 
Notify maintainer if you want to add your device or force load with param force.\n"); ++ err = -ENOMEM; ++ goto err_model_mismtach; ++ } ++ ++ if (force) ++ dev_info(&pdev->dev, "legion_laptop is forced to load.\n"); ++ ++ if (!do_load_by_list && do_load) { ++ dev_info( ++ &pdev->dev, ++ "legion_laptop is forced to load and would otherwise be not loaded\n"); ++ } ++ ++ // if forced and no module found, use config for first model ++ if (dmi_sys == NULL) ++ dmi_sys = &optimistic_allowlist[0]; ++ dev_info(&pdev->dev, "Using configuration for system: %s\n", ++ dmi_sys->ident); ++ ++ priv->conf = dmi_sys->driver_data; ++ ++ err = ecram_init(&priv->ecram, priv->conf->memoryio_physical_ec_start, ++ priv->conf->memoryio_size); ++ if (err) { ++ dev_info(&pdev->dev, ++ "Could not init access to embedded controller\n"); ++ goto err_ecram_init; ++ } ++ ++ ec_read_id = read_ec_id(&priv->ecram, priv->conf); ++ dev_info(&pdev->dev, "Read embedded controller ID 0x%x\n", ec_read_id); ++ if (priv->conf->check_embedded_controller_id && ++ !(ec_read_id == priv->conf->embedded_controller_id)) { ++ err = -ENOMEM; ++ dev_info(&pdev->dev, "Expected EC chip id 0x%x but read 0x%x\n", ++ priv->conf->embedded_controller_id, ec_read_id); ++ goto err_ecram_id; ++ } ++ if (!priv->conf->check_embedded_controller_id) { ++ dev_info(&pdev->dev, ++ "Skipped checking embedded controller id\n"); ++ } ++ ++ dev_info(&pdev->dev, "Creating debugfs inteface\n"); ++ legion_debugfs_init(priv); ++ ++ pr_info("Creating sysfs inteface\n"); ++ err = legion_sysfs_init(priv); ++ if (err) { ++ dev_info(&pdev->dev, "Creating sysfs interface failed\n"); ++ goto err_sysfs_init; ++ } ++ ++ pr_info("Creating hwmon interface"); ++ err = legion_hwmon_init(priv); ++ if (err) ++ goto err_hwmon_init; ++ ++ pr_info("Creating platform profile support\n"); ++ err = legion_platform_profile_init(priv); ++ if (err) { ++ dev_info(&pdev->dev, "Creating platform profile failed\n"); ++ goto err_platform_profile; ++ } ++ ++ pr_info("Init WMI driver support\n"); ++ err = legion_wmi_init(); ++ if (err) { ++ dev_info(&pdev->dev, "Init WMI driver failed\n"); ++ goto err_wmi; ++ } ++ ++ dev_info(&pdev->dev, "legion_laptop loaded for this device\n"); ++ return 0; ++ ++ // TODO: remove eventually ++ legion_wmi_exit(); ++err_wmi: ++ legion_platform_profile_exit(priv); ++err_platform_profile: ++ legion_hwmon_exit(priv); ++err_hwmon_init: ++ legion_sysfs_exit(priv); ++err_sysfs_init: ++ legion_debugfs_exit(priv); ++err_ecram_id: ++ ecram_exit(&priv->ecram); ++err_ecram_init: ++ legion_shared_exit(priv); ++err_legion_shared_init: ++err_model_mismtach: ++ dev_info(&pdev->dev, "legion_laptop not loaded for this device\n"); ++ return err; ++} ++ ++int legion_remove(struct platform_device *pdev) ++{ ++ struct legion_private *priv = dev_get_drvdata(&pdev->dev); ++ ++ mutex_lock(&legion_shared_mutex); ++ priv->loaded = false; ++ mutex_unlock(&legion_shared_mutex); ++ ++ // first unregister wmi, so toggling powermode does not ++ // generate events anymore that even might be delayed ++ legion_wmi_exit(); ++ legion_platform_profile_exit(priv); ++ ++ // toggle power mode to load default setting from embedded controller ++ // again ++ toggle_powermode(&priv->ecram, priv->conf); ++ ++ legion_hwmon_exit(priv); ++ legion_sysfs_exit(priv); ++ legion_debugfs_exit(priv); ++ ecram_exit(&priv->ecram); ++ legion_shared_exit(priv); ++ ++ pr_info("Legion platform unloaded\n"); ++ return 0; ++} ++ ++int legion_resume(struct platform_device *pdev) ++{ ++ //struct legion_private *priv = 
dev_get_drvdata(&pdev->dev); ++ dev_info(&pdev->dev, "Resumed in legion-laptop\n"); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_PM_SLEEP ++static int legion_pm_resume(struct device *dev) ++{ ++ //struct legion_private *priv = dev_get_drvdata(dev); ++ dev_info(dev, "Resumed PM in legion-laptop\n"); ++ ++ return 0; ++} ++#endif ++static SIMPLE_DEV_PM_OPS(legion_pm, NULL, legion_pm_resume); ++ ++// same as ideapad ++static const struct acpi_device_id legion_device_ids[] = { ++ { "PNP0C09", 0 }, // todo: change to "VPC2004" ++ { "", 0 }, ++}; ++MODULE_DEVICE_TABLE(acpi, legion_device_ids); ++ ++static struct platform_driver legion_driver = { ++ .probe = legion_add, ++ .remove = legion_remove, ++ .resume = legion_resume, ++ .driver = { ++ .name = "legion", ++ .pm = &legion_pm, ++ .acpi_match_table = ACPI_PTR(legion_device_ids), ++ }, ++}; ++ ++int __init legion_init(void) ++{ ++ int err; ++ ++ pr_info("legion_laptop starts loading\n"); ++ err = platform_driver_register(&legion_driver); ++ if (err) { ++ pr_info("legion_laptop: platform_driver_register failed\n"); ++ return err; ++ } ++ ++ return 0; ++} ++ ++module_init(legion_init); ++ ++void __exit legion_exit(void) ++{ ++ platform_driver_unregister(&legion_driver); ++ pr_info("legion_laptop exit\n"); ++} ++ ++module_exit(legion_exit); diff --git a/drivers/platform/x86/steamdeck.c b/drivers/platform/x86/steamdeck.c new file mode 100644 index 000000000000..77a6677ec19e @@ -7079,10 +9895,10 @@ index ab0c5bd1a60f..f4989f706d7f 100644 -- 2.40.1 -From a6fac309dae53f34208de29f5b82d053ca55eed6 Mon Sep 17 00:00:00 2001 +From 9e165ac849652399c952c5e1764ca9a7630a28c7 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Wed, 26 Apr 2023 22:04:18 +0200 -Subject: [PATCH 4/8] fixes +Date: Tue, 25 Apr 2023 17:17:39 +0200 +Subject: [PATCH 04/10] fixes Signed-off-by: Peter Jung --- @@ -9206,10 +12022,12146 @@ index 75020edd39e7..e4455220e9fd 100644 -- 2.40.1 -From 75780f643d87d4f249b25a14bcc99b767209fa2b Mon Sep 17 00:00:00 2001 +From d3a7d6477e59e6015a1e50ac35a341c4aa4c7324 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 28 Apr 2023 19:59:05 +0200 +Subject: [PATCH 05/10] fs-patches + +Signed-off-by: Peter Jung +--- + block/Kconfig | 3 + + block/blk-cgroup.c | 78 +- + block/blk-cgroup.h | 15 +- + block/blk-core.c | 3 - + fs/btrfs/Kconfig | 1 + + fs/btrfs/bio.c | 211 +- + fs/btrfs/bio.h | 22 +- + fs/btrfs/block-group.c | 40 +- + fs/btrfs/block-group.h | 13 +- + fs/btrfs/block-rsv.c | 21 +- + fs/btrfs/block-rsv.h | 2 +- + fs/btrfs/btrfs_inode.h | 35 +- + fs/btrfs/compression.c | 299 +-- + fs/btrfs/compression.h | 20 +- + fs/btrfs/ctree.c | 91 +- + fs/btrfs/ctree.h | 17 +- + fs/btrfs/delalloc-space.c | 2 +- + fs/btrfs/delayed-ref.c | 49 +- + fs/btrfs/delayed-ref.h | 22 +- + fs/btrfs/disk-io.c | 147 +- + fs/btrfs/extent-tree.c | 37 +- + fs/btrfs/extent_io.c | 550 ++-- + fs/btrfs/file-item.c | 93 +- + fs/btrfs/file-item.h | 3 +- + fs/btrfs/fs.h | 53 +- + fs/btrfs/inode-item.c | 15 +- + fs/btrfs/inode.c | 375 ++- + fs/btrfs/ioctl.c | 5 + + fs/btrfs/locking.c | 25 +- + fs/btrfs/locking.h | 5 +- + fs/btrfs/lru_cache.h | 5 - + fs/btrfs/lzo.c | 17 +- + fs/btrfs/messages.c | 2 +- + fs/btrfs/messages.h | 2 +- + fs/btrfs/ordered-data.c | 120 +- + fs/btrfs/ordered-data.h | 10 +- + fs/btrfs/raid56.c | 162 +- + fs/btrfs/raid56.h | 12 +- + fs/btrfs/relocation.c | 6 +- + fs/btrfs/scrub.c | 4142 +++++++++-------------------- + fs/btrfs/send.c | 2 +- + fs/btrfs/space-info.c | 32 +- + fs/btrfs/space-info.h | 1 + + fs/btrfs/super.c | 3 +- + fs/btrfs/sysfs.c | 5 + + 
fs/btrfs/tests/extent-map-tests.c | 1 - + fs/btrfs/transaction.c | 28 +- + fs/btrfs/tree-checker.c | 14 + + fs/btrfs/tree-log.c | 171 +- + fs/btrfs/volumes.c | 593 ++--- + fs/btrfs/volumes.h | 85 +- + fs/btrfs/zlib.c | 2 - + fs/btrfs/zoned.c | 4 +- + fs/btrfs/zstd.c | 1 - + include/linux/bio.h | 5 + + include/linux/blk_types.h | 18 +- + include/linux/crc32c.h | 1 - + include/linux/writeback.h | 5 - + include/trace/events/btrfs.h | 2 +- + include/uapi/linux/btrfs.h | 1 + + lib/libcrc32c.c | 6 - + tools/objtool/check.c | 1 + + 62 files changed, 2867 insertions(+), 4844 deletions(-) + +diff --git a/block/Kconfig b/block/Kconfig +index 941b2dca70db..69ccf7457ae1 100644 +--- a/block/Kconfig ++++ b/block/Kconfig +@@ -41,6 +41,9 @@ config BLK_RQ_ALLOC_TIME + config BLK_CGROUP_RWSTAT + bool + ++config BLK_CGROUP_PUNT_BIO ++ bool ++ + config BLK_DEV_BSG_COMMON + tristate + +diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c +index bd50b55bdb61..18c922579719 100644 +--- a/block/blk-cgroup.c ++++ b/block/blk-cgroup.c +@@ -56,7 +56,6 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; + static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ + + bool blkcg_debug_stats = false; +-static struct workqueue_struct *blkcg_punt_bio_wq; + + #define BLKG_DESTROY_BATCH_SIZE 64 + +@@ -166,7 +165,9 @@ static void __blkg_release(struct rcu_head *rcu) + { + struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + ++#ifdef CONFIG_BLK_CGROUP_PUNT_BIO + WARN_ON(!bio_list_empty(&blkg->async_bios)); ++#endif + + /* release the blkcg and parent blkg refs this blkg has been holding */ + css_put(&blkg->blkcg->css); +@@ -188,6 +189,9 @@ static void blkg_release(struct percpu_ref *ref) + call_rcu(&blkg->rcu_head, __blkg_release); + } + ++#ifdef CONFIG_BLK_CGROUP_PUNT_BIO ++static struct workqueue_struct *blkcg_punt_bio_wq; ++ + static void blkg_async_bio_workfn(struct work_struct *work) + { + struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, +@@ -198,10 +202,10 @@ static void blkg_async_bio_workfn(struct work_struct *work) + bool need_plug = false; + + /* as long as there are pending bios, @blkg can't go away */ +- spin_lock_bh(&blkg->async_bio_lock); ++ spin_lock(&blkg->async_bio_lock); + bio_list_merge(&bios, &blkg->async_bios); + bio_list_init(&blkg->async_bios); +- spin_unlock_bh(&blkg->async_bio_lock); ++ spin_unlock(&blkg->async_bio_lock); + + /* start plug only when bio_list contains at least 2 bios */ + if (bios.head && bios.head->bi_next) { +@@ -214,6 +218,40 @@ static void blkg_async_bio_workfn(struct work_struct *work) + blk_finish_plug(&plug); + } + ++/* ++ * When a shared kthread issues a bio for a cgroup, doing so synchronously can ++ * lead to priority inversions as the kthread can be trapped waiting for that ++ * cgroup. Use this helper instead of submit_bio to punt the actual issuing to ++ * a dedicated per-blkcg work item to avoid such priority inversions. 
++ */ ++void blkcg_punt_bio_submit(struct bio *bio) ++{ ++ struct blkcg_gq *blkg = bio->bi_blkg; ++ ++ if (blkg->parent) { ++ spin_lock(&blkg->async_bio_lock); ++ bio_list_add(&blkg->async_bios, bio); ++ spin_unlock(&blkg->async_bio_lock); ++ queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); ++ } else { ++ /* never bounce for the root cgroup */ ++ submit_bio(bio); ++ } ++} ++EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit); ++ ++static int __init blkcg_punt_bio_init(void) ++{ ++ blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", ++ WQ_MEM_RECLAIM | WQ_FREEZABLE | ++ WQ_UNBOUND | WQ_SYSFS, 0); ++ if (!blkcg_punt_bio_wq) ++ return -ENOMEM; ++ return 0; ++} ++subsys_initcall(blkcg_punt_bio_init); ++#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */ ++ + /** + * bio_blkcg_css - return the blkcg CSS associated with a bio + * @bio: target bio +@@ -269,10 +307,12 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, + + blkg->q = disk->queue; + INIT_LIST_HEAD(&blkg->q_node); ++ blkg->blkcg = blkcg; ++#ifdef CONFIG_BLK_CGROUP_PUNT_BIO + spin_lock_init(&blkg->async_bio_lock); + bio_list_init(&blkg->async_bios); + INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); +- blkg->blkcg = blkcg; ++#endif + + u64_stats_init(&blkg->iostat.sync); + for_each_possible_cpu(cpu) { +@@ -1688,25 +1728,6 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) + } + EXPORT_SYMBOL_GPL(blkcg_policy_unregister); + +-bool __blkcg_punt_bio_submit(struct bio *bio) +-{ +- struct blkcg_gq *blkg = bio->bi_blkg; +- +- /* consume the flag first */ +- bio->bi_opf &= ~REQ_CGROUP_PUNT; +- +- /* never bounce for the root cgroup */ +- if (!blkg->parent) +- return false; +- +- spin_lock_bh(&blkg->async_bio_lock); +- bio_list_add(&blkg->async_bios, bio); +- spin_unlock_bh(&blkg->async_bio_lock); +- +- queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); +- return true; +-} +- + /* + * Scale the accumulated delay based on how long it has been since we updated + * the delay. 
We only call this when we are adding delay, in case it's been a +@@ -2085,16 +2106,5 @@ bool blk_cgroup_congested(void) + return ret; + } + +-static int __init blkcg_init(void) +-{ +- blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", +- WQ_MEM_RECLAIM | WQ_FREEZABLE | +- WQ_UNBOUND | WQ_SYSFS, 0); +- if (!blkcg_punt_bio_wq) +- return -ENOMEM; +- return 0; +-} +-subsys_initcall(blkcg_init); +- + module_param(blkcg_debug_stats, bool, 0644); + MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); +diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h +index 9c5078755e5e..e98d2c1be354 100644 +--- a/block/blk-cgroup.h ++++ b/block/blk-cgroup.h +@@ -72,9 +72,10 @@ struct blkcg_gq { + struct blkg_iostat_set iostat; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; +- ++#ifdef CONFIG_BLK_CGROUP_PUNT_BIO + spinlock_t async_bio_lock; + struct bio_list async_bios; ++#endif + union { + struct work_struct async_bio_work; + struct work_struct free_work; +@@ -375,16 +376,6 @@ static inline void blkg_put(struct blkcg_gq *blkg) + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q))) + +-bool __blkcg_punt_bio_submit(struct bio *bio); +- +-static inline bool blkcg_punt_bio_submit(struct bio *bio) +-{ +- if (bio->bi_opf & REQ_CGROUP_PUNT) +- return __blkcg_punt_bio_submit(bio); +- else +- return false; +-} +- + static inline void blkcg_bio_issue_init(struct bio *bio) + { + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +@@ -506,8 +497,6 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return + static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } + static inline void blkg_get(struct blkcg_gq *blkg) { } + static inline void blkg_put(struct blkcg_gq *blkg) { } +- +-static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; } + static inline void blkcg_bio_issue_init(struct bio *bio) { } + static inline void blk_cgroup_bio_start(struct bio *bio) { } + static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } +diff --git a/block/blk-core.c b/block/blk-core.c +index 42926e6cb83c..478978dcb2bd 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -830,9 +830,6 @@ EXPORT_SYMBOL(submit_bio_noacct); + */ + void submit_bio(struct bio *bio) + { +- if (blkcg_punt_bio_submit(bio)) +- return; +- + if (bio_op(bio) == REQ_OP_READ) { + task_io_account_read(bio->bi_iter.bi_size); + count_vm_events(PGPGIN, bio_sectors(bio)); +diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig +index 37b6bab90c83..66fa9ab2c046 100644 +--- a/fs/btrfs/Kconfig ++++ b/fs/btrfs/Kconfig +@@ -2,6 +2,7 @@ + + config BTRFS_FS + tristate "Btrfs filesystem support" ++ select BLK_CGROUP_PUNT_BIO + select CRYPTO + select CRYPTO_CRC32C + select LIBCRC32C +diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c +index 726592868e9c..5379c4714905 100644 +--- a/fs/btrfs/bio.c ++++ b/fs/btrfs/bio.c +@@ -31,11 +31,11 @@ struct btrfs_failed_bio { + * Initialize a btrfs_bio structure. This skips the embedded bio itself as it + * is already initialized by the block layer. 
+ */ +-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, ++void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private) + { + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); +- bbio->inode = inode; ++ bbio->fs_info = fs_info; + bbio->end_io = end_io; + bbio->private = private; + atomic_set(&bbio->pending_ios, 1); +@@ -48,41 +48,58 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, + * Just like the underlying bio_alloc_bioset it will not fail as it is backed by + * a mempool. + */ +-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, +- struct btrfs_inode *inode, +- btrfs_bio_end_io_t end_io, void *private) ++struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, ++ struct btrfs_fs_info *fs_info, ++ btrfs_bio_end_io_t end_io, void *private) + { ++ struct btrfs_bio *bbio; + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); +- btrfs_bio_init(btrfs_bio(bio), inode, end_io, private); +- return bio; ++ bbio = btrfs_bio(bio); ++ btrfs_bio_init(bbio, fs_info, end_io, private); ++ return bbio; + } + +-static struct bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, +- struct bio *orig, u64 map_length, +- bool use_append) ++static blk_status_t btrfs_bio_extract_ordered_extent(struct btrfs_bio *bbio) + { +- struct btrfs_bio *orig_bbio = btrfs_bio(orig); ++ struct btrfs_ordered_extent *ordered; ++ int ret; ++ ++ ordered = btrfs_lookup_ordered_extent(bbio->inode, bbio->file_offset); ++ if (WARN_ON_ONCE(!ordered)) ++ return BLK_STS_IOERR; ++ ret = btrfs_extract_ordered_extent(bbio, ordered); ++ btrfs_put_ordered_extent(ordered); ++ ++ return errno_to_blk_status(ret); ++} ++ ++static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, ++ struct btrfs_bio *orig_bbio, ++ u64 map_length, bool use_append) ++{ ++ struct btrfs_bio *bbio; + struct bio *bio; + + if (use_append) { + unsigned int nr_segs; + +- bio = bio_split_rw(orig, &fs_info->limits, &nr_segs, ++ bio = bio_split_rw(&orig_bbio->bio, &fs_info->limits, &nr_segs, + &btrfs_clone_bioset, map_length); + } else { +- bio = bio_split(orig, map_length >> SECTOR_SHIFT, GFP_NOFS, +- &btrfs_clone_bioset); ++ bio = bio_split(&orig_bbio->bio, map_length >> SECTOR_SHIFT, ++ GFP_NOFS, &btrfs_clone_bioset); + } +- btrfs_bio_init(btrfs_bio(bio), orig_bbio->inode, NULL, orig_bbio); +- +- btrfs_bio(bio)->file_offset = orig_bbio->file_offset; +- if (!(orig->bi_opf & REQ_BTRFS_ONE_ORDERED)) ++ bbio = btrfs_bio(bio); ++ btrfs_bio_init(bbio, fs_info, NULL, orig_bbio); ++ bbio->inode = orig_bbio->inode; ++ bbio->file_offset = orig_bbio->file_offset; ++ if (!(orig_bbio->bio.bi_opf & REQ_BTRFS_ONE_ORDERED)) + orig_bbio->file_offset += map_length; + + atomic_inc(&orig_bbio->pending_ios); +- return bio; ++ return bbio; + } + + static void btrfs_orig_write_end_io(struct bio *bio); +@@ -164,7 +181,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, + goto done; + } + +- btrfs_submit_bio(&repair_bbio->bio, mirror); ++ btrfs_submit_bio(repair_bbio, mirror); + return; + } + +@@ -224,15 +241,16 @@ static struct btrfs_failed_bio *repair_one_sector(struct btrfs_bio *failed_bbio, + repair_bio = bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_NOFS, + &btrfs_repair_bioset); + repair_bio->bi_iter.bi_sector = failed_bbio->saved_iter.bi_sector; +- bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); ++ __bio_add_page(repair_bio, bv->bv_page, bv->bv_len, bv->bv_offset); + + 
repair_bbio = btrfs_bio(repair_bio); +- btrfs_bio_init(repair_bbio, failed_bbio->inode, NULL, fbio); ++ btrfs_bio_init(repair_bbio, fs_info, NULL, fbio); ++ repair_bbio->inode = failed_bbio->inode; + repair_bbio->file_offset = failed_bbio->file_offset + bio_offset; + + mirror = next_repair_mirror(fbio, failed_bbio->mirror_num); + btrfs_debug(fs_info, "submitting repair read to mirror %d", mirror); +- btrfs_submit_bio(repair_bio, mirror); ++ btrfs_submit_bio(repair_bbio, mirror); + return fbio; + } + +@@ -246,6 +264,9 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de + struct btrfs_failed_bio *fbio = NULL; + u32 offset = 0; + ++ /* Read-repair requires the inode field to be set by the submitter. */ ++ ASSERT(inode); ++ + /* + * Hand off repair bios to the repair code as there is no upper level + * submitter for them. +@@ -306,17 +327,17 @@ static void btrfs_end_bio_work(struct work_struct *work) + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + + /* Metadata reads are checked and repaired by the submitter. */ +- if (bbio->bio.bi_opf & REQ_META) +- bbio->end_io(bbio); +- else ++ if (bbio->inode && !(bbio->bio.bi_opf & REQ_META)) + btrfs_check_read_bio(bbio, bbio->bio.bi_private); ++ else ++ bbio->end_io(bbio); + } + + static void btrfs_simple_end_io(struct bio *bio) + { + struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_device *dev = bio->bi_private; +- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ struct btrfs_fs_info *fs_info = bbio->fs_info; + + btrfs_bio_counter_dec(fs_info); + +@@ -340,7 +361,8 @@ static void btrfs_raid56_end_io(struct bio *bio) + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; +- if (bio_op(bio) == REQ_OP_READ && !(bbio->bio.bi_opf & REQ_META)) ++ if (bio_op(bio) == REQ_OP_READ && bbio->inode && ++ !(bbio->bio.bi_opf & REQ_META)) + btrfs_check_read_bio(bbio, NULL); + else + btrfs_orig_bbio_end_io(bbio); +@@ -418,7 +440,11 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) + dev->devid, bio->bi_iter.bi_size); + + btrfsic_check_bio(bio); +- submit_bio(bio); ++ ++ if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) ++ blkcg_punt_bio_submit(bio); ++ else ++ submit_bio(bio); + } + + static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) +@@ -534,10 +560,10 @@ static void run_one_async_done(struct btrfs_work *work) + + /* + * All of the bios that pass through here are from async helpers. +- * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context. +- * This changes nothing when cgroups aren't in use. ++ * Use REQ_BTRFS_CGROUP_PUNT to issue them from the owning cgroup's ++ * context. This changes nothing when cgroups aren't in use. + */ +- bio->bi_opf |= REQ_CGROUP_PUNT; ++ bio->bi_opf |= REQ_BTRFS_CGROUP_PUNT; + __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); + } + +@@ -562,7 +588,7 @@ static bool should_async_write(struct btrfs_bio *bbio) + * in order. 
+ */ + if (bbio->bio.bi_opf & REQ_META) { +- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ struct btrfs_fs_info *fs_info = bbio->fs_info; + + if (btrfs_is_zoned(fs_info)) + return false; +@@ -582,7 +608,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, + struct btrfs_io_stripe *smap, int mirror_num) + { +- struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; ++ struct btrfs_fs_info *fs_info = bbio->fs_info; + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); +@@ -603,12 +629,12 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, + return true; + } + +-static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) ++static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) + { +- struct btrfs_bio *bbio = btrfs_bio(bio); + struct btrfs_inode *inode = bbio->inode; +- struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_bio *orig_bbio = bbio; ++ struct bio *bio = &bbio->bio; + u64 logical = bio->bi_iter.bi_sector << 9; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; +@@ -631,15 +657,15 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) + map_length = min(map_length, fs_info->max_zone_append_size); + + if (map_length < length) { +- bio = btrfs_split_bio(fs_info, bio, map_length, use_append); +- bbio = btrfs_bio(bio); ++ bbio = btrfs_split_bio(fs_info, bbio, map_length, use_append); ++ bio = &bbio->bio; + } + + /* + * Save the iter for the end_io handler and preload the checksums for + * data reads. + */ +- if (bio_op(bio) == REQ_OP_READ && !(bio->bi_opf & REQ_META)) { ++ if (bio_op(bio) == REQ_OP_READ && inode && !(bio->bi_opf & REQ_META)) { + bbio->saved_iter = bio->bi_iter; + ret = btrfs_lookup_bio_sums(bbio); + if (ret) +@@ -650,7 +676,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) + if (use_append) { + bio->bi_opf &= ~REQ_OP_WRITE; + bio->bi_opf |= REQ_OP_ZONE_APPEND; +- ret = btrfs_extract_ordered_extent(btrfs_bio(bio)); ++ ret = btrfs_bio_extract_ordered_extent(bbio); + if (ret) + goto fail_put_bio; + } +@@ -659,7 +685,7 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) + * Csum items for reloc roots have already been cloned at this + * point, so they are handled as part of the no-checksum case. + */ +- if (!(inode->flags & BTRFS_INODE_NODATASUM) && ++ if (inode && !(inode->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(inode->root)) { + if (should_async_write(bbio) && +@@ -686,9 +712,12 @@ static bool btrfs_submit_chunk(struct bio *bio, int mirror_num) + return true; + } + +-void btrfs_submit_bio(struct bio *bio, int mirror_num) ++void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num) + { +- while (!btrfs_submit_chunk(bio, mirror_num)) ++ /* If bbio->inode is not populated, its file_offset must be 0. 
*/ ++ ASSERT(bbio->inode || bbio->file_offset == 0); ++ ++ while (!btrfs_submit_chunk(bbio, mirror_num)) + ; + } + +@@ -706,12 +735,9 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) + { +- struct btrfs_device *dev; ++ struct btrfs_io_stripe smap = { 0 }; + struct bio_vec bvec; + struct bio bio; +- u64 map_length = 0; +- u64 sector; +- struct btrfs_io_context *bioc = NULL; + int ret = 0; + + ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); +@@ -720,68 +746,38 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + if (btrfs_repair_one_zone(fs_info, logical)) + return 0; + +- map_length = length; +- + /* + * Avoid races with device replace and make sure our bioc has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); +- if (btrfs_is_parity_mirror(fs_info, logical, length)) { +- /* +- * Note that we don't use BTRFS_MAP_WRITE because it's supposed +- * to update all raid stripes, but here we just want to correct +- * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad +- * stripe's dev and sector. +- */ +- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, +- &map_length, &bioc, 0); +- if (ret) +- goto out_counter_dec; +- ASSERT(bioc->mirror_num == 1); +- } else { +- ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, +- &map_length, &bioc, mirror_num); +- if (ret) +- goto out_counter_dec; +- /* +- * This happens when dev-replace is also running, and the +- * mirror_num indicates the dev-replace target. +- * +- * In this case, we don't need to do anything, as the read +- * error just means the replace progress hasn't reached our +- * read range, and later replace routine would handle it well. +- */ +- if (mirror_num != bioc->mirror_num) +- goto out_counter_dec; +- } +- +- sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; +- dev = bioc->stripes[bioc->mirror_num - 1].dev; +- btrfs_put_bioc(bioc); ++ ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); ++ if (ret < 0) ++ goto out_counter_dec; + +- if (!dev || !dev->bdev || +- !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { ++ if (!smap.dev->bdev || ++ !test_bit(BTRFS_DEV_STATE_WRITEABLE, &smap.dev->dev_state)) { + ret = -EIO; + goto out_counter_dec; + } + +- bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); +- bio.bi_iter.bi_sector = sector; ++ bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); ++ bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { + /* try to remap that extent elsewhere? */ +- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); ++ btrfs_dev_stat_inc_and_print(smap.dev, BTRFS_DEV_STAT_WRITE_ERRS); + goto out_bio_uninit; + } + + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", +- ino, start, btrfs_dev_name(dev), sector); ++ ino, start, btrfs_dev_name(smap.dev), ++ smap.physical >> SECTOR_SHIFT); + ret = 0; + + out_bio_uninit: +@@ -791,6 +787,45 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + return ret; + } + ++/* ++ * Submit a btrfs_bio based repair write. ++ * ++ * If @dev_replace is true, the write would be submitted to dev-replace target. 
++ */ ++void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace) ++{ ++ struct btrfs_fs_info *fs_info = bbio->fs_info; ++ u64 logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; ++ u64 length = bbio->bio.bi_iter.bi_size; ++ struct btrfs_io_stripe smap = { 0 }; ++ int ret; ++ ++ ASSERT(fs_info); ++ ASSERT(mirror_num > 0); ++ ASSERT(btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE); ++ ASSERT(!bbio->inode); ++ ++ btrfs_bio_counter_inc_blocked(fs_info); ++ ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num); ++ if (ret < 0) ++ goto fail; ++ ++ if (dev_replace) { ++ if (btrfs_op(&bbio->bio) == BTRFS_MAP_WRITE && btrfs_is_zoned(fs_info)) { ++ bbio->bio.bi_opf &= ~REQ_OP_WRITE; ++ bbio->bio.bi_opf |= REQ_OP_ZONE_APPEND; ++ } ++ ASSERT(smap.dev == fs_info->dev_replace.srcdev); ++ smap.dev = fs_info->dev_replace.tgtdev; ++ } ++ __btrfs_submit_bio(&bbio->bio, NULL, &smap, mirror_num); ++ return; ++ ++fail: ++ btrfs_bio_counter_dec(fs_info); ++ btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); ++} ++ + int __init btrfs_bioset_init(void) + { + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, +diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h +index 873ff85817f0..a8eca3a65673 100644 +--- a/fs/btrfs/bio.h ++++ b/fs/btrfs/bio.h +@@ -30,7 +30,10 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + * passed to btrfs_submit_bio for mapping to the physical devices. + */ + struct btrfs_bio { +- /* Inode and offset into it that this I/O operates on. */ ++ /* ++ * Inode and offset into it that this I/O operates on. ++ * Only set for data I/O. ++ */ + struct btrfs_inode *inode; + u64 file_offset; + +@@ -58,6 +61,9 @@ struct btrfs_bio { + atomic_t pending_ios; + struct work_struct end_io_work; + ++ /* File system that this I/O operates on. */ ++ struct btrfs_fs_info *fs_info; ++ + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_bio but relies on bio being last. +@@ -73,11 +79,11 @@ static inline struct btrfs_bio *btrfs_bio(struct bio *bio) + int __init btrfs_bioset_init(void); + void __cold btrfs_bioset_exit(void); + +-void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_inode *inode, ++void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info, + btrfs_bio_end_io_t end_io, void *private); +-struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, +- struct btrfs_inode *inode, +- btrfs_bio_end_io_t end_io, void *private); ++struct btrfs_bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, ++ struct btrfs_fs_info *fs_info, ++ btrfs_bio_end_io_t end_io, void *private); + + static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) + { +@@ -88,7 +94,11 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) + /* Bio only refers to one ordered extent. */ + #define REQ_BTRFS_ONE_ORDERED REQ_DRV + +-void btrfs_submit_bio(struct bio *bio, int mirror_num); ++/* Submit using blkcg_punt_bio_submit. 
*/ ++#define REQ_BTRFS_CGROUP_PUNT REQ_FS_PRIVATE ++ ++void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num); ++void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); + int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); +diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c +index 5fc670c27f86..957ad1c31c4f 100644 +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -160,15 +160,6 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) + btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, + cache); + +- /* +- * If not empty, someone is still holding mutex of +- * full_stripe_lock, which can only be released by caller. +- * And it will definitely cause use-after-free when caller +- * tries to release full stripe lock. +- * +- * No better way to resolve, but only to warn. +- */ +- WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); + kfree(cache->free_space_ctl); + kfree(cache->physical_map); + kfree(cache); +@@ -1977,12 +1968,12 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + + map = em->map_lookup; + data_stripe_length = em->orig_block_len; +- io_stripe_size = map->stripe_len; ++ io_stripe_size = BTRFS_STRIPE_LEN; + chunk_start = em->start; + + /* For RAID5/6 adjust to a full IO stripe length */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) +- io_stripe_size = map->stripe_len * nr_data_stripes(map); ++ io_stripe_size = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + + buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); + if (!buf) { +@@ -1992,28 +1983,28 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + + for (i = 0; i < map->num_stripes; i++) { + bool already_inserted = false; +- u64 stripe_nr; +- u64 offset; ++ u32 stripe_nr; ++ u32 offset; + int j; + + if (!in_range(physical, map->stripes[i].physical, + data_stripe_length)) + continue; + +- stripe_nr = physical - map->stripes[i].physical; +- stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); ++ stripe_nr = (physical - map->stripes[i].physical) >> ++ BTRFS_STRIPE_LEN_SHIFT; ++ offset = (physical - map->stripes[i].physical) & ++ BTRFS_STRIPE_LEN_MASK; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | +- BTRFS_BLOCK_GROUP_RAID10)) { +- stripe_nr = stripe_nr * map->num_stripes + i; +- stripe_nr = div_u64(stripe_nr, map->sub_stripes); +- } ++ BTRFS_BLOCK_GROUP_RAID10)) ++ stripe_nr = div_u64(stripe_nr * map->num_stripes + i, ++ map->sub_stripes); + /* + * The remaining case would be for RAID56, multiply by + * nr_data_stripes(). 
Alternatively, just use rmap_len below + * instead of map->stripe_len + */ +- + bytenr = chunk_start + stripe_nr * io_stripe_size + offset; + + /* Ensure we don't add duplicate addresses */ +@@ -2124,8 +2115,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( + btrfs_init_free_space_ctl(cache, cache->free_space_ctl); + atomic_set(&cache->frozen, 0); + mutex_init(&cache->free_space_lock); +- cache->full_stripe_locks_root.root = RB_ROOT; +- mutex_init(&cache->full_stripe_locks_root.lock); + + return cache; + } +@@ -2672,7 +2661,7 @@ static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) + } + + struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, +- u64 bytes_used, u64 type, ++ u64 type, + u64 chunk_offset, u64 size) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +@@ -2687,7 +2676,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran + + cache->length = size; + set_free_space_tree_thresholds(cache); +- cache->used = bytes_used; + cache->flags = type; + cache->cached = BTRFS_CACHE_FINISHED; + cache->global_root_id = calculate_global_root_id(fs_info, cache->start); +@@ -2738,9 +2726,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran + + #ifdef CONFIG_BTRFS_DEBUG + if (btrfs_should_fragment_free_space(cache)) { +- u64 new_bytes_used = size - bytes_used; +- +- cache->space_info->bytes_used += new_bytes_used >> 1; ++ cache->space_info->bytes_used += size >> 1; + fragment_free_space(cache); + } + #endif +diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h +index 6e4a0b429ac3..cc0e4b37db2d 100644 +--- a/fs/btrfs/block-group.h ++++ b/fs/btrfs/block-group.h +@@ -91,14 +91,6 @@ struct btrfs_caching_control { + /* Once caching_thread() finds this much free space, it will wake up waiters. */ + #define CACHING_CTL_WAKE_UP SZ_2M + +-/* +- * Tree to record all locked full stripes of a RAID5/6 block group +- */ +-struct btrfs_full_stripe_locks_tree { +- struct rb_root root; +- struct mutex lock; +-}; +- + struct btrfs_block_group { + struct btrfs_fs_info *fs_info; + struct inode *inode; +@@ -229,9 +221,6 @@ struct btrfs_block_group { + */ + int swap_extents; + +- /* Record locked full stripes for RAID5/6 block group */ +- struct btrfs_full_stripe_locks_tree full_stripe_locks_root; +- + /* + * Allocation offset for the block group to implement sequential + * allocation. This is used only on a zoned filesystem. 
+@@ -302,7 +291,7 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); + void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); + int btrfs_read_block_groups(struct btrfs_fs_info *info); + struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, +- u64 bytes_used, u64 type, ++ u64 type, + u64 chunk_offset, u64 size); + void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); + int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, +diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c +index 5367a14d44d2..3ab707e26fa2 100644 +--- a/fs/btrfs/block-rsv.c ++++ b/fs/btrfs/block-rsv.c +@@ -232,9 +232,6 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) + u64 num_bytes = 0; + int ret = -ENOSPC; + +- if (!block_rsv) +- return 0; +- + spin_lock(&block_rsv->lock); + num_bytes = mult_perc(block_rsv->size, min_percent); + if (block_rsv->reserved >= num_bytes) +@@ -245,17 +242,15 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent) + } + + int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, +- struct btrfs_block_rsv *block_rsv, u64 min_reserved, ++ struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush) + { +- u64 num_bytes = 0; + int ret = -ENOSPC; + + if (!block_rsv) + return 0; + + spin_lock(&block_rsv->lock); +- num_bytes = min_reserved; + if (block_rsv->reserved >= num_bytes) + ret = 0; + else +@@ -355,17 +350,19 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) + + /* + * But we also want to reserve enough space so we can do the fallback +- * global reserve for an unlink, which is an additional 5 items (see the +- * comment in __unlink_start_trans for what we're modifying.) ++ * global reserve for an unlink, which is an additional ++ * BTRFS_UNLINK_METADATA_UNITS items. + * + * But we also need space for the delayed ref updates from the unlink, +- * so its 10, 5 for the actual operation, and 5 for the delayed ref +- * updates. ++ * so add BTRFS_UNLINK_METADATA_UNITS units for delayed refs, one for ++ * each unlink metadata item. 
+ */ +- min_items += 10; ++ min_items += BTRFS_UNLINK_METADATA_UNITS; + + num_bytes = max_t(u64, num_bytes, +- btrfs_calc_insert_metadata_size(fs_info, min_items)); ++ btrfs_calc_insert_metadata_size(fs_info, min_items) + ++ btrfs_calc_delayed_ref_bytes(fs_info, ++ BTRFS_UNLINK_METADATA_UNITS)); + + spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); +diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h +index 4cc41c9aaa82..6dc781709aca 100644 +--- a/fs/btrfs/block-rsv.h ++++ b/fs/btrfs/block-rsv.h +@@ -65,7 +65,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush); + int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent); + int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, +- struct btrfs_block_rsv *block_rsv, u64 min_reserved, ++ struct btrfs_block_rsv *block_rsv, u64 num_bytes, + enum btrfs_reserve_flush_enum flush); + int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, + struct btrfs_block_rsv *dst_rsv, u64 num_bytes, +diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h +index 9dc21622806e..ec2ae4406c16 100644 +--- a/fs/btrfs/btrfs_inode.h ++++ b/fs/btrfs/btrfs_inode.h +@@ -142,11 +142,22 @@ struct btrfs_inode { + /* a local copy of root's last_log_commit */ + int last_log_commit; + +- /* +- * Total number of bytes pending delalloc, used by stat to calculate the +- * real block usage of the file. This is used only for files. +- */ +- u64 delalloc_bytes; ++ union { ++ /* ++ * Total number of bytes pending delalloc, used by stat to ++ * calculate the real block usage of the file. This is used ++ * only for files. ++ */ ++ u64 delalloc_bytes; ++ /* ++ * The lowest possible index of the next dir index key which ++ * points to an inode that needs to be logged. ++ * This is used only for directories. ++ * Use the helpers btrfs_get_first_dir_index_to_log() and ++ * btrfs_set_first_dir_index_to_log() to access this field. 
++ */ ++ u64 first_dir_index_to_log; ++ }; + + union { + /* +@@ -247,6 +258,17 @@ struct btrfs_inode { + struct inode vfs_inode; + }; + ++static inline u64 btrfs_get_first_dir_index_to_log(const struct btrfs_inode *inode) ++{ ++ return READ_ONCE(inode->first_dir_index_to_log); ++} ++ ++static inline void btrfs_set_first_dir_index_to_log(struct btrfs_inode *inode, ++ u64 index) ++{ ++ WRITE_ONCE(inode->first_dir_index_to_log, index); ++} ++ + static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) + { + return container_of(inode, struct btrfs_inode, vfs_inode); +@@ -407,7 +429,8 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, + + int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio); ++int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, ++ struct btrfs_ordered_extent *ordered); + bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + u32 bio_offset, struct bio_vec *bv); + noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c +index f42f31f22d13..2d0493f0a184 100644 +--- a/fs/btrfs/compression.c ++++ b/fs/btrfs/compression.c +@@ -37,6 +37,8 @@ + #include "file-item.h" + #include "super.h" + ++struct bio_set btrfs_compressed_bioset; ++ + static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; + + const char* btrfs_compress_type2str(enum btrfs_compression_type type) +@@ -54,6 +56,25 @@ const char* btrfs_compress_type2str(enum btrfs_compression_type type) + return NULL; + } + ++static inline struct compressed_bio *to_compressed_bio(struct btrfs_bio *bbio) ++{ ++ return container_of(bbio, struct compressed_bio, bbio); ++} ++ ++static struct compressed_bio *alloc_compressed_bio(struct btrfs_inode *inode, ++ u64 start, blk_opf_t op, ++ btrfs_bio_end_io_t end_io) ++{ ++ struct btrfs_bio *bbio; ++ ++ bbio = btrfs_bio(bio_alloc_bioset(NULL, BTRFS_MAX_COMPRESSED_PAGES, op, ++ GFP_NOFS, &btrfs_compressed_bioset)); ++ btrfs_bio_init(bbio, inode->root->fs_info, end_io, NULL); ++ bbio->inode = inode; ++ bbio->file_offset = start; ++ return to_compressed_bio(bbio); ++} ++ + bool btrfs_compress_is_valid_type(const char *str, size_t len) + { + int i; +@@ -139,32 +160,25 @@ static int compression_decompress(int type, struct list_head *ws, + } + } + ++static void btrfs_free_compressed_pages(struct compressed_bio *cb) ++{ ++ for (unsigned int i = 0; i < cb->nr_pages; i++) ++ put_page(cb->compressed_pages[i]); ++ kfree(cb->compressed_pages); ++} ++ + static int btrfs_decompress_bio(struct compressed_bio *cb); + + static void end_compressed_bio_read(struct btrfs_bio *bbio) + { +- struct compressed_bio *cb = bbio->private; +- unsigned int index; +- struct page *page; ++ struct compressed_bio *cb = to_compressed_bio(bbio); ++ blk_status_t status = bbio->bio.bi_status; + +- if (bbio->bio.bi_status) +- cb->status = bbio->bio.bi_status; +- else +- cb->status = errno_to_blk_status(btrfs_decompress_bio(cb)); +- +- /* Release the compressed pages */ +- for (index = 0; index < cb->nr_pages; index++) { +- page = cb->compressed_pages[index]; +- page->mapping = NULL; +- put_page(page); +- } +- +- /* Do io completion on the original bio */ +- btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status); ++ if (!status) ++ status = errno_to_blk_status(btrfs_decompress_bio(cb)); + +- /* Finally free the cb struct */ +- 
kfree(cb->compressed_pages); +- kfree(cb); ++ btrfs_free_compressed_pages(cb); ++ btrfs_bio_end_io(cb->orig_bbio, status); + bio_put(&bbio->bio); + } + +@@ -172,14 +186,14 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +-static noinline void end_compressed_writeback(struct inode *inode, +- const struct compressed_bio *cb) ++static noinline void end_compressed_writeback(const struct compressed_bio *cb) + { ++ struct inode *inode = &cb->bbio.inode->vfs_inode; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + unsigned long index = cb->start >> PAGE_SHIFT; + unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; + struct folio_batch fbatch; +- const int errno = blk_status_to_errno(cb->status); ++ const int errno = blk_status_to_errno(cb->bbio.bio.bi_status); + int i; + int ret; + +@@ -207,45 +221,25 @@ static noinline void end_compressed_writeback(struct inode *inode, + /* the inode may be gone now */ + } + +-static void finish_compressed_bio_write(struct compressed_bio *cb) ++static void btrfs_finish_compressed_write_work(struct work_struct *work) + { +- struct inode *inode = cb->inode; +- unsigned int index; ++ struct compressed_bio *cb = ++ container_of(work, struct compressed_bio, write_end_work); + + /* + * Ok, we're the last bio for this extent, step one is to call back + * into the FS and do all the end_io operations. + */ +- btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, ++ btrfs_writepage_endio_finish_ordered(cb->bbio.inode, NULL, + cb->start, cb->start + cb->len - 1, +- cb->status == BLK_STS_OK); ++ cb->bbio.bio.bi_status == BLK_STS_OK); + + if (cb->writeback) +- end_compressed_writeback(inode, cb); ++ end_compressed_writeback(cb); + /* Note, our inode could be gone now */ + +- /* +- * Release the compressed pages, these came from alloc_page and +- * are not attached to the inode at all +- */ +- for (index = 0; index < cb->nr_pages; index++) { +- struct page *page = cb->compressed_pages[index]; +- +- page->mapping = NULL; +- put_page(page); +- } +- +- /* Finally free the cb struct */ +- kfree(cb->compressed_pages); +- kfree(cb); +-} +- +-static void btrfs_finish_compressed_write_work(struct work_struct *work) +-{ +- struct compressed_bio *cb = +- container_of(work, struct compressed_bio, write_end_work); +- +- finish_compressed_bio_write(cb); ++ btrfs_free_compressed_pages(cb); ++ bio_put(&cb->bbio.bio); + } + + /* +@@ -257,13 +251,25 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work) + */ + static void end_compressed_bio_write(struct btrfs_bio *bbio) + { +- struct compressed_bio *cb = bbio->private; +- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); ++ struct compressed_bio *cb = to_compressed_bio(bbio); ++ struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info; + +- cb->status = bbio->bio.bi_status; + queue_work(fs_info->compressed_write_workers, &cb->write_end_work); ++} + +- bio_put(&bbio->bio); ++static void btrfs_add_compressed_bio_pages(struct compressed_bio *cb) ++{ ++ struct bio *bio = &cb->bbio.bio; ++ u32 offset = 0; ++ ++ while (offset < cb->compressed_len) { ++ u32 len = min_t(u32, cb->compressed_len - offset, PAGE_SIZE); ++ ++ /* Maximum compressed extent is smaller than bio size limit. 
*/ ++ __bio_add_page(bio, cb->compressed_pages[offset >> PAGE_SHIFT], ++ len, 0); ++ offset += len; ++ } + } + + /* +@@ -275,28 +281,24 @@ static void end_compressed_bio_write(struct btrfs_bio *bbio) + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +-blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, ++void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + unsigned int len, u64 disk_start, + unsigned int compressed_len, + struct page **compressed_pages, + unsigned int nr_pages, + blk_opf_t write_flags, +- struct cgroup_subsys_state *blkcg_css, + bool writeback) + { + struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct bio *bio = NULL; + struct compressed_bio *cb; +- u64 cur_disk_bytenr = disk_start; +- blk_status_t ret = BLK_STS_OK; + + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); +- cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); +- if (!cb) +- return BLK_STS_RESOURCE; +- cb->status = BLK_STS_OK; +- cb->inode = &inode->vfs_inode; ++ ++ write_flags |= REQ_BTRFS_ONE_ORDERED; ++ ++ cb = alloc_compressed_bio(inode, start, REQ_OP_WRITE | write_flags, ++ end_compressed_bio_write); + cb->start = start; + cb->len = len; + cb->compressed_pages = compressed_pages; +@@ -304,56 +306,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + cb->writeback = writeback; + INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work); + cb->nr_pages = nr_pages; ++ cb->bbio.bio.bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; ++ btrfs_add_compressed_bio_pages(cb); + +- if (blkcg_css) { +- kthread_associate_blkcg(blkcg_css); +- write_flags |= REQ_CGROUP_PUNT; +- } +- +- write_flags |= REQ_BTRFS_ONE_ORDERED; +- bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_WRITE | write_flags, +- BTRFS_I(cb->inode), end_compressed_bio_write, cb); +- bio->bi_iter.bi_sector = cur_disk_bytenr >> SECTOR_SHIFT; +- btrfs_bio(bio)->file_offset = start; +- +- while (cur_disk_bytenr < disk_start + compressed_len) { +- u64 offset = cur_disk_bytenr - disk_start; +- unsigned int index = offset >> PAGE_SHIFT; +- unsigned int real_size; +- unsigned int added; +- struct page *page = compressed_pages[index]; +- +- /* +- * We have various limits on the real read size: +- * - page boundary +- * - compressed length boundary +- */ +- real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); +- real_size = min_t(u64, real_size, compressed_len - offset); +- ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); +- +- added = bio_add_page(bio, page, real_size, offset_in_page(offset)); +- /* +- * Maximum compressed extent is smaller than bio size limit, +- * thus bio_add_page() should always success. +- */ +- ASSERT(added == real_size); +- cur_disk_bytenr += added; +- } +- +- /* Finished the range. 
*/ +- ASSERT(bio->bi_iter.bi_size); +- btrfs_submit_bio(bio, 0); +- if (blkcg_css) +- kthread_associate_blkcg(NULL); +- return ret; +-} +- +-static u64 bio_end_offset(struct bio *bio) +-{ +- struct bio_vec *last = bio_last_bvec_all(bio); +- +- return page_offset(last->bv_page) + last->bv_len + last->bv_offset; ++ btrfs_submit_bio(&cb->bbio, 0); + } + + /* +@@ -374,7 +330,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, + { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + unsigned long end_index; +- u64 cur = bio_end_offset(cb->orig_bio); ++ struct bio *orig_bio = &cb->orig_bbio->bio; ++ u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; + u64 isize = i_size_read(inode); + int ret; + struct page *page; +@@ -464,7 +421,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, + */ + if (!em || cur < em->start || + (cur + fs_info->sectorsize > extent_map_end(em)) || +- (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { ++ (em->block_start >> 9) != orig_bio->bi_iter.bi_sector) { + free_extent_map(em); + unlock_extent(tree, cur, page_end, NULL); + unlock_page(page); +@@ -484,7 +441,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, + } + + add_size = min(em->start + em->len, page_end + 1) - cur; +- ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur)); ++ ret = bio_add_page(orig_bio, page, add_size, offset_in_page(cur)); + if (ret != add_size) { + unlock_extent(tree, cur, page_end, NULL); + unlock_page(page); +@@ -515,17 +472,14 @@ static noinline int add_ra_bio_pages(struct inode *inode, + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +-void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +- int mirror_num) ++void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num) + { +- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- struct extent_map_tree *em_tree; ++ struct btrfs_inode *inode = bbio->inode; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct extent_map_tree *em_tree = &inode->extent_tree; + struct compressed_bio *cb; + unsigned int compressed_len; +- struct bio *comp_bio; +- const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; +- u64 cur_disk_byte = disk_bytenr; +- u64 file_offset; ++ u64 file_offset = bbio->file_offset; + u64 em_len; + u64 em_start; + struct extent_map *em; +@@ -533,12 +487,6 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int memstall = 0; + blk_status_t ret; + int ret2; +- int i; +- +- em_tree = &BTRFS_I(inode)->extent_tree; +- +- file_offset = bio_first_bvec_all(bio)->bv_offset + +- page_offset(bio_first_page_all(bio)); + + /* we need the actual starting offset of this extent in the file */ + read_lock(&em_tree->lock); +@@ -551,102 +499,54 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + + ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); + compressed_len = em->block_len; +- cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); +- if (!cb) { +- ret = BLK_STS_RESOURCE; +- goto out; +- } + +- cb->status = BLK_STS_OK; +- cb->inode = inode; ++ cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, ++ end_compressed_bio_read); + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + +- cb->len = bio->bi_iter.bi_size; ++ cb->len = bbio->bio.bi_iter.bi_size; + cb->compressed_len = compressed_len; + cb->compress_type = em->compress_type; +- cb->orig_bio = bio; ++ 
cb->orig_bbio = bbio; + + free_extent_map(em); +- em = NULL; + + cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); + if (!cb->compressed_pages) { + ret = BLK_STS_RESOURCE; +- goto fail; ++ goto out_free_bio; + } + + ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); + if (ret2) { + ret = BLK_STS_RESOURCE; +- goto fail; ++ goto out_free_compressed_pages; + } + +- add_ra_bio_pages(inode, em_start + em_len, cb, &memstall, &pflags); ++ add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, ++ &pflags); + + /* include any pages we added in add_ra-bio_pages */ +- cb->len = bio->bi_iter.bi_size; +- +- comp_bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, BTRFS_I(cb->inode), +- end_compressed_bio_read, cb); +- comp_bio->bi_iter.bi_sector = (cur_disk_byte >> SECTOR_SHIFT); +- +- while (cur_disk_byte < disk_bytenr + compressed_len) { +- u64 offset = cur_disk_byte - disk_bytenr; +- unsigned int index = offset >> PAGE_SHIFT; +- unsigned int real_size; +- unsigned int added; +- struct page *page = cb->compressed_pages[index]; +- +- /* +- * We have various limit on the real read size: +- * - page boundary +- * - compressed length boundary +- */ +- real_size = min_t(u64, U32_MAX, PAGE_SIZE - offset_in_page(offset)); +- real_size = min_t(u64, real_size, compressed_len - offset); +- ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); +- +- added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset)); +- /* +- * Maximum compressed extent is smaller than bio size limit, +- * thus bio_add_page() should always success. +- */ +- ASSERT(added == real_size); +- cur_disk_byte += added; +- } ++ cb->len = bbio->bio.bi_iter.bi_size; ++ cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; ++ btrfs_add_compressed_bio_pages(cb); + + if (memstall) + psi_memstall_leave(&pflags); + +- /* +- * Stash the initial offset of this chunk, as there is no direct +- * correlation between compressed pages and the original file offset. +- * The field is only used for printing error messages anyway. 
+- */ +- btrfs_bio(comp_bio)->file_offset = file_offset; +- +- ASSERT(comp_bio->bi_iter.bi_size); +- btrfs_submit_bio(comp_bio, mirror_num); ++ btrfs_submit_bio(&cb->bbio, mirror_num); + return; + +-fail: +- if (cb->compressed_pages) { +- for (i = 0; i < cb->nr_pages; i++) { +- if (cb->compressed_pages[i]) +- __free_page(cb->compressed_pages[i]); +- } +- } +- ++out_free_compressed_pages: + kfree(cb->compressed_pages); +- kfree(cb); ++out_free_bio: ++ bio_put(&cb->bbio.bio); + out: +- free_extent_map(em); +- btrfs_bio_end_io(btrfs_bio(bio), ret); +- return; ++ btrfs_bio_end_io(bbio, ret); + } + + /* +@@ -1038,6 +938,8 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) + ret = compression_decompress_bio(workspace, cb); + put_workspace(type, workspace); + ++ if (!ret) ++ zero_fill_bio(&cb->orig_bbio->bio); + return ret; + } + +@@ -1062,6 +964,10 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, + + int __init btrfs_init_compress(void) + { ++ if (bioset_init(&btrfs_compressed_bioset, BIO_POOL_SIZE, ++ offsetof(struct compressed_bio, bbio.bio), ++ BIOSET_NEED_BVECS)) ++ return -ENOMEM; + btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); + btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); + btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); +@@ -1075,6 +981,7 @@ void __cold btrfs_exit_compress(void) + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB); + btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO); + zstd_cleanup_workspace_manager(); ++ bioset_exit(&btrfs_compressed_bioset); + } + + /* +@@ -1110,7 +1017,7 @@ void __cold btrfs_exit_compress(void) + int btrfs_decompress_buf2page(const char *buf, u32 buf_len, + struct compressed_bio *cb, u32 decompressed) + { +- struct bio *orig_bio = cb->orig_bio; ++ struct bio *orig_bio = &cb->orig_bbio->bio; + /* Offset inside the full decompressed extent */ + u32 cur_offset; + +diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h +index a5e3377db9ad..19ab2abeddc0 100644 +--- a/fs/btrfs/compression.h ++++ b/fs/btrfs/compression.h +@@ -6,8 +6,8 @@ + #ifndef BTRFS_COMPRESSION_H + #define BTRFS_COMPRESSION_H + +-#include + #include ++#include "bio.h" + + struct btrfs_inode; + +@@ -23,6 +23,7 @@ struct btrfs_inode; + + /* Maximum length of compressed data stored on disk */ + #define BTRFS_MAX_COMPRESSED (SZ_128K) ++#define BTRFS_MAX_COMPRESSED_PAGES (BTRFS_MAX_COMPRESSED / PAGE_SIZE) + static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); + + /* Maximum size of data before compression */ +@@ -37,9 +38,6 @@ struct compressed_bio { + /* the pages with the compressed data on them */ + struct page **compressed_pages; + +- /* inode that owns this data */ +- struct inode *inode; +- + /* starting offset in the inode for our pages */ + u64 start; + +@@ -55,14 +53,14 @@ struct compressed_bio { + /* Whether this is a write for writeback. */ + bool writeback; + +- /* IO errors */ +- blk_status_t status; +- + union { + /* For reads, this is the bio we are copying the data into */ +- struct bio *orig_bio; ++ struct btrfs_bio *orig_bbio; + struct work_struct write_end_work; + }; ++ ++ /* Must be last. 
*/ ++ struct btrfs_bio bbio; + }; + + static inline unsigned int btrfs_compress_type(unsigned int type_level) +@@ -88,16 +86,14 @@ int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, + int btrfs_decompress_buf2page(const char *buf, u32 buf_len, + struct compressed_bio *cb, u32 decompressed); + +-blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, ++void btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, + unsigned int len, u64 disk_start, + unsigned int compressed_len, + struct page **compressed_pages, + unsigned int nr_pages, + blk_opf_t write_flags, +- struct cgroup_subsys_state *blkcg_css, + bool writeback); +-void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, +- int mirror_num); ++void btrfs_submit_compressed_read(struct btrfs_bio *bbio, int mirror_num); + + unsigned int btrfs_compress_str2level(unsigned int type, const char *str); + +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index a5b6bb54545f..3c983c70028a 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -854,7 +854,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, + * Search for a key in the given extent_buffer. + * + * The lower boundary for the search is specified by the slot number @first_slot. +- * Use a value of 0 to search over the whole extent buffer. ++ * Use a value of 0 to search over the whole extent buffer. Works for both ++ * leaves and nodes. + * + * The slot in the extent buffer is returned via @slot. If the key exists in the + * extent buffer, then @slot will point to the slot where the key is, otherwise +@@ -863,8 +864,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, + * Slot may point to the total number of items (i.e. one position beyond the last + * key) if the key is bigger than the last key in the extent buffer. 
+ */ +-int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, +- const struct btrfs_key *key, int *slot) ++int btrfs_bin_search(struct extent_buffer *eb, int first_slot, ++ const struct btrfs_key *key, int *slot) + { + unsigned long p; + int item_size; +@@ -959,7 +960,7 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, + if (slot < 0 || slot >= btrfs_header_nritems(parent)) + return ERR_PTR(-ENOENT); + +- BUG_ON(level == 0); ++ ASSERT(level); + + check.level = level - 1; + check.transid = btrfs_node_ptr_generation(parent, slot); +@@ -1064,11 +1065,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, + BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) + return 0; + +- left = btrfs_read_node_slot(parent, pslot - 1); +- if (IS_ERR(left)) +- left = NULL; ++ if (pslot) { ++ left = btrfs_read_node_slot(parent, pslot - 1); ++ if (IS_ERR(left)) { ++ ret = PTR_ERR(left); ++ left = NULL; ++ goto enospc; ++ } + +- if (left) { + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left, +@@ -1079,11 +1083,14 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, + } + } + +- right = btrfs_read_node_slot(parent, pslot + 1); +- if (IS_ERR(right)) +- right = NULL; ++ if (pslot + 1 < btrfs_header_nritems(parent)) { ++ right = btrfs_read_node_slot(parent, pslot + 1); ++ if (IS_ERR(right)) { ++ ret = PTR_ERR(right); ++ right = NULL; ++ goto enospc; ++ } + +- if (right) { + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right, +@@ -1240,14 +1247,14 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + if (!parent) + return 1; + +- left = btrfs_read_node_slot(parent, pslot - 1); +- if (IS_ERR(left)) +- left = NULL; +- + /* first, try to make some room in the middle buffer */ +- if (left) { ++ if (pslot) { + u32 left_nr; + ++ left = btrfs_read_node_slot(parent, pslot - 1); ++ if (IS_ERR(left)) ++ return PTR_ERR(left); ++ + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + + left_nr = btrfs_header_nritems(left); +@@ -1292,16 +1299,17 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + btrfs_tree_unlock(left); + free_extent_buffer(left); + } +- right = btrfs_read_node_slot(parent, pslot + 1); +- if (IS_ERR(right)) +- right = NULL; + + /* + * then try to empty the right most buffer into the middle + */ +- if (right) { ++ if (pslot + 1 < btrfs_header_nritems(parent)) { + u32 right_nr; + ++ right = btrfs_read_node_slot(parent, pslot + 1); ++ if (IS_ERR(right)) ++ return PTR_ERR(right); ++ + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + + right_nr = btrfs_header_nritems(right); +@@ -1864,7 +1872,7 @@ static inline int search_for_key_slot(struct extent_buffer *eb, + return 0; + } + +- return btrfs_generic_bin_search(eb, search_low_slot, key, slot); ++ return btrfs_bin_search(eb, search_low_slot, key, slot); + } + + static int search_leaf(struct btrfs_trans_handle *trans, +@@ -2321,7 +2329,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, + */ + btrfs_unlock_up_safe(p, level + 1); + +- ret = btrfs_bin_search(b, key, &slot); ++ ret = btrfs_bin_search(b, 0, key, &slot); + if (ret < 0) + goto done; + +@@ -2482,26 +2490,15 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, + int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) + { +- while (1) { ++ if 
(path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + int ret; +- const int slot = path->slots[0]; +- const struct extent_buffer *leaf = path->nodes[0]; + +- /* This is where we start walking the path. */ +- if (slot >= btrfs_header_nritems(leaf)) { +- /* +- * If we've reached the last slot in this leaf we need +- * to go to the next leaf and reset the path. +- */ +- ret = btrfs_next_leaf(root, path); +- if (ret) +- return ret; +- continue; +- } +- /* Store the found, valid item in @key. */ +- btrfs_item_key_to_cpu(leaf, key, slot); +- break; ++ ret = btrfs_next_leaf(root, path); ++ if (ret) ++ return ret; + } ++ ++ btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); + return 0; + } + +@@ -3198,12 +3195,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + btrfs_assert_tree_write_locked(path->nodes[1]); + + right = btrfs_read_node_slot(upper, slot + 1); +- /* +- * slot + 1 is not valid or we fail to read the right node, +- * no big deal, just return. +- */ + if (IS_ERR(right)) +- return 1; ++ return PTR_ERR(right); + + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); + +@@ -3417,12 +3410,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + btrfs_assert_tree_write_locked(path->nodes[1]); + + left = btrfs_read_node_slot(path->nodes[1], slot - 1); +- /* +- * slot - 1 is not valid or we fail to read the left node, +- * no big deal, just return. +- */ + if (IS_ERR(left)) +- return 1; ++ return PTR_ERR(left); + + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); + +@@ -4576,7 +4565,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + while (1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); +- sret = btrfs_bin_search(cur, min_key, &slot); ++ sret = btrfs_bin_search(cur, 0, min_key, &slot); + if (sret < 0) { + ret = sret; + goto out; +diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h +index 97897107fab5..4c1986cd5bed 100644 +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -508,22 +508,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); + int __init btrfs_ctree_init(void); + void __cold btrfs_ctree_exit(void); + +-int btrfs_generic_bin_search(struct extent_buffer *eb, int first_slot, +- const struct btrfs_key *key, int *slot); ++int btrfs_bin_search(struct extent_buffer *eb, int first_slot, ++ const struct btrfs_key *key, int *slot); + +-/* +- * Simple binary search on an extent buffer. Works for both leaves and nodes, and +- * always searches over the whole range of keys (slot 0 to slot 'nritems - 1'). +- */ +-static inline int btrfs_bin_search(struct extent_buffer *eb, +- const struct btrfs_key *key, +- int *slot) +-{ +- return btrfs_generic_bin_search(eb, 0, key, slot); +-} +- +-int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, +- int *slot); + int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); + int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, +diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c +index 7ddb1d104e8e..427abaf608b8 100644 +--- a/fs/btrfs/delalloc-space.c ++++ b/fs/btrfs/delalloc-space.c +@@ -358,8 +358,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + * racing with an ordered completion or some such that would think it + * needs to free the reservation we just made. 
+ */ +- spin_lock(&inode->lock); + nr_extents = count_max_extents(fs_info, num_bytes); ++ spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, nr_extents); + inode->csum_bytes += disk_num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); +diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c +index 886ffb232eac..0b32432d7d56 100644 +--- a/fs/btrfs/delayed-ref.c ++++ b/fs/btrfs/delayed-ref.c +@@ -53,24 +53,6 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) + return ret; + } + +-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) +-{ +- u64 num_entries = +- atomic_read(&trans->transaction->delayed_refs.num_entries); +- u64 avg_runtime; +- u64 val; +- +- smp_mb(); +- avg_runtime = trans->fs_info->avg_delayed_ref_runtime; +- val = num_entries * avg_runtime; +- if (val >= NSEC_PER_SEC) +- return 1; +- if (val >= NSEC_PER_SEC / 2) +- return 2; +- +- return btrfs_check_space_for_delayed_refs(trans->fs_info); +-} +- + /* + * Release a ref head's reservation. + * +@@ -83,20 +65,9 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) + void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) + { + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; +- u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr); ++ const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr); + u64 released = 0; + +- /* +- * We have to check the mount option here because we could be enabling +- * the free space tree for the first time and don't have the compat_ro +- * option set yet. +- * +- * We need extra reservations if we have the free space tree because +- * we'll have to modify that tree as well. +- */ +- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) +- num_bytes *= 2; +- + released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); + if (released) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", +@@ -118,18 +89,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) + if (!trans->delayed_ref_updates) + return; + +- num_bytes = btrfs_calc_insert_metadata_size(fs_info, +- trans->delayed_ref_updates); +- /* +- * We have to check the mount option here because we could be enabling +- * the free space tree for the first time and don't have the compat_ro +- * option set yet. +- * +- * We need extra reservations if we have the free space tree because +- * we'll have to modify that tree as well. 
+- */ +- if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) +- num_bytes *= 2; ++ num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, ++ trans->delayed_ref_updates); + + spin_lock(&delayed_rsv->lock); + delayed_rsv->size += num_bytes; +@@ -200,7 +161,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush) + { + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; +- u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1); ++ u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); + u64 num_bytes = 0; + int ret = -ENOSPC; + +@@ -217,7 +178,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + if (ret) + return ret; +- btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0); ++ btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, num_bytes, 1); + return 0; +diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h +index 2eb34abf700f..b54261fe509b 100644 +--- a/fs/btrfs/delayed-ref.h ++++ b/fs/btrfs/delayed-ref.h +@@ -253,6 +253,27 @@ extern struct kmem_cache *btrfs_delayed_extent_op_cachep; + int __init btrfs_delayed_ref_init(void); + void __cold btrfs_delayed_ref_exit(void); + ++static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_info, ++ int num_delayed_refs) ++{ ++ u64 num_bytes; ++ ++ num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_delayed_refs); ++ ++ /* ++ * We have to check the mount option here because we could be enabling ++ * the free space tree for the first time and don't have the compat_ro ++ * option set yet. ++ * ++ * We need extra reservations if we have the free space tree because ++ * we'll have to modify that tree as well. ++ */ ++ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) ++ num_bytes *= 2; ++ ++ return num_bytes; ++} ++ + static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, + int action, u64 bytenr, u64 len, u64 parent) + { +@@ -385,7 +406,6 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, + void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_block_rsv *src, + u64 num_bytes); +-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans); + bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); + + /* +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index 9e1596bb208d..59ea049fe7ee 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1341,17 +1341,8 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) + { + int ret; +- unsigned int nofs_flag; + +- /* +- * We might be called under a transaction (e.g. 
indirect backref +- * resolution) which could deadlock if it triggers memory reclaim +- */ +- nofs_flag = memalloc_nofs_save(); +- ret = btrfs_drew_lock_init(&root->snapshot_lock); +- memalloc_nofs_restore(nofs_flag); +- if (ret) +- goto fail; ++ btrfs_drew_lock_init(&root->snapshot_lock); + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + !btrfs_is_data_reloc_root(root)) { +@@ -2065,7 +2056,6 @@ void btrfs_put_root(struct btrfs_root *root) + WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)); + if (root->anon_dev) + free_anon_bdev(root->anon_dev); +- btrfs_drew_lock_destroy(&root->snapshot_lock); + free_root_extent_buffers(root); + #ifdef CONFIG_BTRFS_DEBUG + spin_lock(&root->fs_info->fs_roots_radix_lock); +@@ -2125,11 +2115,16 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info) + atomic_set(&fs_info->reloc_cancel_req, 0); + } + +-static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) ++static int btrfs_init_btree_inode(struct super_block *sb) + { +- struct inode *inode = fs_info->btree_inode; ++ struct btrfs_fs_info *fs_info = btrfs_sb(sb); + unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID, + fs_info->tree_root); ++ struct inode *inode; ++ ++ inode = new_inode(sb); ++ if (!inode) ++ return -ENOMEM; + + inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + set_nlink(inode, 1); +@@ -2140,6 +2135,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) + */ + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &btree_aops; ++ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + + RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, +@@ -2152,6 +2148,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) + BTRFS_I(inode)->location.offset = 0; + set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + __insert_inode_hash(inode, hash); ++ fs_info->btree_inode = inode; ++ ++ return 0; + } + + static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) +@@ -2966,7 +2965,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) + atomic64_set(&fs_info->free_chunk_space, 0); + fs_info->tree_mod_log = RB_ROOT; + fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; +- fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ + btrfs_init_ref_verify(fs_info); + + fs_info->thread_pool_size = min_t(unsigned long, +@@ -3344,14 +3342,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + int ret; +- int err = -EINVAL; + int level; + + ret = init_mount_fs_info(fs_info, sb); +- if (ret) { +- err = ret; ++ if (ret) + goto fail; +- } + + /* These need to be init'ed before we start creating inodes and such. 
*/ + tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, +@@ -3361,17 +3356,13 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + GFP_KERNEL); + fs_info->chunk_root = chunk_root; + if (!tree_root || !chunk_root) { +- err = -ENOMEM; ++ ret = -ENOMEM; + goto fail; + } + +- fs_info->btree_inode = new_inode(sb); +- if (!fs_info->btree_inode) { +- err = -ENOMEM; ++ ret = btrfs_init_btree_inode(sb); ++ if (ret) + goto fail; +- } +- mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); +- btrfs_init_btree_inode(fs_info); + + invalidate_bdev(fs_devices->latest_dev->bdev); + +@@ -3380,7 +3371,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + */ + disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); + if (IS_ERR(disk_super)) { +- err = PTR_ERR(disk_super); ++ ret = PTR_ERR(disk_super); + goto fail_alloc; + } + +@@ -3392,7 +3383,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + if (!btrfs_supported_super_csum(csum_type)) { + btrfs_err(fs_info, "unsupported checksum algorithm: %u", + csum_type); +- err = -EINVAL; ++ ret = -EINVAL; + btrfs_release_disk_super(disk_super); + goto fail_alloc; + } +@@ -3401,7 +3392,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + + ret = btrfs_init_csum_hash(fs_info, csum_type); + if (ret) { +- err = ret; + btrfs_release_disk_super(disk_super); + goto fail_alloc; + } +@@ -3412,7 +3402,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + */ + if (btrfs_check_super_csum(fs_info, disk_super)) { + btrfs_err(fs_info, "superblock checksum mismatch"); +- err = -EINVAL; ++ ret = -EINVAL; + btrfs_release_disk_super(disk_super); + goto fail_alloc; + } +@@ -3442,12 +3432,15 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + ret = btrfs_validate_mount_super(fs_info); + if (ret) { + btrfs_err(fs_info, "superblock contains fatal errors"); +- err = -EINVAL; ++ ret = -EINVAL; + goto fail_alloc; + } + +- if (!btrfs_super_root(disk_super)) ++ if (!btrfs_super_root(disk_super)) { ++ btrfs_err(fs_info, "invalid superblock tree root bytenr"); ++ ret = -EINVAL; + goto fail_alloc; ++ } + + /* check FS state, whether FS is broken. 
*/ + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) +@@ -3474,16 +3467,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + fs_info->stripesize = stripesize; + + ret = btrfs_parse_options(fs_info, options, sb->s_flags); +- if (ret) { +- err = ret; ++ if (ret) + goto fail_alloc; +- } + + ret = btrfs_check_features(fs_info, !sb_rdonly(sb)); +- if (ret < 0) { +- err = ret; ++ if (ret < 0) + goto fail_alloc; +- } + + if (sectorsize < PAGE_SIZE) { + struct btrfs_subpage_info *subpage_info; +@@ -3503,17 +3492,17 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + "read-write for sector size %u with page size %lu is experimental", + sectorsize, PAGE_SIZE); + subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); +- if (!subpage_info) ++ if (!subpage_info) { ++ ret = -ENOMEM; + goto fail_alloc; ++ } + btrfs_init_subpage_info(subpage_info, sectorsize); + fs_info->subpage_info = subpage_info; + } + + ret = btrfs_init_workqueues(fs_info); +- if (ret) { +- err = ret; ++ if (ret) + goto fail_sb_buffer; +- } + + sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); + sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); +@@ -3559,6 +3548,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + btrfs_free_extra_devids(fs_devices); + if (!fs_devices->latest_dev->bdev) { + btrfs_err(fs_info, "failed to read devices"); ++ ret = -EIO; + goto fail_tree_roots; + } + +@@ -3574,8 +3564,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + ret = btrfs_get_dev_zone_info_all_devices(fs_info); + if (ret) { + btrfs_err(fs_info, +- "zoned: failed to read device zone info: %d", +- ret); ++ "zoned: failed to read device zone info: %d", ret); + goto fail_block_groups; + } + +@@ -3654,19 +3643,24 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + !btrfs_check_rw_degradable(fs_info, NULL)) { + btrfs_warn(fs_info, + "writable mount is not allowed due to too many missing devices"); ++ ret = -EINVAL; + goto fail_sysfs; + } + + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, + "btrfs-cleaner"); +- if (IS_ERR(fs_info->cleaner_kthread)) ++ if (IS_ERR(fs_info->cleaner_kthread)) { ++ ret = PTR_ERR(fs_info->cleaner_kthread); + goto fail_sysfs; ++ } + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); +- if (IS_ERR(fs_info->transaction_kthread)) ++ if (IS_ERR(fs_info->transaction_kthread)) { ++ ret = PTR_ERR(fs_info->transaction_kthread); + goto fail_cleaner; ++ } + + if (!btrfs_test_opt(fs_info, NOSSD) && + !fs_info->fs_devices->rotating) { +@@ -3684,7 +3678,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + fs_info->fs_devices->discardable) { + btrfs_set_and_info(fs_info, DISCARD_ASYNC, + "auto enabling async discard"); +- btrfs_clear_opt(fs_info->mount_opt, NODISCARD); + } + + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY +@@ -3711,16 +3704,14 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + !btrfs_test_opt(fs_info, NOLOGREPLAY)) { + btrfs_info(fs_info, "start tree-log replay"); + ret = btrfs_replay_log(fs_info, fs_devices); +- if (ret) { +- err = ret; ++ if (ret) + goto fail_qgroup; +- } + } + + fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true); + if (IS_ERR(fs_info->fs_root)) { +- err = PTR_ERR(fs_info->fs_root); +- btrfs_warn(fs_info, "failed to read fs tree: %d", 
err); ++ ret = PTR_ERR(fs_info->fs_root); ++ btrfs_warn(fs_info, "failed to read fs tree: %d", ret); + fs_info->fs_root = NULL; + goto fail_qgroup; + } +@@ -3797,7 +3788,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device + iput(fs_info->btree_inode); + fail: + btrfs_close_devices(fs_info->fs_devices); +- return err; ++ ASSERT(ret < 0); ++ return ret; + } + ALLOW_ERROR_INJECTION(open_ctree, ERRNO); + +@@ -4094,6 +4086,8 @@ static void write_dev_flush(struct btrfs_device *device) + { + struct bio *bio = &device->flush_bio; + ++ device->last_flush_error = BLK_STS_OK; ++ + #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY + /* + * When a disk has write caching disabled, we skip submission of a bio +@@ -4122,25 +4116,24 @@ static void write_dev_flush(struct btrfs_device *device) + + /* + * If the flush bio has been submitted by write_dev_flush, wait for it. ++ * Return true for any error, and false otherwise. + */ +-static blk_status_t wait_dev_flush(struct btrfs_device *device) ++static bool wait_dev_flush(struct btrfs_device *device) + { + struct bio *bio = &device->flush_bio; + +- if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) +- return BLK_STS_OK; ++ if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) ++ return false; + +- clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); + wait_for_completion_io(&device->flush_wait); + +- return bio->bi_status; +-} ++ if (bio->bi_status) { ++ device->last_flush_error = bio->bi_status; ++ btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); ++ return true; ++ } + +-static int check_barrier_error(struct btrfs_fs_info *fs_info) +-{ +- if (!btrfs_check_rw_degradable(fs_info, NULL)) +- return -EIO; +- return 0; ++ return false; + } + + /* +@@ -4152,7 +4145,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info) + struct list_head *head; + struct btrfs_device *dev; + int errors_wait = 0; +- blk_status_t ret; + + lockdep_assert_held(&info->fs_devices->device_list_mutex); + /* send down all the barriers */ +@@ -4167,7 +4159,6 @@ static int barrier_all_devices(struct btrfs_fs_info *info) + continue; + + write_dev_flush(dev); +- dev->last_flush_error = BLK_STS_OK; + } + + /* wait for all the barriers */ +@@ -4182,23 +4173,17 @@ static int barrier_all_devices(struct btrfs_fs_info *info) + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) + continue; + +- ret = wait_dev_flush(dev); +- if (ret) { +- dev->last_flush_error = ret; +- btrfs_dev_stat_inc_and_print(dev, +- BTRFS_DEV_STAT_FLUSH_ERRS); ++ if (wait_dev_flush(dev)) + errors_wait++; +- } + } + +- if (errors_wait) { +- /* +- * At some point we need the status of all disks +- * to arrive at the volume status. So error checking +- * is being pushed to a separate loop. +- */ +- return check_barrier_error(info); +- } ++ /* ++ * Checks last_flush_error of disks in order to determine the device ++ * state. 
++ */ ++ if (errors_wait && !btrfs_check_rw_degradable(info, NULL)) ++ return -EIO; ++ + return 0; + } + +@@ -4404,12 +4389,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); + if (err) +- break; ++ goto out; + btrfs_put_root(gang[i]); + } + root_objectid++; + } +- ++out: + /* release the uncleaned roots due to error */ + for (; i < ret; i++) { + if (gang[i]) +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 824c657f59e8..5cd289de4e92 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -1894,8 +1894,7 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( + } + + static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, +- struct btrfs_delayed_ref_head *locked_ref, +- unsigned long *run_refs) ++ struct btrfs_delayed_ref_head *locked_ref) + { + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs; +@@ -1917,7 +1916,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, + return -EAGAIN; + } + +- (*run_refs)++; + ref->in_tree = 0; + rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree); + RB_CLEAR_NODE(&ref->ref_node); +@@ -1981,10 +1979,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_delayed_ref_head *locked_ref = NULL; +- ktime_t start = ktime_get(); + int ret; + unsigned long count = 0; +- unsigned long actual_count = 0; + + delayed_refs = &trans->transaction->delayed_refs; + do { +@@ -2014,8 +2010,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + spin_lock(&locked_ref->lock); + btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); + +- ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, +- &actual_count); ++ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); + if (ret < 0 && ret != -EAGAIN) { + /* + * Error, btrfs_run_delayed_refs_for_head already +@@ -2046,24 +2041,6 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + cond_resched(); + } while ((nr != -1 && count < nr) || locked_ref); + +- /* +- * We don't want to include ref heads since we can have empty ref heads +- * and those will drastically skew our runtime down since we just do +- * accounting, no actual extent tree updates. +- */ +- if (actual_count > 0) { +- u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); +- u64 avg; +- +- /* +- * We weigh the current average higher than our current runtime +- * to avoid large swings in the average. +- */ +- spin_lock(&delayed_refs->lock); +- avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; +- fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ +- spin_unlock(&delayed_refs->lock); +- } + return 0; + } + +@@ -5509,11 +5486,11 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + { + int level = wc->level; + int lookup_info = 1; +- int ret; ++ int ret = 0; + + while (level >= 0) { + ret = walk_down_proc(trans, root, path, wc, lookup_info); +- if (ret > 0) ++ if (ret) + break; + + if (level == 0) +@@ -5528,10 +5505,10 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + path->slots[level]++; + continue; + } else if (ret < 0) +- return ret; ++ break; + level = wc->level; + } +- return 0; ++ return (ret == 1) ? 
0 : ret; + } + + static noinline int walk_up_tree(struct btrfs_trans_handle *trans, +@@ -5708,12 +5685,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) + + ret = walk_down_tree(trans, root, path, wc); + if (ret < 0) { ++ btrfs_abort_transaction(trans, ret); + err = ret; + break; + } + + ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL); + if (ret < 0) { ++ btrfs_abort_transaction(trans, ret); + err = ret; + break; + } +diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c +index 40300e8e5f99..a1adadd5d25d 100644 +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -97,11 +97,13 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) + * how many bytes are there before stripe/ordered extent boundary. + */ + struct btrfs_bio_ctrl { +- struct bio *bio; ++ struct btrfs_bio *bbio; + int mirror_num; + enum btrfs_compression_type compress_type; + u32 len_to_oe_boundary; ++ blk_opf_t opf; + btrfs_bio_end_io_t end_io_func; ++ struct writeback_control *wbc; + + /* + * This is for metadata read, to provide the extra needed verification +@@ -117,51 +119,41 @@ struct btrfs_bio_ctrl { + * does the unlocking. + */ + bool extent_locked; +- +- /* Tell the submit_bio code to use REQ_SYNC */ +- bool sync_io; + }; + + static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) + { +- struct bio *bio; +- struct bio_vec *bv; +- struct inode *inode; +- int mirror_num; ++ struct btrfs_bio *bbio = bio_ctrl->bbio; ++ int mirror_num = bio_ctrl->mirror_num; + +- if (!bio_ctrl->bio) ++ if (!bbio) + return; + +- bio = bio_ctrl->bio; +- bv = bio_first_bvec_all(bio); +- inode = bv->bv_page->mapping->host; +- mirror_num = bio_ctrl->mirror_num; +- + /* Caller should ensure the bio has at least some range added */ +- ASSERT(bio->bi_iter.bi_size); ++ ASSERT(bbio->bio.bi_iter.bi_size); + +- if (!is_data_inode(inode)) { +- if (btrfs_op(bio) != BTRFS_MAP_WRITE) { ++ if (!is_data_inode(&bbio->inode->vfs_inode)) { ++ if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) { + /* + * For metadata read, we should have the parent_check, + * and copy it to bbio for metadata verification. 
+ */ + ASSERT(bio_ctrl->parent_check); +- memcpy(&btrfs_bio(bio)->parent_check, ++ memcpy(&bbio->parent_check, + bio_ctrl->parent_check, + sizeof(struct btrfs_tree_parent_check)); + } +- bio->bi_opf |= REQ_META; ++ bbio->bio.bi_opf |= REQ_META; + } + +- if (btrfs_op(bio) == BTRFS_MAP_READ && ++ if (btrfs_op(&bbio->bio) == BTRFS_MAP_READ && + bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) +- btrfs_submit_compressed_read(inode, bio, mirror_num); ++ btrfs_submit_compressed_read(bbio, mirror_num); + else +- btrfs_submit_bio(bio, mirror_num); ++ btrfs_submit_bio(bbio, mirror_num); + +- /* The bio is owned by the end_io handler now */ +- bio_ctrl->bio = NULL; ++ /* The bbio is owned by the end_io handler now */ ++ bio_ctrl->bbio = NULL; + } + + /* +@@ -169,16 +161,16 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) + */ + static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) + { +- struct bio *bio = bio_ctrl->bio; ++ struct btrfs_bio *bbio = bio_ctrl->bbio; + +- if (!bio) ++ if (!bbio) + return; + + if (ret) { + ASSERT(ret < 0); +- btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); ++ btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); + /* The bio is owned by the end_io handler now */ +- bio_ctrl->bio = NULL; ++ bio_ctrl->bbio = NULL; + } else { + submit_one_bio(bio_ctrl); + } +@@ -867,89 +859,52 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) + return 0; + } + +-/* +- * Attempt to add a page to bio. +- * +- * @bio_ctrl: record both the bio, and its bio_flags +- * @page: page to add to the bio +- * @disk_bytenr: offset of the new bio or to check whether we are adding +- * a contiguous page to the previous one +- * @size: portion of page that we want to write +- * @pg_offset: starting offset in the page +- * @compress_type: compression type of the current bio to see if we can merge them +- * +- * Attempt to add a page to bio considering stripe alignment etc. +- * +- * Return >= 0 for the number of bytes added to the bio. +- * Can return 0 if the current bio is already at stripe/zone boundary. +- * Return <0 for error. +- */ +-static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, +- struct page *page, +- u64 disk_bytenr, unsigned int size, +- unsigned int pg_offset, +- enum btrfs_compression_type compress_type) ++static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, ++ struct page *page, u64 disk_bytenr, ++ unsigned int pg_offset) + { +- struct bio *bio = bio_ctrl->bio; +- u32 bio_size = bio->bi_iter.bi_size; +- u32 real_size; ++ struct bio *bio = &bio_ctrl->bbio->bio; ++ struct bio_vec *bvec = bio_last_bvec_all(bio); + const sector_t sector = disk_bytenr >> SECTOR_SHIFT; +- bool contig = false; + +- ASSERT(bio); +- /* The limit should be calculated when bio_ctrl->bio is allocated */ +- ASSERT(bio_ctrl->len_to_oe_boundary); +- if (bio_ctrl->compress_type != compress_type) +- return 0; +- +- +- if (bio->bi_iter.bi_size == 0) { +- /* We can always add a page into an empty bio. */ +- contig = true; +- } else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) { +- struct bio_vec *bvec = bio_last_bvec_all(bio); +- +- /* +- * The contig check requires the following conditions to be met: +- * 1) The pages are belonging to the same inode +- * This is implied by the call chain. +- * +- * 2) The range has adjacent logical bytenr +- * +- * 3) The range has adjacent file offset +- * This is required for the usage of btrfs_bio->file_offset. 
+- */ +- if (bio_end_sector(bio) == sector && +- page_offset(bvec->bv_page) + bvec->bv_offset + +- bvec->bv_len == page_offset(page) + pg_offset) +- contig = true; +- } else { ++ if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { + /* +- * For compression, all IO should have its logical bytenr +- * set to the starting bytenr of the compressed extent. ++ * For compression, all IO should have its logical bytenr set ++ * to the starting bytenr of the compressed extent. + */ +- contig = bio->bi_iter.bi_sector == sector; ++ return bio->bi_iter.bi_sector == sector; + } + +- if (!contig) +- return 0; +- +- real_size = min(bio_ctrl->len_to_oe_boundary - bio_size, size); +- + /* +- * If real_size is 0, never call bio_add_*_page(), as even size is 0, +- * bio will still execute its endio function on the page! ++ * The contig check requires the following conditions to be met: ++ * ++ * 1) The pages are belonging to the same inode ++ * This is implied by the call chain. ++ * ++ * 2) The range has adjacent logical bytenr ++ * ++ * 3) The range has adjacent file offset ++ * This is required for the usage of btrfs_bio->file_offset. + */ +- if (real_size == 0) +- return 0; +- +- return bio_add_page(bio, page, real_size, pg_offset); ++ return bio_end_sector(bio) == sector && ++ page_offset(bvec->bv_page) + bvec->bv_offset + bvec->bv_len == ++ page_offset(page) + pg_offset; + } + +-static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, +- struct btrfs_inode *inode, u64 file_offset) ++static void alloc_new_bio(struct btrfs_inode *inode, ++ struct btrfs_bio_ctrl *bio_ctrl, ++ u64 disk_bytenr, u64 file_offset) + { +- struct btrfs_ordered_extent *ordered; ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct btrfs_bio *bbio; ++ ++ bbio = btrfs_bio_alloc(BIO_MAX_VECS, bio_ctrl->opf, fs_info, ++ bio_ctrl->end_io_func, NULL); ++ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; ++ bbio->inode = inode; ++ bbio->file_offset = file_offset; ++ bio_ctrl->bbio = bbio; ++ bio_ctrl->len_to_oe_boundary = U32_MAX; + + /* + * Limit the extent to the ordered boundary for Zone Append. +@@ -957,132 +912,89 @@ static void calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, + * them. + */ + if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE && +- btrfs_use_zone_append(btrfs_bio(bio_ctrl->bio))) { ++ btrfs_use_zone_append(bbio)) { ++ struct btrfs_ordered_extent *ordered; ++ + ordered = btrfs_lookup_ordered_extent(inode, file_offset); + if (ordered) { + bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX, + ordered->file_offset + + ordered->disk_num_bytes - file_offset); + btrfs_put_ordered_extent(ordered); +- return; + } + } + +- bio_ctrl->len_to_oe_boundary = U32_MAX; +-} +- +-static void alloc_new_bio(struct btrfs_inode *inode, +- struct btrfs_bio_ctrl *bio_ctrl, +- struct writeback_control *wbc, blk_opf_t opf, +- u64 disk_bytenr, u32 offset, u64 file_offset, +- enum btrfs_compression_type compress_type) +-{ +- struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct bio *bio; +- +- bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, inode, bio_ctrl->end_io_func, +- NULL); +- /* +- * For compressed page range, its disk_bytenr is always @disk_bytenr +- * passed in, no matter if we have added any range into previous bio. 
+- */ +- if (compress_type != BTRFS_COMPRESS_NONE) +- bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; +- else +- bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; +- btrfs_bio(bio)->file_offset = file_offset; +- bio_ctrl->bio = bio; +- bio_ctrl->compress_type = compress_type; +- calc_bio_boundaries(bio_ctrl, inode, file_offset); +- +- if (wbc) { ++ if (bio_ctrl->wbc) { + /* + * Pick the last added device to support cgroup writeback. For + * multi-device file systems this means blk-cgroup policies have + * to always be set on the last added/replaced device. + * This is a bit odd but has been like that for a long time. + */ +- bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); +- wbc_init_bio(wbc, bio); ++ bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); ++ wbc_init_bio(bio_ctrl->wbc, &bbio->bio); + } + } + + /* +- * @opf: bio REQ_OP_* and REQ_* flags as one value +- * @wbc: optional writeback control for io accounting + * @disk_bytenr: logical bytenr where the write will be + * @page: page to add to the bio + * @size: portion of page that we want to write to + * @pg_offset: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one +- * @compress_type: compress type for current bio + * +- * The will either add the page into the existing @bio_ctrl->bio, or allocate a +- * new one in @bio_ctrl->bio. ++ * The will either add the page into the existing @bio_ctrl->bbio, or allocate a ++ * new one in @bio_ctrl->bbio. + * The mirror number for this IO should already be initizlied in + * @bio_ctrl->mirror_num. + */ +-static int submit_extent_page(blk_opf_t opf, +- struct writeback_control *wbc, +- struct btrfs_bio_ctrl *bio_ctrl, +- u64 disk_bytenr, struct page *page, +- size_t size, unsigned long pg_offset, +- enum btrfs_compression_type compress_type, +- bool force_bio_submit) ++static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, ++ u64 disk_bytenr, struct page *page, ++ size_t size, unsigned long pg_offset) + { + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); +- unsigned int cur = pg_offset; +- +- ASSERT(bio_ctrl); +- +- ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && +- pg_offset + size <= PAGE_SIZE); + ++ ASSERT(pg_offset + size <= PAGE_SIZE); + ASSERT(bio_ctrl->end_io_func); + +- if (force_bio_submit) ++ if (bio_ctrl->bbio && ++ !btrfs_bio_is_contig(bio_ctrl, page, disk_bytenr, pg_offset)) + submit_one_bio(bio_ctrl); + +- while (cur < pg_offset + size) { +- u32 offset = cur - pg_offset; +- int added; ++ do { ++ u32 len = size; + + /* Allocate new bio if needed */ +- if (!bio_ctrl->bio) { +- alloc_new_bio(inode, bio_ctrl, wbc, opf, disk_bytenr, +- offset, page_offset(page) + cur, +- compress_type); ++ if (!bio_ctrl->bbio) { ++ alloc_new_bio(inode, bio_ctrl, disk_bytenr, ++ page_offset(page) + pg_offset); + } +- /* +- * We must go through btrfs_bio_add_page() to ensure each +- * page range won't cross various boundaries. 
+- */ +- if (compress_type != BTRFS_COMPRESS_NONE) +- added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, +- size - offset, pg_offset + offset, +- compress_type); +- else +- added = btrfs_bio_add_page(bio_ctrl, page, +- disk_bytenr + offset, size - offset, +- pg_offset + offset, compress_type); +- +- /* Metadata page range should never be split */ +- if (!is_data_inode(&inode->vfs_inode)) +- ASSERT(added == 0 || added == size - offset); +- +- /* At least we added some page, update the account */ +- if (wbc && added) +- wbc_account_cgroup_owner(wbc, page, added); +- +- /* We have reached boundary, submit right now */ +- if (added < size - offset) { +- /* The bio should contain some page(s) */ +- ASSERT(bio_ctrl->bio->bi_iter.bi_size); ++ ++ /* Cap to the current ordered extent boundary if there is one. */ ++ if (len > bio_ctrl->len_to_oe_boundary) { ++ ASSERT(bio_ctrl->compress_type == BTRFS_COMPRESS_NONE); ++ ASSERT(is_data_inode(&inode->vfs_inode)); ++ len = bio_ctrl->len_to_oe_boundary; ++ } ++ ++ if (bio_add_page(&bio_ctrl->bbio->bio, page, len, pg_offset) != len) { ++ /* bio full: move on to a new one */ + submit_one_bio(bio_ctrl); ++ continue; + } +- cur += added; +- } +- return 0; ++ ++ if (bio_ctrl->wbc) ++ wbc_account_cgroup_owner(bio_ctrl->wbc, page, len); ++ ++ size -= len; ++ pg_offset += len; ++ disk_bytenr += len; ++ bio_ctrl->len_to_oe_boundary -= len; ++ ++ /* Ordered extent boundary: move on to a new bio. */ ++ if (bio_ctrl->len_to_oe_boundary == 0) ++ submit_one_bio(bio_ctrl); ++ } while (size); + } + + static int attach_extent_buffer_page(struct extent_buffer *eb, +@@ -1193,8 +1105,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, + * return 0 on success, otherwise return error + */ + static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +- struct btrfs_bio_ctrl *bio_ctrl, +- blk_opf_t read_flags, u64 *prev_em_start) ++ struct btrfs_bio_ctrl *bio_ctrl, u64 *prev_em_start) + { + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +@@ -1216,7 +1127,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + unlock_extent(tree, start, end, NULL); + btrfs_page_set_error(fs_info, page, start, PAGE_SIZE); + unlock_page(page); +- goto out; ++ return ret; + } + + if (page->index == last_byte >> PAGE_SHIFT) { +@@ -1230,7 +1141,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + bio_ctrl->end_io_func = end_bio_extent_readpage; + begin_page_read(fs_info, page); + while (cur <= end) { +- unsigned long this_bio_flag = 0; ++ enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; + bool force_bio_submit = false; + u64 disk_bytenr; + +@@ -1247,19 +1158,18 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + if (IS_ERR(em)) { + unlock_extent(tree, cur, end, NULL); + end_page_read(page, false, cur, end + 1 - cur); +- ret = PTR_ERR(em); +- break; ++ return PTR_ERR(em); + } + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) +- this_bio_flag = em->compress_type; ++ compress_type = em->compress_type; + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = ALIGN(iosize, blocksize); +- if (this_bio_flag != BTRFS_COMPRESS_NONE) ++ if (compress_type != BTRFS_COMPRESS_NONE) + disk_bytenr = em->block_start; + else + disk_bytenr = em->block_start + extent_offset; +@@ -1331,24 +1241,20 @@ 
static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + continue; + } + +- ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, +- bio_ctrl, disk_bytenr, page, iosize, +- pg_offset, this_bio_flag, +- force_bio_submit); +- if (ret) { +- /* +- * We have to unlock the remaining range, or the page +- * will never be unlocked. +- */ +- unlock_extent(tree, cur, end, NULL); +- end_page_read(page, false, cur, end + 1 - cur); +- goto out; ++ if (bio_ctrl->compress_type != compress_type) { ++ submit_one_bio(bio_ctrl); ++ bio_ctrl->compress_type = compress_type; + } ++ ++ if (force_bio_submit) ++ submit_one_bio(bio_ctrl); ++ submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, ++ pg_offset); + cur = cur + iosize; + pg_offset += iosize; + } +-out: +- return ret; ++ ++ return 0; + } + + int btrfs_read_folio(struct file *file, struct folio *folio) +@@ -1357,12 +1263,12 @@ int btrfs_read_folio(struct file *file, struct folio *folio) + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; +- struct btrfs_bio_ctrl bio_ctrl = { 0 }; ++ struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ }; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + +- ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); ++ ret = btrfs_do_readpage(page, NULL, &bio_ctrl, NULL); + /* + * If btrfs_do_readpage() failed we will want to submit the assembled + * bio to do the cleanup. +@@ -1384,7 +1290,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, + + for (index = 0; index < nr_pages; index++) { + btrfs_do_readpage(pages[index], em_cached, bio_ctrl, +- REQ_RAHEAD, prev_em_start); ++ prev_em_start); + put_page(pages[index]); + } + } +@@ -1520,7 +1426,6 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, + */ + static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + struct page *page, +- struct writeback_control *wbc, + struct btrfs_bio_ctrl *bio_ctrl, + loff_t i_size, + int *nr_ret) +@@ -1531,18 +1436,14 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + u64 extent_offset; + u64 block_start; + struct extent_map *em; +- int saved_ret = 0; + int ret = 0; + int nr = 0; +- enum req_op op = REQ_OP_WRITE; +- const blk_opf_t write_flags = wbc_to_write_flags(wbc); +- bool has_error = false; + bool compressed; + + ret = btrfs_writepage_cow_fixup(page); + if (ret) { + /* Fixup worker will requeue */ +- redirty_page_for_writepage(wbc, page); ++ redirty_page_for_writepage(bio_ctrl->wbc, page); + unlock_page(page); + return 1; + } +@@ -1551,7 +1452,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + * we don't want to touch the inode after unlocking the page, + * so we update the mapping writeback index now + */ +- wbc->nr_to_write--; ++ bio_ctrl->wbc->nr_to_write--; + + bio_ctrl->end_io_func = end_bio_extent_writepage; + while (cur <= end) { +@@ -1587,10 +1488,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + if (IS_ERR(em)) { + btrfs_page_set_error(fs_info, page, cur, end - cur + 1); + ret = PTR_ERR_OR_ZERO(em); +- has_error = true; +- if (!saved_ret) +- saved_ret = ret; +- break; ++ goto out_error; + } + + extent_offset = cur - em->start; +@@ -1642,33 +1540,21 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + */ + btrfs_page_clear_dirty(fs_info, page, cur, iosize); + +- ret = submit_extent_page(op | 
write_flags, wbc, +- bio_ctrl, disk_bytenr, +- page, iosize, +- cur - page_offset(page), +- 0, false); +- if (ret) { +- has_error = true; +- if (!saved_ret) +- saved_ret = ret; +- +- btrfs_page_set_error(fs_info, page, cur, iosize); +- if (PageWriteback(page)) +- btrfs_page_clear_writeback(fs_info, page, cur, +- iosize); +- } +- ++ submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, ++ cur - page_offset(page)); + cur += iosize; + nr++; + } ++ ++ btrfs_page_assert_not_dirty(fs_info, page); ++ *nr_ret = nr; ++ return 0; ++ ++out_error: + /* + * If we finish without problem, we should not only clear page dirty, + * but also empty subpage dirty bits + */ +- if (!has_error) +- btrfs_page_assert_not_dirty(fs_info, page); +- else +- ret = saved_ret; + *nr_ret = nr; + return ret; + } +@@ -1682,8 +1568,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, + * Return 0 if everything goes well. + * Return <0 for error. + */ +-static int __extent_writepage(struct page *page, struct writeback_control *wbc, +- struct btrfs_bio_ctrl *bio_ctrl) ++static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) + { + struct folio *folio = page_folio(page); + struct inode *inode = page->mapping->host; +@@ -1696,7 +1581,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_SHIFT; + +- trace___extent_writepage(page, inode, wbc); ++ trace___extent_writepage(page, inode, bio_ctrl->wbc); + + WARN_ON(!PageLocked(page)); + +@@ -1721,15 +1606,14 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + } + + if (!bio_ctrl->extent_locked) { +- ret = writepage_delalloc(BTRFS_I(inode), page, wbc); ++ ret = writepage_delalloc(BTRFS_I(inode), page, bio_ctrl->wbc); + if (ret == 1) + return 0; + if (ret) + goto done; + } + +- ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size, +- &nr); ++ ret = __extent_writepage_io(BTRFS_I(inode), page, bio_ctrl, i_size, &nr); + if (ret == 1) + return 0; + +@@ -1773,6 +1657,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, + if (PageError(page)) + end_extent_writepage(page, ret, page_start, page_end); + if (bio_ctrl->extent_locked) { ++ struct writeback_control *wbc = bio_ctrl->wbc; ++ + /* + * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), + * the page can either be locked by lock_page() or +@@ -1828,7 +1714,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb + + if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { + btrfs_tree_unlock(eb); +- if (!bio_ctrl->sync_io) ++ if (bio_ctrl->wbc->sync_mode != WB_SYNC_ALL) + return 0; + if (!flush) { + submit_write_bio(bio_ctrl, 0); +@@ -2113,15 +1999,12 @@ static void prepare_eb_write(struct extent_buffer *eb) + * Unlike the work in write_one_eb(), we rely completely on extent locking. + * Page locking is only utilized at minimum to keep the VMM code happy. 
+ */ +-static int write_one_subpage_eb(struct extent_buffer *eb, +- struct writeback_control *wbc, +- struct btrfs_bio_ctrl *bio_ctrl) ++static void write_one_subpage_eb(struct extent_buffer *eb, ++ struct btrfs_bio_ctrl *bio_ctrl) + { + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page = eb->pages[0]; +- blk_opf_t write_flags = wbc_to_write_flags(wbc); + bool no_dirty_ebs = false; +- int ret; + + prepare_eb_write(eb); + +@@ -2137,36 +2020,22 @@ static int write_one_subpage_eb(struct extent_buffer *eb, + + bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; + +- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, +- bio_ctrl, eb->start, page, eb->len, +- eb->start - page_offset(page), 0, false); +- if (ret) { +- btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); +- set_btree_ioerr(page, eb); +- unlock_page(page); +- +- if (atomic_dec_and_test(&eb->io_pages)) +- end_extent_buffer_writeback(eb); +- return -EIO; +- } ++ submit_extent_page(bio_ctrl, eb->start, page, eb->len, ++ eb->start - page_offset(page)); + unlock_page(page); + /* + * Submission finished without problem, if no range of the page is + * dirty anymore, we have submitted a page. Update nr_written in wbc. + */ + if (no_dirty_ebs) +- wbc->nr_to_write--; +- return ret; ++ bio_ctrl->wbc->nr_to_write--; + } + +-static noinline_for_stack int write_one_eb(struct extent_buffer *eb, +- struct writeback_control *wbc, ++static noinline_for_stack void write_one_eb(struct extent_buffer *eb, + struct btrfs_bio_ctrl *bio_ctrl) + { + u64 disk_bytenr = eb->start; + int i, num_pages; +- blk_opf_t write_flags = wbc_to_write_flags(wbc); +- int ret = 0; + + prepare_eb_write(eb); + +@@ -2178,32 +2047,11 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, + + clear_page_dirty_for_io(p); + set_page_writeback(p); +- ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, +- bio_ctrl, disk_bytenr, p, +- PAGE_SIZE, 0, 0, false); +- if (ret) { +- set_btree_ioerr(p, eb); +- if (PageWriteback(p)) +- end_page_writeback(p); +- if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) +- end_extent_buffer_writeback(eb); +- ret = -EIO; +- break; +- } ++ submit_extent_page(bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0); + disk_bytenr += PAGE_SIZE; +- wbc->nr_to_write--; ++ bio_ctrl->wbc->nr_to_write--; + unlock_page(p); + } +- +- if (unlikely(ret)) { +- for (; i < num_pages; i++) { +- struct page *p = eb->pages[i]; +- clear_page_dirty_for_io(p); +- unlock_page(p); +- } +- } +- +- return ret; + } + + /* +@@ -2220,9 +2068,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, + * Return >=0 for the number of submitted extent buffers. + * Return <0 for fatal error. + */ +-static int submit_eb_subpage(struct page *page, +- struct writeback_control *wbc, +- struct btrfs_bio_ctrl *bio_ctrl) ++static int submit_eb_subpage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl) + { + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + int submitted = 0; +@@ -2284,10 +2130,8 @@ static int submit_eb_subpage(struct page *page, + free_extent_buffer(eb); + goto cleanup; + } +- ret = write_one_subpage_eb(eb, wbc, bio_ctrl); ++ write_one_subpage_eb(eb, bio_ctrl); + free_extent_buffer(eb); +- if (ret < 0) +- goto cleanup; + submitted++; + } + return submitted; +@@ -2318,8 +2162,7 @@ static int submit_eb_subpage(struct page *page, + * previous call. + * Return <0 for fatal error. 
+ */ +-static int submit_eb_page(struct page *page, struct writeback_control *wbc, +- struct btrfs_bio_ctrl *bio_ctrl, ++static int submit_eb_page(struct page *page, struct btrfs_bio_ctrl *bio_ctrl, + struct extent_buffer **eb_context) + { + struct address_space *mapping = page->mapping; +@@ -2331,7 +2174,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, + return 0; + + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) +- return submit_eb_subpage(page, wbc, bio_ctrl); ++ return submit_eb_subpage(page, bio_ctrl); + + spin_lock(&mapping->private_lock); + if (!PagePrivate(page)) { +@@ -2364,7 +2207,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, + * If for_sync, this hole will be filled with + * trasnsaction commit. + */ +- if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) ++ if (bio_ctrl->wbc->sync_mode == WB_SYNC_ALL && ++ !bio_ctrl->wbc->for_sync) + ret = -EAGAIN; + else + ret = 0; +@@ -2389,10 +2233,8 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, + btrfs_schedule_zone_finish_bg(cache, eb); + btrfs_put_block_group(cache); + } +- ret = write_one_eb(eb, wbc, bio_ctrl); ++ write_one_eb(eb, bio_ctrl); + free_extent_buffer(eb); +- if (ret < 0) +- return ret; + return 1; + } + +@@ -2401,8 +2243,9 @@ int btree_write_cache_pages(struct address_space *mapping, + { + struct extent_buffer *eb_context = NULL; + struct btrfs_bio_ctrl bio_ctrl = { ++ .wbc = wbc, ++ .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), + .extent_locked = 0, +- .sync_io = (wbc->sync_mode == WB_SYNC_ALL), + }; + struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; + int ret = 0; +@@ -2445,8 +2288,7 @@ int btree_write_cache_pages(struct address_space *mapping, + for (i = 0; i < nr_folios; i++) { + struct folio *folio = fbatch.folios[i]; + +- ret = submit_eb_page(&folio->page, wbc, &bio_ctrl, +- &eb_context); ++ ret = submit_eb_page(&folio->page, &bio_ctrl, &eb_context); + if (ret == 0) + continue; + if (ret < 0) { +@@ -2529,9 +2371,9 @@ int btree_write_cache_pages(struct address_space *mapping, + * existing IO to complete. 
+ */ + static int extent_write_cache_pages(struct address_space *mapping, +- struct writeback_control *wbc, + struct btrfs_bio_ctrl *bio_ctrl) + { ++ struct writeback_control *wbc = bio_ctrl->wbc; + struct inode *inode = mapping->host; + int ret = 0; + int done = 0; +@@ -2632,7 +2474,7 @@ static int extent_write_cache_pages(struct address_space *mapping, + continue; + } + +- ret = __extent_writepage(&folio->page, wbc, bio_ctrl); ++ ret = __extent_writepage(&folio->page, bio_ctrl); + if (ret < 0) { + done = 1; + break; +@@ -2688,18 +2530,19 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) + u64 cur = start; + unsigned long nr_pages; + const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; +- struct btrfs_bio_ctrl bio_ctrl = { +- .extent_locked = 1, +- .sync_io = 1, +- }; + struct writeback_control wbc_writepages = { + .sync_mode = WB_SYNC_ALL, + .range_start = start, + .range_end = end + 1, +- /* We're called from an async helper function */ +- .punt_to_cgroup = 1, + .no_cgroup_owner = 1, + }; ++ struct btrfs_bio_ctrl bio_ctrl = { ++ .wbc = &wbc_writepages, ++ /* We're called from an async helper function */ ++ .opf = REQ_OP_WRITE | REQ_BTRFS_CGROUP_PUNT | ++ wbc_to_write_flags(&wbc_writepages), ++ .extent_locked = 1, ++ }; + + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); + nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> +@@ -2719,7 +2562,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) + ASSERT(PageLocked(page)); + ASSERT(PageDirty(page)); + clear_page_dirty_for_io(page); +- ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl); ++ ret = __extent_writepage(page, &bio_ctrl); + ASSERT(ret <= 0); + if (ret < 0) { + found_error = true; +@@ -2743,8 +2586,9 @@ int extent_writepages(struct address_space *mapping, + struct inode *inode = mapping->host; + int ret = 0; + struct btrfs_bio_ctrl bio_ctrl = { ++ .wbc = wbc, ++ .opf = REQ_OP_WRITE | wbc_to_write_flags(wbc), + .extent_locked = 0, +- .sync_io = (wbc->sync_mode == WB_SYNC_ALL), + }; + + /* +@@ -2752,7 +2596,7 @@ int extent_writepages(struct address_space *mapping, + * protect the write pointer updates. 
+ */ + btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); +- ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl); ++ ret = extent_write_cache_pages(mapping, &bio_ctrl); + submit_write_bio(&bio_ctrl, ret); + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); + return ret; +@@ -2760,7 +2604,7 @@ int extent_writepages(struct address_space *mapping, + + void extent_readahead(struct readahead_control *rac) + { +- struct btrfs_bio_ctrl bio_ctrl = { 0 }; ++ struct btrfs_bio_ctrl bio_ctrl = { .opf = REQ_OP_READ | REQ_RAHEAD }; + struct page *pagepool[16]; + struct extent_map *em_cached = NULL; + u64 prev_em_start = (u64)-1; +@@ -4407,10 +4251,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, + struct page *page = eb->pages[0]; + struct extent_state *cached_state = NULL; + struct btrfs_bio_ctrl bio_ctrl = { ++ .opf = REQ_OP_READ, + .mirror_num = mirror_num, + .parent_check = check, + }; +- int ret = 0; ++ int ret; + + ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); + ASSERT(PagePrivate(page)); +@@ -4428,14 +4273,13 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, + return ret; + } + +- ret = 0; + if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) || + PageUptodate(page) || + btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { + set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); +- return ret; ++ return 0; + } + + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); +@@ -4447,28 +4291,19 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, + btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); + + btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); +- ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, +- eb->start, page, eb->len, +- eb->start - page_offset(page), 0, true); +- if (ret) { +- /* +- * In the endio function, if we hit something wrong we will +- * increase the io_pages, so here we need to decrease it for +- * error path. 
+- */ +- atomic_dec(&eb->io_pages); +- } ++ submit_extent_page(&bio_ctrl, eb->start, page, eb->len, ++ eb->start - page_offset(page)); + submit_one_bio(&bio_ctrl); +- if (ret || wait != WAIT_COMPLETE) { ++ if (wait != WAIT_COMPLETE) { + free_extent_state(cached_state); +- return ret; ++ return 0; + } + + wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, + EXTENT_LOCKED, &cached_state); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) +- ret = -EIO; +- return ret; ++ return -EIO; ++ return 0; + } + + int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, +@@ -4476,13 +4311,12 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + { + int i; + struct page *page; +- int err; +- int ret = 0; + int locked_pages = 0; + int all_uptodate = 1; + int num_pages; + unsigned long num_reads = 0; + struct btrfs_bio_ctrl bio_ctrl = { ++ .opf = REQ_OP_READ, + .mirror_num = mirror_num, + .parent_check = check, + }; +@@ -4550,27 +4384,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + page = eb->pages[i]; + + if (!PageUptodate(page)) { +- if (ret) { +- atomic_dec(&eb->io_pages); +- unlock_page(page); +- continue; +- } +- + ClearPageError(page); +- err = submit_extent_page(REQ_OP_READ, NULL, +- &bio_ctrl, page_offset(page), page, +- PAGE_SIZE, 0, 0, false); +- if (err) { +- /* +- * We failed to submit the bio so it's the +- * caller's responsibility to perform cleanup +- * i.e unlock page/set error bit. +- */ +- ret = err; +- SetPageError(page); +- unlock_page(page); +- atomic_dec(&eb->io_pages); +- } ++ submit_extent_page(&bio_ctrl, page_offset(page), page, ++ PAGE_SIZE, 0); + } else { + unlock_page(page); + } +@@ -4578,17 +4394,17 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + + submit_one_bio(&bio_ctrl); + +- if (ret || wait != WAIT_COMPLETE) +- return ret; ++ if (wait != WAIT_COMPLETE) ++ return 0; + + for (i = 0; i < num_pages; i++) { + page = eb->pages[i]; + wait_on_page_locked(page); + if (!PageUptodate(page)) +- ret = -EIO; ++ return -EIO; + } + +- return ret; ++ return 0; + + unlock_exit: + while (locked_pages > 0) { +@@ -4596,7 +4412,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + page = eb->pages[locked_pages]; + unlock_page(page); + } +- return ret; ++ return 0; + } + + static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, +diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c +index 41c77a100853..018c711a0bc8 100644 +--- a/fs/btrfs/file-item.c ++++ b/fs/btrfs/file-item.c +@@ -335,48 +335,6 @@ static int search_csum_tree(struct btrfs_fs_info *fs_info, + return ret; + } + +-/* +- * Locate the file_offset of @cur_disk_bytenr of a @bio. +- * +- * Bio of btrfs represents read range of +- * [bi_sector << 9, bi_sector << 9 + bi_size). +- * Knowing this, we can iterate through each bvec to locate the page belong to +- * @cur_disk_bytenr and get the file offset. +- * +- * @inode is used to determine if the bvec page really belongs to @inode. 
+- * +- * Return 0 if we can't find the file offset +- * Return >0 if we find the file offset and restore it to @file_offset_ret +- */ +-static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, +- u64 disk_bytenr, u64 *file_offset_ret) +-{ +- struct bvec_iter iter; +- struct bio_vec bvec; +- u64 cur = bio->bi_iter.bi_sector << SECTOR_SHIFT; +- int ret = 0; +- +- bio_for_each_segment(bvec, bio, iter) { +- struct page *page = bvec.bv_page; +- +- if (cur > disk_bytenr) +- break; +- if (cur + bvec.bv_len <= disk_bytenr) { +- cur += bvec.bv_len; +- continue; +- } +- ASSERT(in_range(disk_bytenr, cur, bvec.bv_len)); +- if (page->mapping && page->mapping->host && +- page->mapping->host == inode) { +- ret = 1; +- *file_offset_ret = page_offset(page) + bvec.bv_offset + +- disk_bytenr - cur; +- break; +- } +- } +- return ret; +-} +- + /* + * Lookup the checksum for the read bio in csum tree. + * +@@ -386,17 +344,15 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) + { + struct btrfs_inode *inode = bbio->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; +- struct extent_io_tree *io_tree = &inode->io_tree; + struct bio *bio = &bbio->bio; + struct btrfs_path *path; + const u32 sectorsize = fs_info->sectorsize; + const u32 csum_size = fs_info->csum_size; + u32 orig_len = bio->bi_iter.bi_size; + u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; +- u64 cur_disk_bytenr; + const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; +- int count = 0; + blk_status_t ret = BLK_STS_OK; ++ u32 bio_offset = 0; + + if ((inode->flags & BTRFS_INODE_NODATASUM) || + test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) +@@ -447,28 +403,14 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) + path->skip_locking = 1; + } + +- for (cur_disk_bytenr = orig_disk_bytenr; +- cur_disk_bytenr < orig_disk_bytenr + orig_len; +- cur_disk_bytenr += (count * sectorsize)) { +- u64 search_len = orig_disk_bytenr + orig_len - cur_disk_bytenr; +- unsigned int sector_offset; +- u8 *csum_dst; +- +- /* +- * Although both cur_disk_bytenr and orig_disk_bytenr is u64, +- * we're calculating the offset to the bio start. +- * +- * Bio size is limited to UINT_MAX, thus unsigned int is large +- * enough to contain the raw result, not to mention the right +- * shifted result. 
+- */ +- ASSERT(cur_disk_bytenr - orig_disk_bytenr < UINT_MAX); +- sector_offset = (cur_disk_bytenr - orig_disk_bytenr) >> +- fs_info->sectorsize_bits; +- csum_dst = bbio->csum + sector_offset * csum_size; ++ while (bio_offset < orig_len) { ++ int count; ++ u64 cur_disk_bytenr = orig_disk_bytenr + bio_offset; ++ u8 *csum_dst = bbio->csum + ++ (bio_offset >> fs_info->sectorsize_bits) * csum_size; + + count = search_csum_tree(fs_info, path, cur_disk_bytenr, +- search_len, csum_dst); ++ orig_len - bio_offset, csum_dst); + if (count < 0) { + ret = errno_to_blk_status(count); + if (bbio->csum != bbio->csum_inline) +@@ -493,14 +435,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) + + if (inode->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { +- u64 file_offset; +- int ret; +- +- ret = search_file_offset_in_bio(bio, +- &inode->vfs_inode, +- cur_disk_bytenr, &file_offset); +- if (ret) +- set_extent_bits(io_tree, file_offset, ++ u64 file_offset = bbio->file_offset + bio_offset; ++ ++ set_extent_bits(&inode->io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM); + } else { +@@ -509,6 +446,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) + cur_disk_bytenr, cur_disk_bytenr + sectorsize); + } + } ++ bio_offset += count * sectorsize; + } + + btrfs_free_path(path); +@@ -659,7 +597,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + * in is large enough to contain all csums. + */ + int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, +- u8 *csum_buf, unsigned long *csum_bitmap) ++ u8 *csum_buf, unsigned long *csum_bitmap, ++ bool search_commit) + { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; +@@ -676,6 +615,12 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + if (!path) + return -ENOMEM; + ++ if (search_commit) { ++ path->skip_locking = 1; ++ path->reada = READA_FORWARD; ++ path->search_commit_root = 1; ++ } ++ + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; +diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h +index cd7f2ae515c0..6be8725cd574 100644 +--- a/fs/btrfs/file-item.h ++++ b/fs/btrfs/file-item.h +@@ -57,7 +57,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); + int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, +- u8 *csum_buf, unsigned long *csum_bitmap); ++ u8 *csum_buf, unsigned long *csum_bitmap, ++ bool search_commit); + void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, +diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h +index 24cd49229408..0d98fc5f6f44 100644 +--- a/fs/btrfs/fs.h ++++ b/fs/btrfs/fs.h +@@ -24,6 +24,18 @@ + #define BTRFS_SUPER_INFO_SIZE 4096 + static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); + ++/* ++ * Number of metadata items necessary for an unlink operation: ++ * ++ * 1 for the possible orphan item ++ * 1 for the dir item ++ * 1 for the dir index ++ * 1 for the inode ref ++ * 1 for the inode ++ * 1 for the parent inode ++ */ ++#define BTRFS_UNLINK_METADATA_UNITS 6 ++ + /* + * The reserved space at the beginning of each device. 
It covers the primary + * super block and leaves space for potential use by other tools like +@@ -193,11 +205,7 @@ enum { + #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL + #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + +-#ifdef CONFIG_BTRFS_DEBUG +-/* +- * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG +- */ +-#define BTRFS_FEATURE_INCOMPAT_SUPP \ ++#define BTRFS_FEATURE_INCOMPAT_SUPP_STABLE \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ +@@ -210,23 +218,22 @@ enum { + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ +- BTRFS_FEATURE_INCOMPAT_ZONED | \ ++ BTRFS_FEATURE_INCOMPAT_ZONED) ++ ++#ifdef CONFIG_BTRFS_DEBUG ++ /* ++ * Features under developmen like Extent tree v2 support is enabled ++ * only under CONFIG_BTRFS_DEBUG. ++ */ ++#define BTRFS_FEATURE_INCOMPAT_SUPP \ ++ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) ++ + #else +-#define BTRFS_FEATURE_INCOMPAT_SUPP \ +- (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ +- BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ +- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ +- BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ +- BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ +- BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ +- BTRFS_FEATURE_INCOMPAT_RAID56 | \ +- BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ +- BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ +- BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ +- BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ +- BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ +- BTRFS_FEATURE_INCOMPAT_ZONED) ++ ++#define BTRFS_FEATURE_INCOMPAT_SUPP \ ++ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE) ++ + #endif + + #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ +@@ -412,7 +419,6 @@ struct btrfs_fs_info { + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; +- u64 avg_delayed_ref_runtime; + + /* + * This is updated to the current trans every time a full commit is +@@ -638,7 +644,6 @@ struct btrfs_fs_info { + refcount_t scrub_workers_refcnt; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; +- struct workqueue_struct *scrub_parity_workers; + struct btrfs_subpage_info *subpage_info; + + struct btrfs_discard_ctl discard_ctl; +@@ -828,7 +833,7 @@ static inline u64 btrfs_csum_bytes_to_leaves( + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +-static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, ++static inline u64 btrfs_calc_insert_metadata_size(const struct btrfs_fs_info *fs_info, + unsigned num_items) + { + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; +@@ -838,7 +843,7 @@ static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. 
+ */ +-static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, ++static inline u64 btrfs_calc_metadata_size(const struct btrfs_fs_info *fs_info, + unsigned num_items) + { + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; +diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c +index b65c45b5d681..4c322b720a80 100644 +--- a/fs/btrfs/inode-item.c ++++ b/fs/btrfs/inode-item.c +@@ -527,7 +527,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + + while (1) { + u64 clear_start = 0, clear_len = 0, extent_start = 0; +- bool should_throttle = false; ++ bool refill_delayed_refs_rsv = false; + + fi = NULL; + leaf = path->nodes[0]; +@@ -660,8 +660,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + /* No pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; +- } else if (pending_del_nr && +- path->slots[0] + 1 == pending_del_slot) { ++ } else if (path->slots[0] + 1 == pending_del_slot) { + /* Hop on the pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; +@@ -686,10 +685,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + btrfs_abort_transaction(trans, ret); + break; + } +- if (be_nice) { +- if (btrfs_should_throttle_delayed_refs(trans)) +- should_throttle = true; +- } ++ if (be_nice && btrfs_check_space_for_delayed_refs(fs_info)) ++ refill_delayed_refs_rsv = true; + } + + if (found_type == BTRFS_INODE_ITEM_KEY) +@@ -697,7 +694,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot || +- should_throttle) { ++ refill_delayed_refs_rsv) { + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, +@@ -720,7 +717,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + * actually allocate, so just bail if we're short and + * let the normal reservation dance happen higher up. 
+ */ +- if (should_throttle) { ++ if (refill_delayed_refs_rsv) { + ret = btrfs_delayed_refs_rsv_refill(fs_info, + BTRFS_RESERVE_NO_FLUSH); + if (ret) { +diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c +index 957e4d76a7b6..57d070025c7a 100644 +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -79,6 +79,7 @@ struct btrfs_iget_args { + struct btrfs_dio_data { + ssize_t submitted; + struct extent_changeset *data_reserved; ++ struct btrfs_ordered_extent *ordered; + bool data_space_reserved; + bool nocow_done; + }; +@@ -669,8 +670,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) + again: + will_compress = 0; + nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; +- nr_pages = min_t(unsigned long, nr_pages, +- BTRFS_MAX_COMPRESSED / PAGE_SIZE); ++ nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); + + /* + * we don't want to send crud past the end of i_size through +@@ -945,10 +945,9 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, + ret = cow_file_range(inode, locked_page, start, end, &page_started, + &nr_written, 0, NULL); + /* Inline extent inserted, page gets unlocked and everything is done */ +- if (page_started) { +- ret = 0; +- goto out; +- } ++ if (page_started) ++ return 0; ++ + if (ret < 0) { + btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); + if (locked_page) { +@@ -962,14 +961,11 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, + end_extent_writepage(locked_page, ret, page_start, page_end); + unlock_page(locked_page); + } +- goto out; ++ return ret; + } + +- ret = extent_write_locked_range(&inode->vfs_inode, start, end); + /* All pages will be unlocked, including @locked_page */ +-out: +- kfree(async_extent); +- return ret; ++ return extent_write_locked_range(&inode->vfs_inode, start, end); + } + + static int submit_one_async_extent(struct btrfs_inode *inode, +@@ -987,6 +983,9 @@ static int submit_one_async_extent(struct btrfs_inode *inode, + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; + ++ if (async_chunk->blkcg_css) ++ kthread_associate_blkcg(async_chunk->blkcg_css); ++ + /* + * If async_chunk->locked_page is in the async_extent range, we need to + * handle it. 
+@@ -1001,8 +1000,10 @@ static int submit_one_async_extent(struct btrfs_inode *inode, + lock_extent(io_tree, start, end, NULL); + + /* We have fall back to uncompressed write */ +- if (!async_extent->pages) +- return submit_uncompressed_range(inode, async_extent, locked_page); ++ if (!async_extent->pages) { ++ ret = submit_uncompressed_range(inode, async_extent, locked_page); ++ goto done; ++ } + + ret = btrfs_reserve_extent(root, async_extent->ram_size, + async_extent->compressed_size, +@@ -1054,24 +1055,18 @@ static int submit_one_async_extent(struct btrfs_inode *inode, + extent_clear_unlock_delalloc(inode, start, end, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK); +- if (btrfs_submit_compressed_write(inode, start, /* file_offset */ ++ ++ btrfs_submit_compressed_write(inode, start, /* file_offset */ + async_extent->ram_size, /* num_bytes */ + ins.objectid, /* disk_bytenr */ + ins.offset, /* compressed_len */ + async_extent->pages, /* compressed_pages */ + async_extent->nr_pages, +- async_chunk->write_flags, +- async_chunk->blkcg_css, true)) { +- const u64 start = async_extent->start; +- const u64 end = start + async_extent->ram_size - 1; +- +- btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0); +- +- extent_clear_unlock_delalloc(inode, start, end, NULL, 0, +- PAGE_END_WRITEBACK | PAGE_SET_ERROR); +- free_async_extent_pages(async_extent); +- } ++ async_chunk->write_flags, true); + *alloc_hint = ins.objectid + ins.offset; ++done: ++ if (async_chunk->blkcg_css) ++ kthread_associate_blkcg(NULL); + kfree(async_extent); + return ret; + +@@ -1086,8 +1081,7 @@ static int submit_one_async_extent(struct btrfs_inode *inode, + PAGE_UNLOCK | PAGE_START_WRITEBACK | + PAGE_END_WRITEBACK | PAGE_SET_ERROR); + free_async_extent_pages(async_extent); +- kfree(async_extent); +- return ret; ++ goto done; + } + + /* +@@ -1622,6 +1616,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, + if (blkcg_css != blkcg_root_css) { + css_get(blkcg_css); + async_chunk[i].blkcg_css = blkcg_css; ++ async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT; + } else { + async_chunk[i].blkcg_css = NULL; + } +@@ -2521,37 +2516,31 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, + } + + /* +- * Split an extent_map at [start, start + len] ++ * Split off the first pre bytes from the extent_map at [start, start + len] + * + * This function is intended to be used only for extract_ordered_extent(). 
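+ */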
+ */ +-static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, +- u64 pre, u64 post) ++static int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre) + { + struct extent_map_tree *em_tree = &inode->extent_tree; + struct extent_map *em; + struct extent_map *split_pre = NULL; + struct extent_map *split_mid = NULL; +- struct extent_map *split_post = NULL; + int ret = 0; + unsigned long flags; + +- /* Sanity check */ +- if (pre == 0 && post == 0) +- return 0; ++ ASSERT(pre != 0); ++ ASSERT(pre < len); + + split_pre = alloc_extent_map(); +- if (pre) +- split_mid = alloc_extent_map(); +- if (post) +- split_post = alloc_extent_map(); +- if (!split_pre || (pre && !split_mid) || (post && !split_post)) { ++ if (!split_pre) ++ return -ENOMEM; ++ split_mid = alloc_extent_map(); ++ if (!split_mid) { + ret = -ENOMEM; +- goto out; ++ goto out_free_pre; + } + +- ASSERT(pre + post < len); +- + lock_extent(&inode->io_tree, start, start + len - 1, NULL); + write_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); +@@ -2572,7 +2561,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, + + /* First, replace the em with a new extent_map starting from * em->start */ + split_pre->start = em->start; +- split_pre->len = (pre ? pre : em->len - post); ++ split_pre->len = pre; + split_pre->orig_start = split_pre->start; + split_pre->block_start = em->block_start; + split_pre->block_len = split_pre->len; +@@ -2586,38 +2575,21 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, + + /* + * Now we only have an extent_map at: +- * [em->start, em->start + pre] if pre != 0 +- * [em->start, em->start + em->len - post] if pre == 0 +- */ +- +- if (pre) { +- /* Insert the middle extent_map */ +- split_mid->start = em->start + pre; +- split_mid->len = em->len - pre - post; +- split_mid->orig_start = split_mid->start; +- split_mid->block_start = em->block_start + pre; +- split_mid->block_len = split_mid->len; +- split_mid->orig_block_len = split_mid->block_len; +- split_mid->ram_bytes = split_mid->len; +- split_mid->flags = flags; +- split_mid->compress_type = em->compress_type; +- split_mid->generation = em->generation; +- add_extent_mapping(em_tree, split_mid, 1); +- } +- +- if (post) { +- split_post->start = em->start + em->len - post; +- split_post->len = post; +- split_post->orig_start = split_post->start; +- split_post->block_start = em->block_start + em->len - post; +- split_post->block_len = split_post->len; +- split_post->orig_block_len = split_post->block_len; +- split_post->ram_bytes = split_post->len; +- split_post->flags = flags; +- split_post->compress_type = em->compress_type; +- split_post->generation = em->generation; +- add_extent_mapping(em_tree, split_post, 1); +- } ++ * [em->start, em->start + pre] ++ */ ++ ++ /* Insert the middle extent_map. 
*/ ++ split_mid->start = em->start + pre; ++ split_mid->len = em->len - pre; ++ split_mid->orig_start = split_mid->start; ++ split_mid->block_start = em->block_start + pre; ++ split_mid->block_len = split_mid->len; ++ split_mid->orig_block_len = split_mid->block_len; ++ split_mid->ram_bytes = split_mid->len; ++ split_mid->flags = flags; ++ split_mid->compress_type = em->compress_type; ++ split_mid->generation = em->generation; ++ add_extent_mapping(em_tree, split_mid, 1); + + /* Once for us */ + free_extent_map(em); +@@ -2627,72 +2599,41 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, + out_unlock: + write_unlock(&em_tree->lock); + unlock_extent(&inode->io_tree, start, start + len - 1, NULL); +-out: +- free_extent_map(split_pre); + free_extent_map(split_mid); +- free_extent_map(split_post); +- ++out_free_pre: ++ free_extent_map(split_pre); + return ret; + } + +-blk_status_t btrfs_extract_ordered_extent(struct btrfs_bio *bbio) ++int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, ++ struct btrfs_ordered_extent *ordered) + { + u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; + u64 len = bbio->bio.bi_iter.bi_size; + struct btrfs_inode *inode = bbio->inode; +- struct btrfs_ordered_extent *ordered; +- u64 file_len; +- u64 end = start + len; +- u64 ordered_end; +- u64 pre, post; ++ u64 ordered_len = ordered->num_bytes; + int ret = 0; + +- ordered = btrfs_lookup_ordered_extent(inode, bbio->file_offset); +- if (WARN_ON_ONCE(!ordered)) +- return BLK_STS_IOERR; ++ /* Must always be called for the beginning of an ordered extent. */ ++ if (WARN_ON_ONCE(start != ordered->disk_bytenr)) ++ return -EINVAL; + +- /* No need to split */ ++ /* No need to split if the ordered extent covers the entire bio. */ + if (ordered->disk_num_bytes == len) +- goto out; +- +- /* We cannot split once end_bio'd ordered extent */ +- if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) { +- ret = -EINVAL; +- goto out; +- } +- +- /* We cannot split a compressed ordered extent */ +- if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) { +- ret = -EINVAL; +- goto out; +- } +- +- ordered_end = ordered->disk_bytenr + ordered->disk_num_bytes; +- /* bio must be in one ordered extent */ +- if (WARN_ON_ONCE(start < ordered->disk_bytenr || end > ordered_end)) { +- ret = -EINVAL; +- goto out; +- } +- +- /* Checksum list should be empty */ +- if (WARN_ON_ONCE(!list_empty(&ordered->list))) { +- ret = -EINVAL; +- goto out; +- } +- +- file_len = ordered->num_bytes; +- pre = start - ordered->disk_bytenr; +- post = ordered_end - end; ++ return 0; + +- ret = btrfs_split_ordered_extent(ordered, pre, post); ++ ret = btrfs_split_ordered_extent(ordered, len); + if (ret) +- goto out; +- ret = split_zoned_em(inode, bbio->file_offset, file_len, pre, post); ++ return ret; + +-out: +- btrfs_put_ordered_extent(ordered); ++ /* ++ * Don't split the extent_map for NOCOW extents, as we're writing into ++ * a pre-existing one. 
++ */ ++ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) ++ return 0; + +- return errno_to_blk_status(ret); ++ return split_extent_map(inode, bbio->file_offset, ordered_len, len); + } + + /* +@@ -3367,13 +3308,6 @@ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + return 0; + } + +-static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 offset) +-{ +- u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; +- +- return csums + offset_in_sectors * fs_info->csum_size; +-} +- + /* + * Verify the checksum of a single data sector. + * +@@ -3411,7 +3345,8 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, + return true; + } + +- csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); ++ csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * ++ fs_info->csum_size; + if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, + csum_expected)) + goto zeroit; +@@ -3691,6 +3626,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); ++ iput(inode); + goto out; + } + btrfs_debug(fs_info, "auto deleting %Lu", +@@ -3698,8 +3634,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) + ret = btrfs_del_orphan_item(trans, root, + found_key.objectid); + btrfs_end_transaction(trans); +- if (ret) ++ if (ret) { ++ iput(inode); + goto out; ++ } + continue; + } + +@@ -4261,15 +4199,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) + { + struct btrfs_root *root = dir->root; + +- /* +- * 1 for the possible orphan item +- * 1 for the dir item +- * 1 for the dir index +- * 1 for the inode ref +- * 1 for the inode +- * 1 for the parent inode +- */ +- return btrfs_start_transaction_fallback_global_rsv(root, 6); ++ return btrfs_start_transaction_fallback_global_rsv(root, ++ BTRFS_UNLINK_METADATA_UNITS); + } + + static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +@@ -5243,7 +5174,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, + { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans; +- u64 delayed_refs_extra = btrfs_calc_insert_metadata_size(fs_info, 1); ++ u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1); + int ret; + + /* +@@ -5281,7 +5212,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, + trans->block_rsv = &fs_info->trans_block_rsv; + trans->bytes_reserved = delayed_refs_extra; + btrfs_block_rsv_migrate(rsv, trans->block_rsv, +- delayed_refs_extra, 1); ++ delayed_refs_extra, true); + } + return trans; + } +@@ -5291,7 +5222,7 @@ void btrfs_evict_inode(struct inode *inode) + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; +- struct btrfs_block_rsv *rsv; ++ struct btrfs_block_rsv *rsv = NULL; + int ret; + + trace_btrfs_inode_evict(inode); +@@ -5308,18 +5239,18 @@ void btrfs_evict_inode(struct inode *inode) + ((btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || + btrfs_is_free_space_inode(BTRFS_I(inode)))) +- goto no_delete; ++ goto out; + + if (is_bad_inode(inode)) +- goto no_delete; ++ goto out; + + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) +- goto no_delete; ++ goto out; + + if (inode->i_nlink > 0) { + BUG_ON(btrfs_root_refs(&root->root_item) != 0 && + root->root_key.objectid != 
BTRFS_ROOT_TREE_OBJECTID); +- goto no_delete; ++ goto out; + } + + /* +@@ -5328,7 +5259,7 @@ void btrfs_evict_inode(struct inode *inode) + */ + ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); + if (ret) +- goto no_delete; ++ goto out; + + /* + * This drops any pending insert or delete operations we have for this +@@ -5340,7 +5271,7 @@ void btrfs_evict_inode(struct inode *inode) + + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); + if (!rsv) +- goto no_delete; ++ goto out; + rsv->size = btrfs_calc_metadata_size(fs_info, 1); + rsv->failfast = true; + +@@ -5356,16 +5287,21 @@ void btrfs_evict_inode(struct inode *inode) + + trans = evict_refill_and_join(root, rsv); + if (IS_ERR(trans)) +- goto free_rsv; ++ goto out; + + trans->block_rsv = rsv; + + ret = btrfs_truncate_inode_items(trans, root, &control); + trans->block_rsv = &fs_info->trans_block_rsv; + btrfs_end_transaction(trans); +- btrfs_btree_balance_dirty(fs_info); ++ /* ++ * We have not added new delayed items for our inode after we ++ * have flushed its delayed items, so no need to throttle on ++ * delayed items. However we have modified extent buffers. ++ */ ++ btrfs_btree_balance_dirty_nodelay(fs_info); + if (ret && ret != -ENOSPC && ret != -EAGAIN) +- goto free_rsv; ++ goto out; + else if (!ret) + break; + } +@@ -5387,9 +5323,8 @@ void btrfs_evict_inode(struct inode *inode) + btrfs_end_transaction(trans); + } + +-free_rsv: ++out: + btrfs_free_block_rsv(fs_info, rsv); +-no_delete: + /* + * If we didn't successfully delete, the orphan item will still be in + * the tree and we'll retry on the next mount. Again, we might also want +@@ -6981,6 +6916,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + } + + static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, ++ struct btrfs_dio_data *dio_data, + const u64 start, + const u64 len, + const u64 orig_start, +@@ -6991,7 +6927,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + const int type) + { + struct extent_map *em = NULL; +- int ret; ++ struct btrfs_ordered_extent *ordered; + + if (type != BTRFS_ORDERED_NOCOW) { + em = create_io_em(inode, start, len, orig_start, block_start, +@@ -7001,18 +6937,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + if (IS_ERR(em)) + goto out; + } +- ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, +- block_len, 0, +- (1 << type) | +- (1 << BTRFS_ORDERED_DIRECT), +- BTRFS_COMPRESS_NONE); +- if (ret) { ++ ordered = btrfs_alloc_ordered_extent(inode, start, len, len, ++ block_start, block_len, 0, ++ (1 << type) | ++ (1 << BTRFS_ORDERED_DIRECT), ++ BTRFS_COMPRESS_NONE); ++ if (IS_ERR(ordered)) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_map_range(inode, start, + start + len - 1, false); + } +- em = ERR_PTR(ret); ++ em = ERR_CAST(ordered); ++ } else { ++ ASSERT(!dio_data->ordered); ++ dio_data->ordered = ordered; + } + out: + +@@ -7020,6 +6959,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + } + + static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, ++ struct btrfs_dio_data *dio_data, + u64 start, u64 len) + { + struct btrfs_root *root = inode->root; +@@ -7035,7 +6975,7 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, + if (ret) + return ERR_PTR(ret); + +- em = btrfs_create_dio_extent(inode, start, ins.offset, start, ++ em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, + ins.objectid, ins.offset, 
ins.offset, + ins.offset, BTRFS_ORDERED_REGULAR); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); +@@ -7380,7 +7320,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + } + space_reserved = true; + +- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, ++ em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); +@@ -7422,7 +7362,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + goto out; + space_reserved = true; + +- em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); ++ em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; +@@ -7728,6 +7668,10 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + pos + length - 1, NULL); + ret = -ENOTBLK; + } ++ if (write) { ++ btrfs_put_ordered_extent(dio_data->ordered); ++ dio_data->ordered = NULL; ++ } + + if (write) + extent_changeset_free(dio_data->data_reserved); +@@ -7767,14 +7711,34 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, + container_of(bbio, struct btrfs_dio_private, bbio); + struct btrfs_dio_data *dio_data = iter->private; + +- btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); ++ btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, ++ btrfs_dio_end_io, bio->bi_private); ++ bbio->inode = BTRFS_I(iter->inode); + bbio->file_offset = file_offset; + + dip->file_offset = file_offset; + dip->bytes = bio->bi_iter.bi_size; + + dio_data->submitted += bio->bi_iter.bi_size; +- btrfs_submit_bio(bio, 0); ++ ++ /* ++ * Check if we are doing a partial write. If we are, we need to split ++ * the ordered extent to match the submitted bio. Hang on to the ++ * remaining unfinishable ordered_extent in dio_data so that it can be ++ * cancelled in iomap_end to avoid a deadlock wherein faulting the ++ * remaining pages is blocked on the outstanding ordered extent. 
++ */ ++ if (iter->flags & IOMAP_WRITE) { ++ int ret; ++ ++ ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); ++ if (ret) { ++ btrfs_bio_end_io(bbio, errno_to_blk_status(ret)); ++ return; ++ } ++ } ++ ++ btrfs_submit_bio(bbio, 0); + } + + static const struct iomap_ops btrfs_dio_iomap_ops = { +@@ -7789,7 +7753,7 @@ static const struct iomap_dio_ops btrfs_dio_ops = { + + ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) + { +- struct btrfs_dio_data data; ++ struct btrfs_dio_data data = { 0 }; + + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +@@ -7798,7 +7762,7 @@ ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_be + struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before) + { +- struct btrfs_dio_data data; ++ struct btrfs_dio_data data = { 0 }; + + return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); +@@ -9908,8 +9872,6 @@ static ssize_t btrfs_encoded_read_inline( + } + + struct btrfs_encoded_read_private { +- struct btrfs_inode *inode; +- u64 file_offset; + wait_queue_head_t wait; + atomic_t pending; + blk_status_t status; +@@ -9939,45 +9901,41 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, struct page **pages) + { ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct btrfs_encoded_read_private priv = { +- .inode = inode, +- .file_offset = file_offset, + .pending = ATOMIC_INIT(1), + }; + unsigned long i = 0; +- u64 cur = 0; ++ struct btrfs_bio *bbio; + + init_waitqueue_head(&priv.wait); +- /* Submit bios for the extent, splitting due to bio limits as necessary. 
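The deleted comment right above described the old loop; the replacement below still batches pages into bios and submits whenever a bio fills up. A standalone sketch of just that batching pattern, with an invented BATCH_CAP standing in for whatever bio_add_page() would accept:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ         4096
#define BATCH_CAP       (4 * PAGE_SZ)   /* pretend capacity of one bio (assumption) */

struct batch {
        uint32_t used;
        int submitted;
};

static void submit(struct batch *b)
{
        printf("submit bio with %u bytes\n", b->used);
        b->submitted++;
        b->used = 0;
}

int main(void)
{
        struct batch b = { 0 };
        uint64_t remaining = 9 * PAGE_SZ;       /* plays the role of disk_io_size */

        while (remaining) {
                uint32_t bytes = remaining < PAGE_SZ ? remaining : PAGE_SZ;

                if (b.used + bytes > BATCH_CAP) /* models bio_add_page() refusing a full bio */
                        submit(&b);
                b.used += bytes;
                remaining -= bytes;
        }
        if (b.used)
                submit(&b);                     /* flush the final partial bio */
        printf("%d bios submitted\n", b.submitted);
        return 0;
}

The rewritten kernel loop additionally bumps a pending counter for every submission and waits on it afterwards; this sketch leaves the completion handling out.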
*/ +- while (cur < disk_io_size) { +- struct bio *bio = NULL; +- u64 remaining = disk_io_size - cur; +- +- while (bio || remaining) { +- size_t bytes = min_t(u64, remaining, PAGE_SIZE); +- +- if (!bio) { +- bio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, +- inode, +- btrfs_encoded_read_endio, +- &priv); +- bio->bi_iter.bi_sector = +- (disk_bytenr + cur) >> SECTOR_SHIFT; +- } + +- if (!bytes || +- bio_add_page(bio, pages[i], bytes, 0) < bytes) { +- atomic_inc(&priv.pending); +- btrfs_submit_bio(bio, 0); +- bio = NULL; +- continue; +- } ++ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, ++ btrfs_encoded_read_endio, &priv); ++ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; ++ bbio->inode = inode; + +- i++; +- cur += bytes; +- remaining -= bytes; ++ do { ++ size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); ++ ++ if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { ++ atomic_inc(&priv.pending); ++ btrfs_submit_bio(bbio, 0); ++ ++ bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, ++ btrfs_encoded_read_endio, &priv); ++ bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; ++ bbio->inode = inode; ++ continue; + } +- } ++ ++ i++; ++ disk_bytenr += bytes; ++ disk_io_size -= bytes; ++ } while (disk_io_size); ++ ++ atomic_inc(&priv.pending); ++ btrfs_submit_bio(bbio, 0); + + if (atomic_dec_return(&priv.pending)) + io_wait_event(priv.wait, !atomic_read(&priv.pending)); +@@ -10398,13 +10356,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + + btrfs_delalloc_release_extents(inode, num_bytes); + +- if (btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, +- ins.offset, pages, nr_pages, 0, NULL, +- false)) { +- btrfs_writepage_endio_finish_ordered(inode, pages[0], start, end, 0); +- ret = -EIO; +- goto out_pages; +- } ++ btrfs_submit_compressed_write(inode, start, num_bytes, ins.objectid, ++ ins.offset, pages, nr_pages, 0, false); + ret = orig_count; + goto out; + +diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c +index ba769a1eb87a..25833b4eeaf5 100644 +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -3161,6 +3161,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) + if (IS_ERR(sa)) + return PTR_ERR(sa); + ++ if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) { ++ ret = -EOPNOTSUPP; ++ goto out; ++ } ++ + if (!(sa->flags & BTRFS_SCRUB_READONLY)) { + ret = mnt_want_write_file(file); + if (ret) +diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c +index 870528d87526..3a496b0d3d2b 100644 +--- a/fs/btrfs/locking.c ++++ b/fs/btrfs/locking.c +@@ -325,24 +325,12 @@ struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root) + * acquire the lock. 
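The locking.c hunks that follow swap the drew lock's percpu_counter writer count for a plain atomic_t, so the barrier between bumping writers and re-checking readers becomes explicit. Here is a rough userspace model of the resulting try-write-lock using C11 atomics; it is only a sketch, and atomic_thread_fence() stands in for the kernel's smp_mb__after_atomic().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative userspace analogue, not the kernel structure. */
struct drew_lock {
        atomic_int readers;
        atomic_int writers;
};

static bool drew_try_write_lock(struct drew_lock *lock)
{
        if (atomic_load(&lock->readers))
                return false;
        atomic_fetch_add(&lock->writers, 1);
        /* The writer count must be visible before readers is re-checked. */
        atomic_thread_fence(memory_order_seq_cst);
        if (atomic_load(&lock->readers)) {
                atomic_fetch_sub(&lock->writers, 1);    /* lost the race, back off */
                return false;
        }
        return true;
}

static void drew_write_unlock(struct drew_lock *lock)
{
        atomic_fetch_sub(&lock->writers, 1);
}

int main(void)
{
        struct drew_lock lock = { 0 };

        printf("write lock acquired: %d\n", drew_try_write_lock(&lock));
        drew_write_unlock(&lock);
        return 0;
}

Dropping the percpu counter also removes the allocation it needed, which is why btrfs_drew_lock_init() can stop returning an error and btrfs_drew_lock_destroy() disappears entirely.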
+ */ + +-int btrfs_drew_lock_init(struct btrfs_drew_lock *lock) ++void btrfs_drew_lock_init(struct btrfs_drew_lock *lock) + { +- int ret; +- +- ret = percpu_counter_init(&lock->writers, 0, GFP_KERNEL); +- if (ret) +- return ret; +- + atomic_set(&lock->readers, 0); ++ atomic_set(&lock->writers, 0); + init_waitqueue_head(&lock->pending_readers); + init_waitqueue_head(&lock->pending_writers); +- +- return 0; +-} +- +-void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock) +-{ +- percpu_counter_destroy(&lock->writers); + } + + /* Return true if acquisition is successful, false otherwise */ +@@ -351,10 +339,10 @@ bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock) + if (atomic_read(&lock->readers)) + return false; + +- percpu_counter_inc(&lock->writers); ++ atomic_inc(&lock->writers); + + /* Ensure writers count is updated before we check for pending readers */ +- smp_mb(); ++ smp_mb__after_atomic(); + if (atomic_read(&lock->readers)) { + btrfs_drew_write_unlock(lock); + return false; +@@ -374,7 +362,7 @@ void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) + + void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) + { +- percpu_counter_dec(&lock->writers); ++ atomic_dec(&lock->writers); + cond_wake_up(&lock->pending_readers); + } + +@@ -390,8 +378,7 @@ void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) + */ + smp_mb__after_atomic(); + +- wait_event(lock->pending_readers, +- percpu_counter_sum(&lock->writers) == 0); ++ wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0); + } + + void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock) +diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h +index 11c2269b4b6f..edb9b4a0dba1 100644 +--- a/fs/btrfs/locking.h ++++ b/fs/btrfs/locking.h +@@ -195,13 +195,12 @@ static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) + + struct btrfs_drew_lock { + atomic_t readers; +- struct percpu_counter writers; ++ atomic_t writers; + wait_queue_head_t pending_writers; + wait_queue_head_t pending_readers; + }; + +-int btrfs_drew_lock_init(struct btrfs_drew_lock *lock); +-void btrfs_drew_lock_destroy(struct btrfs_drew_lock *lock); ++void btrfs_drew_lock_init(struct btrfs_drew_lock *lock); + void btrfs_drew_write_lock(struct btrfs_drew_lock *lock); + bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock); + void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); +diff --git a/fs/btrfs/lru_cache.h b/fs/btrfs/lru_cache.h +index de3e18bce24a..00328c856be6 100644 +--- a/fs/btrfs/lru_cache.h ++++ b/fs/btrfs/lru_cache.h +@@ -55,11 +55,6 @@ static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *ca + return cache->size; + } + +-static inline bool btrfs_lru_cache_is_full(const struct btrfs_lru_cache *cache) +-{ +- return cache->size >= cache->max_size; +-} +- + static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry( + struct btrfs_lru_cache *cache) + { +diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c +index 71f6d8302d50..3a095b9c6373 100644 +--- a/fs/btrfs/lzo.c ++++ b/fs/btrfs/lzo.c +@@ -17,6 +17,7 @@ + #include "compression.h" + #include "ctree.h" + #include "super.h" ++#include "btrfs_inode.h" + + #define LZO_LEN 4 + +@@ -329,7 +330,7 @@ static void copy_compressed_segment(struct compressed_bio *cb, + int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + { + struct workspace *workspace = list_entry(ws, struct workspace, list); +- const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); ++ const struct btrfs_fs_info *fs_info = 
cb->bbio.inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + char *kaddr; + int ret; +@@ -388,8 +389,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + */ + btrfs_err(fs_info, "unexpectedly large lzo segment len %u", + seg_len); +- ret = -EIO; +- goto out; ++ return -EIO; + } + + /* Copy the compressed segment payload into workspace */ +@@ -400,8 +400,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + workspace->buf, &out_len); + if (ret != LZO_E_OK) { + btrfs_err(fs_info, "failed to decompress"); +- ret = -EIO; +- goto out; ++ return -EIO; + } + + /* Copy the data into inode pages */ +@@ -410,7 +409,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + + /* All data read, exit */ + if (ret == 0) +- goto out; ++ return 0; + ret = 0; + + /* Check if the sector has enough space for a segment header */ +@@ -421,10 +420,8 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + /* Skip the padding zeros */ + cur_in += sector_bytes_left; + } +-out: +- if (!ret) +- zero_fill_bio(cb->orig_bio); +- return ret; ++ ++ return 0; + } + + int lzo_decompress(struct list_head *ws, const u8 *data_in, +diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c +index fde5aaa6e7c9..310a05cf95ef 100644 +--- a/fs/btrfs/messages.c ++++ b/fs/btrfs/messages.c +@@ -253,7 +253,7 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, + #endif + + #ifdef CONFIG_BTRFS_ASSERT +-void __cold btrfs_assertfail(const char *expr, const char *file, int line) ++void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line) + { + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); +diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h +index 8c516ee58ff9..ac2d1982ba3d 100644 +--- a/fs/btrfs/messages.h ++++ b/fs/btrfs/messages.h +@@ -160,7 +160,7 @@ do { \ + } while (0) + + #ifdef CONFIG_BTRFS_ASSERT +-void __cold btrfs_assertfail(const char *expr, const char *file, int line); ++void __cold __noreturn btrfs_assertfail(const char *expr, const char *file, int line); + + #define ASSERT(expr) \ + (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c +index 6c24b69e2d0a..a9778a91511e 100644 +--- a/fs/btrfs/ordered-data.c ++++ b/fs/btrfs/ordered-data.c +@@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + * @compress_type: Compression algorithm used for data. + * + * Most of these parameters correspond to &struct btrfs_file_extent_item. The +- * tree is given a single reference on the ordered extent that was inserted. ++ * tree is given a single reference on the ordered extent that was inserted, and ++ * the returned pointer is given a second reference. + * +- * Return: 0 or -ENOMEM. ++ * Return: the new ordered extent or error pointer. 
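The updated comment above spells out the new contract: btrfs_alloc_ordered_extent() returns a pointer that carries its own reference on top of the one owned by the inode's ordered tree. A toy userspace model of that two-reference ownership, with hypothetical names and a plain int in place of refcount_t:

#include <stdio.h>
#include <stdlib.h>

struct ordered {
        int refs;
};

/* Allocate with one reference for the tree and one handed to the caller. */
static struct ordered *alloc_ordered(void)
{
        struct ordered *o = calloc(1, sizeof(*o));

        if (!o)
                return NULL;
        o->refs = 1;    /* reference owned by the ordered tree */
        o->refs++;      /* extra reference returned to the caller */
        return o;
}

static void put_ordered(struct ordered *o)
{
        if (--o->refs == 0)
                free(o);
}

int main(void)
{
        struct ordered *o = alloc_ordered();

        if (!o)
                return 1;
        put_ordered(o);         /* caller is done; the tree reference keeps it alive */
        printf("refs left: %d\n", o->refs);     /* prints 1 */
        put_ordered(o);         /* models the tree dropping it at completion */
        return 0;
}

The btrfs_add_ordered_extent() wrapper added further down keeps the old fire-and-forget behaviour by dropping that extra reference immediately.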
+ */ +-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, +- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, +- u64 disk_num_bytes, u64 offset, unsigned flags, +- int compress_type) ++struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( ++ struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type) + { + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; +@@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + /* For nocow write, we can release the qgroup rsv right now */ + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); + if (ret < 0) +- return ret; ++ return ERR_PTR(ret); + ret = 0; + } else { + /* +@@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + */ + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + if (ret < 0) +- return ret; ++ return ERR_PTR(ret); + } + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); + if (!entry) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + entry->file_offset = file_offset; + entry->num_bytes = num_bytes; +@@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); + ++ /* One ref for the returned entry to match semantics of lookup. */ ++ refcount_inc(&entry->refs); ++ ++ return entry; ++} ++ ++/* ++ * Add a new btrfs_ordered_extent for the range, but drop the reference instead ++ * of returning it to the caller. ++ */ ++int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type) ++{ ++ struct btrfs_ordered_extent *ordered; ++ ++ ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, ++ ram_bytes, disk_bytenr, ++ disk_num_bytes, offset, flags, ++ compress_type); ++ ++ if (IS_ERR(ordered)) ++ return PTR_ERR(ordered); ++ btrfs_put_ordered_extent(ordered); ++ + return 0; + } + +@@ -1088,39 +1116,37 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + return false; + } + +- +-static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, +- u64 len) +-{ +- struct inode *inode = ordered->inode; +- struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; +- u64 file_offset = ordered->file_offset + pos; +- u64 disk_bytenr = ordered->disk_bytenr + pos; +- unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; +- +- /* +- * The splitting extent is already counted and will be added again in +- * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting. +- */ +- percpu_counter_add_batch(&fs_info->ordered_bytes, -len, +- fs_info->delalloc_batch); +- WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED)); +- return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, +- disk_bytenr, len, 0, flags, +- ordered->compress_type); +-} +- +-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, +- u64 post) ++/* Split out a new ordered extent for this first @len bytes of @ordered. 
*/ ++int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len) + { + struct inode *inode = ordered->inode; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; +- struct rb_node *node; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); +- int ret = 0; ++ u64 file_offset = ordered->file_offset; ++ u64 disk_bytenr = ordered->disk_bytenr; ++ unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS; ++ struct rb_node *node; + + trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + ++ ASSERT(!(flags & (1U << BTRFS_ORDERED_COMPRESSED))); ++ ++ /* ++ * The entire bio must be covered by the ordered extent, but we can't ++ * reduce the original extent to a zero length either. ++ */ ++ if (WARN_ON_ONCE(len >= ordered->num_bytes)) ++ return -EINVAL; ++ /* We cannot split once ordered extent is past end_bio. */ ++ if (WARN_ON_ONCE(ordered->bytes_left != ordered->disk_num_bytes)) ++ return -EINVAL; ++ /* We cannot split a compressed ordered extent. */ ++ if (WARN_ON_ONCE(ordered->disk_num_bytes != ordered->num_bytes)) ++ return -EINVAL; ++ /* Checksum list should be empty. */ ++ if (WARN_ON_ONCE(!list_empty(&ordered->list))) ++ return -EINVAL; ++ + spin_lock_irq(&tree->lock); + /* Remove from tree once */ + node = &ordered->rb_node; +@@ -1129,11 +1155,11 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + if (tree->last == node) + tree->last = NULL; + +- ordered->file_offset += pre; +- ordered->disk_bytenr += pre; +- ordered->num_bytes -= (pre + post); +- ordered->disk_num_bytes -= (pre + post); +- ordered->bytes_left -= (pre + post); ++ ordered->file_offset += len; ++ ordered->disk_bytenr += len; ++ ordered->num_bytes -= len; ++ ordered->disk_num_bytes -= len; ++ ordered->bytes_left -= len; + + /* Re-insert the node */ + node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); +@@ -1144,13 +1170,15 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, + + spin_unlock_irq(&tree->lock); + +- if (pre) +- ret = clone_ordered_extent(ordered, 0, pre); +- if (ret == 0 && post) +- ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, +- post); ++ /* ++ * The splitting extent is already counted and will be added again in ++ * btrfs_add_ordered_extent(). Subtract len to avoid double counting. 
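The rewritten btrfs_split_ordered_extent() above only ever peels bytes off the front: the original extent is advanced by @len in place and a new ordered extent is created for the first @len bytes, with the accounting adjusted as the comment above notes. A standalone sketch of just that arithmetic, using a hypothetical flattened struct and none of the tree or accounting work:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct oe {
        uint64_t file_offset;
        uint64_t disk_bytenr;
        uint64_t num_bytes;
};

/* Peel the first @len bytes off @o into @front, shrinking @o in place. */
static void split_front(struct oe *o, struct oe *front, uint64_t len)
{
        assert(len < o->num_bytes);     /* never shrink @o to zero length */
        front->file_offset = o->file_offset;
        front->disk_bytenr = o->disk_bytenr;
        front->num_bytes = len;
        o->file_offset += len;
        o->disk_bytenr += len;
        o->num_bytes -= len;
}

int main(void)
{
        struct oe o = { 0, 1 << 20, 128 << 10 };
        struct oe front;

        split_front(&o, &front, 16 << 10);
        printf("front: disk %llu len %llu, rest: disk %llu len %llu\n",
               (unsigned long long)front.disk_bytenr,
               (unsigned long long)front.num_bytes,
               (unsigned long long)o.disk_bytenr,
               (unsigned long long)o.num_bytes);
        return 0;
}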
++ */ ++ percpu_counter_add_batch(&fs_info->ordered_bytes, -len, fs_info->delalloc_batch); + +- return ret; ++ return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len, ++ disk_bytenr, len, 0, flags, ++ ordered->compress_type); + } + + int __init ordered_data_init(void) +diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h +index eb40cb39f842..f0f1138d23c3 100644 +--- a/fs/btrfs/ordered-data.h ++++ b/fs/btrfs/ordered-data.h +@@ -178,9 +178,14 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size); ++struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( ++ struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type); + int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, +- u64 disk_num_bytes, u64 offset, unsigned flags, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type); + void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); +@@ -207,8 +212,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, + struct extent_state **cached_state); + bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state); +-int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, +- u64 post); ++int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 len); + int __init ordered_data_init(void); + void __cold ordered_data_exit(void); + +diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c +index 642828c1b299..2fab37f062de 100644 +--- a/fs/btrfs/raid56.c ++++ b/fs/btrfs/raid56.c +@@ -202,7 +202,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) + */ + static int rbio_bucket(struct btrfs_raid_bio *rbio) + { +- u64 num = rbio->bioc->raid_map[0]; ++ u64 num = rbio->bioc->full_stripe_logical; + + /* + * we shift down quite a bit. 
We're using byte +@@ -407,16 +407,15 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) + static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) + { + struct btrfs_stripe_hash_table *table; +- unsigned long flags; + + if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) + return; + + table = rbio->bioc->fs_info->stripe_hash_table; + +- spin_lock_irqsave(&table->cache_lock, flags); ++ spin_lock(&table->cache_lock); + __remove_rbio_from_cache(rbio); +- spin_unlock_irqrestore(&table->cache_lock, flags); ++ spin_unlock(&table->cache_lock); + } + + /* +@@ -425,19 +424,18 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) + static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) + { + struct btrfs_stripe_hash_table *table; +- unsigned long flags; + struct btrfs_raid_bio *rbio; + + table = info->stripe_hash_table; + +- spin_lock_irqsave(&table->cache_lock, flags); ++ spin_lock(&table->cache_lock); + while (!list_empty(&table->stripe_cache)) { + rbio = list_entry(table->stripe_cache.next, + struct btrfs_raid_bio, + stripe_cache); + __remove_rbio_from_cache(rbio); + } +- spin_unlock_irqrestore(&table->cache_lock, flags); ++ spin_unlock(&table->cache_lock); + } + + /* +@@ -467,14 +465,13 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) + static void cache_rbio(struct btrfs_raid_bio *rbio) + { + struct btrfs_stripe_hash_table *table; +- unsigned long flags; + + if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) + return; + + table = rbio->bioc->fs_info->stripe_hash_table; + +- spin_lock_irqsave(&table->cache_lock, flags); ++ spin_lock(&table->cache_lock); + spin_lock(&rbio->bio_list_lock); + + /* bump our ref if we were not in the list before */ +@@ -501,7 +498,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) + __remove_rbio_from_cache(found); + } + +- spin_unlock_irqrestore(&table->cache_lock, flags); ++ spin_unlock(&table->cache_lock); + } + + /* +@@ -530,15 +527,14 @@ static void run_xor(void **pages, int src_cnt, ssize_t len) + */ + static int rbio_is_full(struct btrfs_raid_bio *rbio) + { +- unsigned long flags; + unsigned long size = rbio->bio_list_bytes; + int ret = 1; + +- spin_lock_irqsave(&rbio->bio_list_lock, flags); ++ spin_lock(&rbio->bio_list_lock); + if (size != rbio->nr_data * BTRFS_STRIPE_LEN) + ret = 0; + BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); +- spin_unlock_irqrestore(&rbio->bio_list_lock, flags); ++ spin_unlock(&rbio->bio_list_lock); + + return ret; + } +@@ -571,7 +567,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, + test_bit(RBIO_CACHE_BIT, &cur->flags)) + return 0; + +- if (last->bioc->raid_map[0] != cur->bioc->raid_map[0]) ++ if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) + return 0; + + /* we can't merge with different operations */ +@@ -657,16 +653,15 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) + struct btrfs_stripe_hash *h; + struct btrfs_raid_bio *cur; + struct btrfs_raid_bio *pending; +- unsigned long flags; + struct btrfs_raid_bio *freeit = NULL; + struct btrfs_raid_bio *cache_drop = NULL; + int ret = 0; + + h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); + +- spin_lock_irqsave(&h->lock, flags); ++ spin_lock(&h->lock); + list_for_each_entry(cur, &h->hash_list, hash_list) { +- if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0]) ++ if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) + continue; + + spin_lock(&cur->bio_list_lock); +@@ -724,7 +719,7 @@ static noinline int lock_stripe_add(struct 
btrfs_raid_bio *rbio) + refcount_inc(&rbio->refs); + list_add(&rbio->hash_list, &h->hash_list); + out: +- spin_unlock_irqrestore(&h->lock, flags); ++ spin_unlock(&h->lock); + if (cache_drop) + remove_rbio_from_cache(cache_drop); + if (freeit) +@@ -742,7 +737,6 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) + { + int bucket; + struct btrfs_stripe_hash *h; +- unsigned long flags; + int keep_cache = 0; + + bucket = rbio_bucket(rbio); +@@ -751,7 +745,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) + if (list_empty(&rbio->plug_list)) + cache_rbio(rbio); + +- spin_lock_irqsave(&h->lock, flags); ++ spin_lock(&h->lock); + spin_lock(&rbio->bio_list_lock); + + if (!list_empty(&rbio->hash_list)) { +@@ -788,7 +782,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) + list_add(&next->hash_list, &h->hash_list); + refcount_inc(&next->refs); + spin_unlock(&rbio->bio_list_lock); +- spin_unlock_irqrestore(&h->lock, flags); ++ spin_unlock(&h->lock); + + if (next->operation == BTRFS_RBIO_READ_REBUILD) + start_async_work(next, recover_rbio_work_locked); +@@ -808,7 +802,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) + } + done: + spin_unlock(&rbio->bio_list_lock); +- spin_unlock_irqrestore(&h->lock, flags); ++ spin_unlock(&h->lock); + + done_nolock: + if (!keep_cache) +@@ -891,16 +885,16 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, + index = stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT(index >= 0 && index < rbio->nr_sectors); + +- spin_lock_irq(&rbio->bio_list_lock); ++ spin_lock(&rbio->bio_list_lock); + sector = &rbio->bio_sectors[index]; + if (sector->page || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (!sector->page) + sector = NULL; +- spin_unlock_irq(&rbio->bio_list_lock); ++ spin_unlock(&rbio->bio_list_lock); + return sector; + } +- spin_unlock_irq(&rbio->bio_list_lock); ++ spin_unlock(&rbio->bio_list_lock); + + return &rbio->stripe_sectors[index]; + } +@@ -912,7 +906,7 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, + static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, + struct btrfs_io_context *bioc) + { +- const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; ++ const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; + const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; + const unsigned int num_pages = stripe_npages * real_stripes; + const unsigned int stripe_nsectors = +@@ -1108,7 +1102,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, + bio->bi_iter.bi_sector = disk_start >> 9; + bio->bi_private = rbio; + +- bio_add_page(bio, sector->page, sectorsize, sector->pgoff); ++ __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); + bio_list_add(bio_list, bio); + return 0; + } +@@ -1119,7 +1113,7 @@ static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) + struct bio_vec bvec; + struct bvec_iter iter; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - +- rbio->bioc->raid_map[0]; ++ rbio->bioc->full_stripe_logical; + + bio_for_each_segment(bvec, bio, iter) { + u32 bvec_offset; +@@ -1148,11 +1142,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) + { + struct bio *bio; + +- spin_lock_irq(&rbio->bio_list_lock); ++ spin_lock(&rbio->bio_list_lock); + bio_list_for_each(bio, &rbio->bio_list) + index_one_bio(rbio, bio); + +- spin_unlock_irq(&rbio->bio_list_lock); ++ spin_unlock(&rbio->bio_list_lock); + } + 
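index_one_bio() above now anchors its offset arithmetic on bioc->full_stripe_logical instead of raid_map[0]. A small standalone example of that logical-to-(stripe, sector) mapping, with a 64KiB stripe length and 4KiB sectors assumed purely for the demo:

#include <stdint.h>
#include <stdio.h>

#define STRIPE_LEN      (64 * 1024)     /* assumed, mirrors BTRFS_STRIPE_LEN */
#define SECTORSIZE      4096            /* assumed sector size */

/* Map a logical byte address inside a full stripe to (stripe, sector). */
static void index_logical(uint64_t full_stripe_logical, uint64_t logical,
                          uint32_t *stripe_nr, uint32_t *sector_nr)
{
        uint64_t offset = logical - full_stripe_logical;

        *stripe_nr = offset / STRIPE_LEN;
        *sector_nr = (offset % STRIPE_LEN) / SECTORSIZE;
}

int main(void)
{
        uint32_t stripe, sector;

        index_logical(0, 64 * 1024 + 8192, &stripe, &sector);
        printf("stripe %u sector %u\n", stripe, sector);        /* stripe 1 sector 2 */
        return 0;
}

Since full_stripe_logical is by definition the first byte of the full stripe, the raid_map array is no longer needed just to recover that base address.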
+ static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, +@@ -1282,10 +1276,16 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + goto error; + } + +- if (likely(!rbio->bioc->num_tgtdevs)) ++ if (likely(!rbio->bioc->replace_nr_stripes)) + return 0; + +- /* Make a copy for the replace target device. */ ++ /* ++ * Make a copy for the replace target device. ++ * ++ * Thus the source stripe number (in replace_stripe_src) should be valid. ++ */ ++ ASSERT(rbio->bioc->replace_stripe_src >= 0); ++ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; +@@ -1293,7 +1293,12 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; + +- if (!rbio->bioc->tgtdev_map[stripe]) { ++ /* ++ * For RAID56, there is only one device that can be replaced, ++ * and replace_stripe_src[0] indicates the stripe number we ++ * need to copy from. ++ */ ++ if (stripe != rbio->bioc->replace_stripe_src) { + /* + * We can skip the whole stripe completely, note + * total_sector_nr will be increased by one anyway. +@@ -1316,7 +1321,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + } + + ret = rbio_add_io_sector(rbio, bio_list, sector, +- rbio->bioc->tgtdev_map[stripe], ++ rbio->real_stripes, + sectornr, REQ_OP_WRITE); + if (ret) + goto error; +@@ -1332,7 +1337,7 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) + { + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - +- rbio->bioc->raid_map[0]; ++ rbio->bioc->full_stripe_logical; + int total_nr_sector = offset >> fs_info->sectorsize_bits; + + ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); +@@ -1609,7 +1614,7 @@ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) + { + const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; +- const u64 full_stripe_start = rbio->bioc->raid_map[0]; ++ const u64 full_stripe_start = rbio->bioc->full_stripe_logical; + const u32 orig_len = orig_bio->bi_iter.bi_size; + const u32 sectorsize = fs_info->sectorsize; + u64 cur_logical; +@@ -1796,9 +1801,8 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + * here due to a crc mismatch and we can't give them the + * data they want. + */ +- if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { +- if (rbio->bioc->raid_map[faila] == +- RAID5_P_STRIPE) ++ if (failb == rbio->real_stripes - 1) { ++ if (faila == rbio->real_stripes - 2) + /* + * Only P and Q are corrupted. 
+ * We only care about data stripes recovery, +@@ -1812,7 +1816,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + goto pstripe; + } + +- if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { ++ if (failb == rbio->real_stripes - 2) { + raid6_datap_recov(rbio->real_stripes, sectorsize, + faila, pointers); + } else { +@@ -1895,9 +1899,9 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) + + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { +- spin_lock_irq(&rbio->bio_list_lock); ++ spin_lock(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); +- spin_unlock_irq(&rbio->bio_list_lock); ++ spin_unlock(&rbio->bio_list_lock); + } + + index_rbio_pages(rbio); +@@ -2075,8 +2079,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) + { + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, +- rbio->bioc->raid_map[0]); +- const u64 start = rbio->bioc->raid_map[0]; ++ rbio->bioc->full_stripe_logical); ++ const u64 start = rbio->bioc->full_stripe_logical; + const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << + fs_info->sectorsize_bits; + int ret; +@@ -2109,7 +2113,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) + } + + ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, +- rbio->csum_buf, rbio->csum_bitmap); ++ rbio->csum_buf, rbio->csum_bitmap, false); + if (ret < 0) + goto error; + if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) +@@ -2124,7 +2128,7 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) + */ + btrfs_warn_rl(fs_info, + "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", +- rbio->bioc->raid_map[0], ret); ++ rbio->bioc->full_stripe_logical, ret); + no_csum: + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); +@@ -2265,9 +2269,9 @@ static void rmw_rbio(struct btrfs_raid_bio *rbio) + * bio list any more, anyone else that wants to change this stripe + * needs to do their own rmw. + */ +- spin_lock_irq(&rbio->bio_list_lock); ++ spin_lock(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); +- spin_unlock_irq(&rbio->bio_list_lock); ++ spin_unlock(&rbio->bio_list_lock); + + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + +@@ -2372,23 +2376,6 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + return rbio; + } + +-/* Used for both parity scrub and missing. */ +-void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, +- unsigned int pgoff, u64 logical) +-{ +- const u32 sectorsize = rbio->bioc->fs_info->sectorsize; +- int stripe_offset; +- int index; +- +- ASSERT(logical >= rbio->bioc->raid_map[0]); +- ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + +- BTRFS_STRIPE_LEN * rbio->nr_data); +- stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); +- index = stripe_offset / sectorsize; +- rbio->bio_sectors[index].page = page; +- rbio->bio_sectors[index].pgoff = pgoff; +-} +- + /* + * We just scrub the parity that we have correct data on the same horizontal, + * so we needn't allocate all pages for all the stripes. +@@ -2442,7 +2429,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) + else + BUG(); + +- if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { ++ /* ++ * Replace is running and our P/Q stripe is being replaced, then we ++ * need to duplicate the final write to replace target. 
++ */ ++ if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { + is_replace = 1; + bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); + } +@@ -2544,13 +2535,18 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) + if (!is_replace) + goto submit_write; + ++ /* ++ * Replace is running and our parity stripe needs to be duplicated to ++ * the target device. Check we have a valid source stripe number. ++ */ ++ ASSERT(rbio->bioc->replace_stripe_src >= 0); + for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; + + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, +- bioc->tgtdev_map[rbio->scrubp], +- sectornr, REQ_OP_WRITE); ++ rbio->real_stripes, ++ sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; + } +@@ -2751,33 +2747,3 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) + if (!lock_stripe_add(rbio)) + start_async_work(rbio, scrub_rbio_work_locked); + } +- +-/* The following code is used for dev replace of a missing RAID 5/6 device. */ +- +-struct btrfs_raid_bio * +-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) +-{ +- struct btrfs_fs_info *fs_info = bioc->fs_info; +- struct btrfs_raid_bio *rbio; +- +- rbio = alloc_rbio(fs_info, bioc); +- if (IS_ERR(rbio)) +- return NULL; +- +- rbio->operation = BTRFS_RBIO_REBUILD_MISSING; +- bio_list_add(&rbio->bio_list, bio); +- /* +- * This is a special bio which is used to hold the completion handler +- * and make the scrub rbio is similar to the other types +- */ +- ASSERT(!bio->bi_iter.bi_size); +- +- set_rbio_range_error(rbio, bio); +- +- return rbio; +-} +- +-void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) +-{ +- start_async_work(rbio, recover_rbio_work); +-} +diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h +index df0e0abdeb1f..0f7f31c8cb98 100644 +--- a/fs/btrfs/raid56.h ++++ b/fs/btrfs/raid56.h +@@ -170,6 +170,11 @@ static inline int nr_data_stripes(const struct map_lookup *map) + return map->num_stripes - btrfs_nr_parity_stripes(map->type); + } + ++static inline int nr_bioc_data_stripes(const struct btrfs_io_context *bioc) ++{ ++ return bioc->num_stripes - btrfs_nr_parity_stripes(bioc->map_type); ++} ++ + #define RAID5_P_STRIPE ((u64)-2) + #define RAID6_Q_STRIPE ((u64)-1) + +@@ -182,19 +187,12 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num); + void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); + +-void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, +- unsigned int pgoff, u64 logical); +- + struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + struct btrfs_io_context *bioc, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors); + void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); + +-struct btrfs_raid_bio * +-raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc); +-void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); +- + int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); + void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); + +diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c +index ef13a9d4e370..09b1988d1791 100644 +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -1266,7 +1266,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, + level = btrfs_header_level(parent); + 
ASSERT(level >= lowest_level); + +- ret = btrfs_bin_search(parent, &key, &slot); ++ ret = btrfs_bin_search(parent, 0, &key, &slot); + if (ret < 0) + break; + if (ret && slot > 0) +@@ -2407,7 +2407,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, + + if (upper->eb && !upper->locked) { + if (!lowest) { +- ret = btrfs_bin_search(upper->eb, key, &slot); ++ ret = btrfs_bin_search(upper->eb, 0, key, &slot); + if (ret < 0) + goto next; + BUG_ON(ret); +@@ -2441,7 +2441,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, + slot = path->slots[upper->level]; + btrfs_release_path(path); + } else { +- ret = btrfs_bin_search(upper->eb, key, &slot); ++ ret = btrfs_bin_search(upper->eb, 0, key, &slot); + if (ret < 0) + goto next; + BUG_ON(ret); +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 69c93ae333f6..836725a19661 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -38,18 +38,14 @@ + * - add a mode to also read unallocated space + */ + +-struct scrub_block; + struct scrub_ctx; + + /* +- * The following three values only influence the performance. ++ * The following value only influences the performance. + * +- * The last one configures the number of parallel and outstanding I/O +- * operations. The first one configures an upper limit for the number +- * of (dynamically allocated) pages that are added to a bio. ++ * This determines the batch size for stripe submitted in one go. + */ +-#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */ +-#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */ ++#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */ + + /* + * The following value times PAGE_SIZE needs to be large enough to match the +@@ -57,128 +53,124 @@ struct scrub_ctx; + */ + #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) + +-#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE)) ++/* Represent one sector and its needed info to verify the content. */ ++struct scrub_sector_verification { ++ bool is_metadata; + +-/* +- * Maximum number of mirrors that can be available for all profiles counting +- * the target device of dev-replace as one. During an active device replace +- * procedure, the target device of the copy operation is a mirror for the +- * filesystem data as well that can be used to read data in order to repair +- * read errors on other disks. +- * +- * Current value is derived from RAID1C4 with 4 copies. +- */ +-#define BTRFS_MAX_MIRRORS (4 + 1) ++ union { ++ /* ++ * Csum pointer for data csum verification. Should point to a ++ * sector csum inside scrub_stripe::csums. ++ * ++ * NULL if this data sector has no csum. ++ */ ++ u8 *csum; + +-struct scrub_recover { +- refcount_t refs; +- struct btrfs_io_context *bioc; +- u64 map_length; ++ /* ++ * Extra info for metadata verification. All sectors inside a ++ * tree block share the same generation. ++ */ ++ u64 generation; ++ }; + }; + +-struct scrub_sector { +- struct scrub_block *sblock; +- struct list_head list; +- u64 flags; /* extent flags */ +- u64 generation; +- /* Offset in bytes to @sblock. */ +- u32 offset; +- atomic_t refs; +- unsigned int have_csum:1; +- unsigned int io_error:1; +- u8 csum[BTRFS_CSUM_SIZE]; +- +- struct scrub_recover *recover; +-}; ++enum scrub_stripe_flags { ++ /* Set when @mirror_num, @dev, @physical and @logical are set. 
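The new scrub_sector_verification structure above keys a small union off is_metadata: data sectors carry a pointer into the stripe's checksum buffer, while metadata sectors only need the expected generation. A compact userspace illustration of that layout, with hypothetical names and a C11 anonymous union:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sector_verification {
        bool is_metadata;
        union {
                const uint8_t *csum;    /* data: points into the per-stripe csum buffer */
                uint64_t generation;    /* metadata: expected tree block generation */
        };
};

int main(void)
{
        uint8_t csums[32] = { 0 };
        struct sector_verification data = { .is_metadata = false, .csum = csums };
        struct sector_verification meta = { .is_metadata = true, .generation = 12345 };

        printf("data csum at %p, metadata generation %llu\n",
               (void *)data.csum, (unsigned long long)meta.generation);
        return 0;
}

Keeping only a pointer or a generation per sector is what lets one scrub_stripe describe a whole stripe with a single small array instead of the old per-sector scrub_sector allocations.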
*/ ++ SCRUB_STRIPE_FLAG_INITIALIZED, + +-struct scrub_bio { +- int index; +- struct scrub_ctx *sctx; +- struct btrfs_device *dev; +- struct bio *bio; +- blk_status_t status; +- u64 logical; +- u64 physical; +- struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO]; +- int sector_count; +- int next_free; +- struct work_struct work; +-}; ++ /* Set when the read-repair is finished. */ ++ SCRUB_STRIPE_FLAG_REPAIR_DONE, + +-struct scrub_block { + /* +- * Each page will have its page::private used to record the logical +- * bytenr. ++ * Set for data stripes if it's triggered from P/Q stripe. ++ * During such scrub, we should not report errors in data stripes, nor ++ * update the accounting. + */ +- struct page *pages[SCRUB_MAX_PAGES]; +- struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK]; +- struct btrfs_device *dev; +- /* Logical bytenr of the sblock */ +- u64 logical; +- u64 physical; +- u64 physical_for_dev_replace; +- /* Length of sblock in bytes */ +- u32 len; +- int sector_count; +- int mirror_num; +- +- atomic_t outstanding_sectors; +- refcount_t refs; /* free mem on transition to zero */ +- struct scrub_ctx *sctx; +- struct scrub_parity *sparity; +- struct { +- unsigned int header_error:1; +- unsigned int checksum_error:1; +- unsigned int no_io_error_seen:1; +- unsigned int generation_error:1; /* also sets header_error */ +- +- /* The following is for the data used to check parity */ +- /* It is for the data with checksum */ +- unsigned int data_corrected:1; +- }; +- struct work_struct work; ++ SCRUB_STRIPE_FLAG_NO_REPORT, + }; + +-/* Used for the chunks with parity stripe such RAID5/6 */ +-struct scrub_parity { +- struct scrub_ctx *sctx; ++#define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) ++ ++/* ++ * Represent one contiguous range with a length of BTRFS_STRIPE_LEN. ++ */ ++struct scrub_stripe { ++ struct scrub_ctx *sctx; ++ struct btrfs_block_group *bg; ++ ++ struct page *pages[SCRUB_STRIPE_PAGES]; ++ struct scrub_sector_verification *sectors; ++ ++ struct btrfs_device *dev; ++ u64 logical; ++ u64 physical; + +- struct btrfs_device *scrub_dev; ++ u16 mirror_num; + +- u64 logic_start; ++ /* Should be BTRFS_STRIPE_LEN / sectorsize. */ ++ u16 nr_sectors; ++ ++ /* ++ * How many data/meta extents are in this stripe. Only for scrub status ++ * reporting purposes. ++ */ ++ u16 nr_data_extents; ++ u16 nr_meta_extents; + +- u64 logic_end; ++ atomic_t pending_io; ++ wait_queue_head_t io_wait; ++ wait_queue_head_t repair_wait; + +- int nsectors; ++ /* ++ * Indicate the states of the stripe. Bits are defined in ++ * scrub_stripe_flags enum. ++ */ ++ unsigned long state; + +- u32 stripe_len; ++ /* Indicate which sectors are covered by extent items. */ ++ unsigned long extent_sector_bitmap; + +- refcount_t refs; ++ /* ++ * The errors hit during the initial read of the stripe. ++ * ++ * Would be utilized for error reporting and repair. ++ */ ++ unsigned long init_error_bitmap; + +- struct list_head sectors_list; ++ /* ++ * The following error bitmaps are all for the current status. ++ * Every time we submit a new read, these bitmaps may be updated. ++ * ++ * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; ++ * ++ * IO and csum errors can happen for both metadata and data. ++ */ ++ unsigned long error_bitmap; ++ unsigned long io_error_bitmap; ++ unsigned long csum_error_bitmap; ++ unsigned long meta_error_bitmap; + +- /* Work of parity check and repair */ +- struct work_struct work; ++ /* For writeback (repair or replace) error reporting. 
*/ ++ unsigned long write_error_bitmap; + +- /* Mark the parity blocks which have data */ +- unsigned long dbitmap; ++ /* Writeback can be concurrent, thus we need to protect the bitmap. */ ++ spinlock_t write_error_lock; + + /* +- * Mark the parity blocks which have data, but errors happen when +- * read data or check data ++ * Checksum for the whole stripe if this stripe is inside a data block ++ * group. + */ +- unsigned long ebitmap; ++ u8 *csums; ++ ++ struct work_struct work; + }; + + struct scrub_ctx { +- struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; ++ struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX]; ++ struct scrub_stripe *raid56_data_stripes; + struct btrfs_fs_info *fs_info; + int first_free; +- int curr; +- atomic_t bios_in_flight; +- atomic_t workers_pending; +- spinlock_t list_lock; +- wait_queue_head_t list_wait; ++ int cur_stripe; + struct list_head csum_list; + atomic_t cancel_req; + int readonly; +@@ -191,10 +183,8 @@ struct scrub_ctx { + int is_dev_replace; + u64 write_pointer; + +- struct scrub_bio *wr_curr_bio; + struct mutex wr_lock; + struct btrfs_device *wr_tgtdev; +- bool flush_all_writes; + + /* + * statistics +@@ -221,239 +211,66 @@ struct scrub_warning { + struct btrfs_device *dev; + }; + +-struct full_stripe_lock { +- struct rb_node node; +- u64 logical; +- u64 refs; +- struct mutex mutex; +-}; +- +-#ifndef CONFIG_64BIT +-/* This structure is for architectures whose (void *) is smaller than u64 */ +-struct scrub_page_private { +- u64 logical; +-}; +-#endif +- +-static int attach_scrub_page_private(struct page *page, u64 logical) +-{ +-#ifdef CONFIG_64BIT +- attach_page_private(page, (void *)logical); +- return 0; +-#else +- struct scrub_page_private *spp; +- +- spp = kmalloc(sizeof(*spp), GFP_KERNEL); +- if (!spp) +- return -ENOMEM; +- spp->logical = logical; +- attach_page_private(page, (void *)spp); +- return 0; +-#endif +-} +- +-static void detach_scrub_page_private(struct page *page) +-{ +-#ifdef CONFIG_64BIT +- detach_page_private(page); +- return; +-#else +- struct scrub_page_private *spp; +- +- spp = detach_page_private(page); +- kfree(spp); +- return; +-#endif +-} +- +-static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx, +- struct btrfs_device *dev, +- u64 logical, u64 physical, +- u64 physical_for_dev_replace, +- int mirror_num) +-{ +- struct scrub_block *sblock; +- +- sblock = kzalloc(sizeof(*sblock), GFP_KERNEL); +- if (!sblock) +- return NULL; +- refcount_set(&sblock->refs, 1); +- sblock->sctx = sctx; +- sblock->logical = logical; +- sblock->physical = physical; +- sblock->physical_for_dev_replace = physical_for_dev_replace; +- sblock->dev = dev; +- sblock->mirror_num = mirror_num; +- sblock->no_io_error_seen = 1; +- /* +- * Scrub_block::pages will be allocated at alloc_scrub_sector() when +- * the corresponding page is not allocated. +- */ +- return sblock; +-} +- +-/* +- * Allocate a new scrub sector and attach it to @sblock. +- * +- * Will also allocate new pages for @sblock if needed. +- */ +-static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, +- u64 logical) ++static void release_scrub_stripe(struct scrub_stripe *stripe) + { +- const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT; +- struct scrub_sector *ssector; +- +- /* We must never have scrub_block exceed U32_MAX in size. 
*/ +- ASSERT(logical - sblock->logical < U32_MAX); +- +- ssector = kzalloc(sizeof(*ssector), GFP_KERNEL); +- if (!ssector) +- return NULL; +- +- /* Allocate a new page if the slot is not allocated */ +- if (!sblock->pages[page_index]) { +- int ret; ++ if (!stripe) ++ return; + +- sblock->pages[page_index] = alloc_page(GFP_KERNEL); +- if (!sblock->pages[page_index]) { +- kfree(ssector); +- return NULL; +- } +- ret = attach_scrub_page_private(sblock->pages[page_index], +- sblock->logical + (page_index << PAGE_SHIFT)); +- if (ret < 0) { +- kfree(ssector); +- __free_page(sblock->pages[page_index]); +- sblock->pages[page_index] = NULL; +- return NULL; +- } ++ for (int i = 0; i < SCRUB_STRIPE_PAGES; i++) { ++ if (stripe->pages[i]) ++ __free_page(stripe->pages[i]); ++ stripe->pages[i] = NULL; + } +- +- atomic_set(&ssector->refs, 1); +- ssector->sblock = sblock; +- /* The sector to be added should not be used */ +- ASSERT(sblock->sectors[sblock->sector_count] == NULL); +- ssector->offset = logical - sblock->logical; +- +- /* The sector count must be smaller than the limit */ +- ASSERT(sblock->sector_count < SCRUB_MAX_SECTORS_PER_BLOCK); +- +- sblock->sectors[sblock->sector_count] = ssector; +- sblock->sector_count++; +- sblock->len += sblock->sctx->fs_info->sectorsize; +- +- return ssector; +-} +- +-static struct page *scrub_sector_get_page(struct scrub_sector *ssector) +-{ +- struct scrub_block *sblock = ssector->sblock; +- pgoff_t index; +- /* +- * When calling this function, ssector must be alreaday attached to the +- * parent sblock. +- */ +- ASSERT(sblock); +- +- /* The range should be inside the sblock range */ +- ASSERT(ssector->offset < sblock->len); +- +- index = ssector->offset >> PAGE_SHIFT; +- ASSERT(index < SCRUB_MAX_PAGES); +- ASSERT(sblock->pages[index]); +- ASSERT(PagePrivate(sblock->pages[index])); +- return sblock->pages[index]; ++ kfree(stripe->sectors); ++ kfree(stripe->csums); ++ stripe->sectors = NULL; ++ stripe->csums = NULL; ++ stripe->sctx = NULL; ++ stripe->state = 0; + } + +-static unsigned int scrub_sector_get_page_offset(struct scrub_sector *ssector) ++static int init_scrub_stripe(struct btrfs_fs_info *fs_info, ++ struct scrub_stripe *stripe) + { +- struct scrub_block *sblock = ssector->sblock; ++ int ret; + +- /* +- * When calling this function, ssector must be already attached to the +- * parent sblock. 
+- */ +- ASSERT(sblock); ++ memset(stripe, 0, sizeof(*stripe)); + +- /* The range should be inside the sblock range */ +- ASSERT(ssector->offset < sblock->len); ++ stripe->nr_sectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; ++ stripe->state = 0; + +- return offset_in_page(ssector->offset); +-} ++ init_waitqueue_head(&stripe->io_wait); ++ init_waitqueue_head(&stripe->repair_wait); ++ atomic_set(&stripe->pending_io, 0); ++ spin_lock_init(&stripe->write_error_lock); + +-static char *scrub_sector_get_kaddr(struct scrub_sector *ssector) +-{ +- return page_address(scrub_sector_get_page(ssector)) + +- scrub_sector_get_page_offset(ssector); ++ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages); ++ if (ret < 0) ++ goto error; ++ ++ stripe->sectors = kcalloc(stripe->nr_sectors, ++ sizeof(struct scrub_sector_verification), ++ GFP_KERNEL); ++ if (!stripe->sectors) ++ goto error; ++ ++ stripe->csums = kcalloc(BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits, ++ fs_info->csum_size, GFP_KERNEL); ++ if (!stripe->csums) ++ goto error; ++ return 0; ++error: ++ release_scrub_stripe(stripe); ++ return -ENOMEM; + } + +-static int bio_add_scrub_sector(struct bio *bio, struct scrub_sector *ssector, +- unsigned int len) ++static void wait_scrub_stripe_io(struct scrub_stripe *stripe) + { +- return bio_add_page(bio, scrub_sector_get_page(ssector), len, +- scrub_sector_get_page_offset(ssector)); ++ wait_event(stripe->io_wait, atomic_read(&stripe->pending_io) == 0); + } + +-static int scrub_setup_recheck_block(struct scrub_block *original_sblock, +- struct scrub_block *sblocks_for_recheck[]); +-static void scrub_recheck_block(struct btrfs_fs_info *fs_info, +- struct scrub_block *sblock, +- int retry_failed_mirror); +-static void scrub_recheck_block_checksum(struct scrub_block *sblock); +-static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, +- struct scrub_block *sblock_good); +-static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, +- struct scrub_block *sblock_good, +- int sector_num, int force_write); +-static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); +-static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, +- int sector_num); +-static int scrub_checksum_data(struct scrub_block *sblock); +-static int scrub_checksum_tree_block(struct scrub_block *sblock); +-static int scrub_checksum_super(struct scrub_block *sblock); +-static void scrub_block_put(struct scrub_block *sblock); +-static void scrub_sector_get(struct scrub_sector *sector); +-static void scrub_sector_put(struct scrub_sector *sector); +-static void scrub_parity_get(struct scrub_parity *sparity); +-static void scrub_parity_put(struct scrub_parity *sparity); +-static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, +- u64 physical, struct btrfs_device *dev, u64 flags, +- u64 gen, int mirror_num, u8 *csum, +- u64 physical_for_dev_replace); +-static void scrub_bio_end_io(struct bio *bio); +-static void scrub_bio_end_io_worker(struct work_struct *work); +-static void scrub_block_complete(struct scrub_block *sblock); +-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, +- u64 extent_logical, u32 extent_len, +- u64 *extent_physical, +- struct btrfs_device **extent_dev, +- int *extent_mirror_num); +-static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, +- struct scrub_sector *sector); +-static void scrub_wr_submit(struct scrub_ctx *sctx); +-static void scrub_wr_bio_end_io(struct bio *bio); +-static void 
scrub_wr_bio_end_io_worker(struct work_struct *work); + static void scrub_put_ctx(struct scrub_ctx *sctx); + +-static inline int scrub_is_page_on_raid56(struct scrub_sector *sector) +-{ +- return sector->recover && +- (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); +-} +- +-static void scrub_pending_bio_inc(struct scrub_ctx *sctx) +-{ +- refcount_inc(&sctx->refs); +- atomic_inc(&sctx->bios_in_flight); +-} +- +-static void scrub_pending_bio_dec(struct scrub_ctx *sctx) +-{ +- atomic_dec(&sctx->bios_in_flight); +- wake_up(&sctx->list_wait); +- scrub_put_ctx(sctx); +-} +- + static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) + { + while (atomic_read(&fs_info->scrub_pause_req)) { +@@ -486,223 +303,6 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) + scrub_pause_off(fs_info); + } + +-/* +- * Insert new full stripe lock into full stripe locks tree +- * +- * Return pointer to existing or newly inserted full_stripe_lock structure if +- * everything works well. +- * Return ERR_PTR(-ENOMEM) if we failed to allocate memory +- * +- * NOTE: caller must hold full_stripe_locks_root->lock before calling this +- * function +- */ +-static struct full_stripe_lock *insert_full_stripe_lock( +- struct btrfs_full_stripe_locks_tree *locks_root, +- u64 fstripe_logical) +-{ +- struct rb_node **p; +- struct rb_node *parent = NULL; +- struct full_stripe_lock *entry; +- struct full_stripe_lock *ret; +- +- lockdep_assert_held(&locks_root->lock); +- +- p = &locks_root->root.rb_node; +- while (*p) { +- parent = *p; +- entry = rb_entry(parent, struct full_stripe_lock, node); +- if (fstripe_logical < entry->logical) { +- p = &(*p)->rb_left; +- } else if (fstripe_logical > entry->logical) { +- p = &(*p)->rb_right; +- } else { +- entry->refs++; +- return entry; +- } +- } +- +- /* +- * Insert new lock. +- */ +- ret = kmalloc(sizeof(*ret), GFP_KERNEL); +- if (!ret) +- return ERR_PTR(-ENOMEM); +- ret->logical = fstripe_logical; +- ret->refs = 1; +- mutex_init(&ret->mutex); +- +- rb_link_node(&ret->node, parent, p); +- rb_insert_color(&ret->node, &locks_root->root); +- return ret; +-} +- +-/* +- * Search for a full stripe lock of a block group +- * +- * Return pointer to existing full stripe lock if found +- * Return NULL if not found +- */ +-static struct full_stripe_lock *search_full_stripe_lock( +- struct btrfs_full_stripe_locks_tree *locks_root, +- u64 fstripe_logical) +-{ +- struct rb_node *node; +- struct full_stripe_lock *entry; +- +- lockdep_assert_held(&locks_root->lock); +- +- node = locks_root->root.rb_node; +- while (node) { +- entry = rb_entry(node, struct full_stripe_lock, node); +- if (fstripe_logical < entry->logical) +- node = node->rb_left; +- else if (fstripe_logical > entry->logical) +- node = node->rb_right; +- else +- return entry; +- } +- return NULL; +-} +- +-/* +- * Helper to get full stripe logical from a normal bytenr. +- * +- * Caller must ensure @cache is a RAID56 block group. +- */ +-static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr) +-{ +- u64 ret; +- +- /* +- * Due to chunk item size limit, full stripe length should not be +- * larger than U32_MAX. Just a sanity check here. +- */ +- WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX); +- +- /* +- * round_down() can only handle power of 2, while RAID56 full +- * stripe length can be 64KiB * n, so we need to manually round down. 
+- */ +- ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) * +- cache->full_stripe_len + cache->start; +- return ret; +-} +- +-/* +- * Lock a full stripe to avoid concurrency of recovery and read +- * +- * It's only used for profiles with parities (RAID5/6), for other profiles it +- * does nothing. +- * +- * Return 0 if we locked full stripe covering @bytenr, with a mutex held. +- * So caller must call unlock_full_stripe() at the same context. +- * +- * Return <0 if encounters error. +- */ +-static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, +- bool *locked_ret) +-{ +- struct btrfs_block_group *bg_cache; +- struct btrfs_full_stripe_locks_tree *locks_root; +- struct full_stripe_lock *existing; +- u64 fstripe_start; +- int ret = 0; +- +- *locked_ret = false; +- bg_cache = btrfs_lookup_block_group(fs_info, bytenr); +- if (!bg_cache) { +- ASSERT(0); +- return -ENOENT; +- } +- +- /* Profiles not based on parity don't need full stripe lock */ +- if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) +- goto out; +- locks_root = &bg_cache->full_stripe_locks_root; +- +- fstripe_start = get_full_stripe_logical(bg_cache, bytenr); +- +- /* Now insert the full stripe lock */ +- mutex_lock(&locks_root->lock); +- existing = insert_full_stripe_lock(locks_root, fstripe_start); +- mutex_unlock(&locks_root->lock); +- if (IS_ERR(existing)) { +- ret = PTR_ERR(existing); +- goto out; +- } +- mutex_lock(&existing->mutex); +- *locked_ret = true; +-out: +- btrfs_put_block_group(bg_cache); +- return ret; +-} +- +-/* +- * Unlock a full stripe. +- * +- * NOTE: Caller must ensure it's the same context calling corresponding +- * lock_full_stripe(). +- * +- * Return 0 if we unlock full stripe without problem. +- * Return <0 for error +- */ +-static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, +- bool locked) +-{ +- struct btrfs_block_group *bg_cache; +- struct btrfs_full_stripe_locks_tree *locks_root; +- struct full_stripe_lock *fstripe_lock; +- u64 fstripe_start; +- bool freeit = false; +- int ret = 0; +- +- /* If we didn't acquire full stripe lock, no need to continue */ +- if (!locked) +- return 0; +- +- bg_cache = btrfs_lookup_block_group(fs_info, bytenr); +- if (!bg_cache) { +- ASSERT(0); +- return -ENOENT; +- } +- if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) +- goto out; +- +- locks_root = &bg_cache->full_stripe_locks_root; +- fstripe_start = get_full_stripe_logical(bg_cache, bytenr); +- +- mutex_lock(&locks_root->lock); +- fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start); +- /* Unpaired unlock_full_stripe() detected */ +- if (!fstripe_lock) { +- WARN_ON(1); +- ret = -ENOENT; +- mutex_unlock(&locks_root->lock); +- goto out; +- } +- +- if (fstripe_lock->refs == 0) { +- WARN_ON(1); +- btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow", +- fstripe_lock->logical); +- } else { +- fstripe_lock->refs--; +- } +- +- if (fstripe_lock->refs == 0) { +- rb_erase(&fstripe_lock->node, &locks_root->root); +- freeit = true; +- } +- mutex_unlock(&locks_root->lock); +- +- mutex_unlock(&fstripe_lock->mutex); +- if (freeit) +- kfree(fstripe_lock); +-out: +- btrfs_put_block_group(bg_cache); +- return ret; +-} +- + static void scrub_free_csums(struct scrub_ctx *sctx) + { + while (!list_empty(&sctx->csum_list)) { +@@ -721,24 +321,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) + if (!sctx) + return; + +- /* this can happen when scrub is cancelled */ +- if (sctx->curr != -1) { +- struct scrub_bio *sbio = 
sctx->bios[sctx->curr]; +- +- for (i = 0; i < sbio->sector_count; i++) +- scrub_block_put(sbio->sectors[i]->sblock); +- bio_put(sbio->bio); +- } +- +- for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { +- struct scrub_bio *sbio = sctx->bios[i]; +- +- if (!sbio) +- break; +- kfree(sbio); +- } ++ for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) ++ release_scrub_stripe(&sctx->stripes[i]); + +- kfree(sctx->wr_curr_bio); + scrub_free_csums(sctx); + kfree(sctx); + } +@@ -760,45 +345,26 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( + goto nomem; + refcount_set(&sctx->refs, 1); + sctx->is_dev_replace = is_dev_replace; +- sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO; +- sctx->curr = -1; + sctx->fs_info = fs_info; + INIT_LIST_HEAD(&sctx->csum_list); +- for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { +- struct scrub_bio *sbio; ++ for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { ++ int ret; + +- sbio = kzalloc(sizeof(*sbio), GFP_KERNEL); +- if (!sbio) ++ ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); ++ if (ret < 0) + goto nomem; +- sctx->bios[i] = sbio; +- +- sbio->index = i; +- sbio->sctx = sctx; +- sbio->sector_count = 0; +- INIT_WORK(&sbio->work, scrub_bio_end_io_worker); +- +- if (i != SCRUB_BIOS_PER_SCTX - 1) +- sctx->bios[i]->next_free = i + 1; +- else +- sctx->bios[i]->next_free = -1; ++ sctx->stripes[i].sctx = sctx; + } + sctx->first_free = 0; +- atomic_set(&sctx->bios_in_flight, 0); +- atomic_set(&sctx->workers_pending, 0); + atomic_set(&sctx->cancel_req, 0); + +- spin_lock_init(&sctx->list_lock); + spin_lock_init(&sctx->stat_lock); +- init_waitqueue_head(&sctx->list_wait); + sctx->throttle_deadline = 0; + +- WARN_ON(sctx->wr_curr_bio != NULL); + mutex_init(&sctx->wr_lock); +- sctx->wr_curr_bio = NULL; + if (is_dev_replace) { + WARN_ON(!fs_info->dev_replace.tgtdev); + sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; +- sctx->flush_all_writes = false; + } + + return sctx; +@@ -898,10 +464,10 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + return 0; + } + +-static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) ++static void scrub_print_common_warning(const char *errstr, struct btrfs_device *dev, ++ bool is_super, u64 logical, u64 physical) + { +- struct btrfs_device *dev; +- struct btrfs_fs_info *fs_info; ++ struct btrfs_fs_info *fs_info = dev->fs_info; + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *eb; +@@ -914,22 +480,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) + u8 ref_level = 0; + int ret; + +- WARN_ON(sblock->sector_count < 1); +- dev = sblock->dev; +- fs_info = sblock->sctx->fs_info; +- + /* Super block error, no need to search extent tree. 
*/ +- if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { ++ if (is_super) { + btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", +- errstr, btrfs_dev_name(dev), sblock->physical); ++ errstr, btrfs_dev_name(dev), physical); + return; + } + path = btrfs_alloc_path(); + if (!path) + return; + +- swarn.physical = sblock->physical; +- swarn.logical = sblock->logical; ++ swarn.physical = physical; ++ swarn.logical = logical; + swarn.errstr = errstr; + swarn.dev = NULL; + +@@ -978,1921 +540,717 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) + btrfs_free_path(path); + } + +-static inline void scrub_get_recover(struct scrub_recover *recover) ++static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) + { +- refcount_inc(&recover->refs); ++ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) ++ return 2; ++ else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) ++ return 3; ++ else ++ return (int)bioc->num_stripes; + } + +-static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, +- struct scrub_recover *recover) ++static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, ++ u64 full_stripe_logical, ++ int nstripes, int mirror, ++ int *stripe_index, ++ u64 *stripe_offset) + { +- if (refcount_dec_and_test(&recover->refs)) { +- btrfs_bio_counter_dec(fs_info); +- btrfs_put_bioc(recover->bioc); +- kfree(recover); ++ int i; ++ ++ if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ++ const int nr_data_stripes = (map_type & BTRFS_BLOCK_GROUP_RAID5) ? ++ nstripes - 1 : nstripes - 2; ++ ++ /* RAID5/6 */ ++ for (i = 0; i < nr_data_stripes; i++) { ++ const u64 data_stripe_start = full_stripe_logical + ++ (i * BTRFS_STRIPE_LEN); ++ ++ if (logical >= data_stripe_start && ++ logical < data_stripe_start + BTRFS_STRIPE_LEN) ++ break; ++ } ++ ++ *stripe_index = i; ++ *stripe_offset = (logical - full_stripe_logical) & ++ BTRFS_STRIPE_LEN_MASK; ++ } else { ++ /* The other RAID type */ ++ *stripe_index = mirror; ++ *stripe_offset = 0; + } + } + +-/* +- * scrub_handle_errored_block gets called when either verification of the +- * sectors failed or the bio failed to read, e.g. with EIO. In the latter +- * case, this function handles all sectors in the bio, even though only one +- * may be bad. +- * The goal of this function is to repair the errored block by using the +- * contents of one of the mirrors. +- */ +-static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) ++static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) + { +- struct scrub_ctx *sctx = sblock_to_check->sctx; +- struct btrfs_device *dev = sblock_to_check->dev; +- struct btrfs_fs_info *fs_info; +- u64 logical; +- unsigned int failed_mirror_index; +- unsigned int is_metadata; +- unsigned int have_csum; +- /* One scrub_block for each mirror */ +- struct scrub_block *sblocks_for_recheck[BTRFS_MAX_MIRRORS] = { 0 }; +- struct scrub_block *sblock_bad; +- int ret; +- int mirror_index; +- int sector_num; +- int success; +- bool full_stripe_locked; +- unsigned int nofs_flag; +- static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, +- DEFAULT_RATELIMIT_BURST); ++ int ret = 0; ++ u64 length; + +- BUG_ON(sblock_to_check->sector_count < 1); +- fs_info = sctx->fs_info; +- if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { +- /* +- * If we find an error in a super block, we just report it. 
+- * They will get written with the next transaction commit +- * anyway +- */ +- scrub_print_warning("super block error", sblock_to_check); +- spin_lock(&sctx->stat_lock); +- ++sctx->stat.super_errors; +- spin_unlock(&sctx->stat_lock); +- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); ++ if (!btrfs_is_zoned(sctx->fs_info)) + return 0; +- } +- logical = sblock_to_check->logical; +- ASSERT(sblock_to_check->mirror_num); +- failed_mirror_index = sblock_to_check->mirror_num - 1; +- is_metadata = !(sblock_to_check->sectors[0]->flags & +- BTRFS_EXTENT_FLAG_DATA); +- have_csum = sblock_to_check->sectors[0]->have_csum; +- +- if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) ++ ++ if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) + return 0; + +- /* +- * We must use GFP_NOFS because the scrub task might be waiting for a +- * worker task executing this function and in turn a transaction commit +- * might be waiting the scrub task to pause (which needs to wait for all +- * the worker tasks to complete before pausing). +- * We do allocations in the workers through insert_full_stripe_lock() +- * and scrub_add_sector_to_wr_bio(), which happens down the call chain of +- * this function. +- */ +- nofs_flag = memalloc_nofs_save(); +- /* +- * For RAID5/6, race can happen for a different device scrub thread. +- * For data corruption, Parity and Data threads will both try +- * to recovery the data. +- * Race can lead to doubly added csum error, or even unrecoverable +- * error. +- */ +- ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); +- if (ret < 0) { +- memalloc_nofs_restore(nofs_flag); +- spin_lock(&sctx->stat_lock); +- if (ret == -ENOMEM) +- sctx->stat.malloc_errors++; +- sctx->stat.read_errors++; +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- return ret; ++ if (sctx->write_pointer < physical) { ++ length = physical - sctx->write_pointer; ++ ++ ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, ++ sctx->write_pointer, length); ++ if (!ret) ++ sctx->write_pointer = physical; + } ++ return ret; ++} + +- /* +- * read all mirrors one after the other. This includes to +- * re-read the extent or metadata block that failed (that was +- * the cause that this fixup code is called) another time, +- * sector by sector this time in order to know which sectors +- * caused I/O errors and which ones are good (for all mirrors). +- * It is the goal to handle the situation when more than one +- * mirror contains I/O errors, but the errors do not +- * overlap, i.e. the data can be repaired by selecting the +- * sectors from those mirrors without I/O error on the +- * particular sectors. One example (with blocks >= 2 * sectorsize) +- * would be that mirror #1 has an I/O error on the first sector, +- * the second sector is good, and mirror #2 has an I/O error on +- * the second sector, but the first sector is good. +- * Then the first sector of the first mirror can be repaired by +- * taking the first sector of the second mirror, and the +- * second sector of the second mirror can be repaired by +- * copying the contents of the 2nd sector of the 1st mirror. +- * One more note: if the sectors of one mirror contain I/O +- * errors, the checksum cannot be verified. In order to get +- * the best data for repairing, the first attempt is to find +- * a mirror without I/O errors and with a validated checksum. +- * Only if this is not possible, the sectors are picked from +- * mirrors with I/O errors without considering the checksum. 
+- * If the latter is the case, at the end, the checksum of the +- * repaired area is verified in order to correctly maintain +- * the statistics. +- */ +- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { +- /* +- * Note: the two members refs and outstanding_sectors are not +- * used in the blocks that are used for the recheck procedure. +- * +- * But alloc_scrub_block() will initialize sblock::ref anyway, +- * so we can use scrub_block_put() to clean them up. +- * +- * And here we don't setup the physical/dev for the sblock yet, +- * they will be correctly initialized in scrub_setup_recheck_block(). +- */ +- sblocks_for_recheck[mirror_index] = alloc_scrub_block(sctx, NULL, +- logical, 0, 0, mirror_index); +- if (!sblocks_for_recheck[mirror_index]) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- sctx->stat.read_errors++; +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); +- goto out; +- } +- } +- +- /* Setup the context, map the logical blocks and alloc the sectors */ +- ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); +- if (ret) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.read_errors++; +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); +- goto out; +- } +- BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS); +- sblock_bad = sblocks_for_recheck[failed_mirror_index]; +- +- /* build and submit the bios for the failed mirror, check checksums */ +- scrub_recheck_block(fs_info, sblock_bad, 1); +- +- if (!sblock_bad->header_error && !sblock_bad->checksum_error && +- sblock_bad->no_io_error_seen) { +- /* +- * The error disappeared after reading sector by sector, or +- * the area was part of a huge bio and other parts of the +- * bio caused I/O errors, or the block layer merged several +- * read requests into one and the error is caused by a +- * different bio (usually one of the two latter cases is +- * the cause) +- */ +- spin_lock(&sctx->stat_lock); +- sctx->stat.unverified_errors++; +- sblock_to_check->data_corrected = 1; +- spin_unlock(&sctx->stat_lock); +- +- if (sctx->is_dev_replace) +- scrub_write_block_to_dev_replace(sblock_bad); +- goto out; +- } +- +- if (!sblock_bad->no_io_error_seen) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.read_errors++; +- spin_unlock(&sctx->stat_lock); +- if (__ratelimit(&rs)) +- scrub_print_warning("i/o error", sblock_to_check); +- btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); +- } else if (sblock_bad->checksum_error) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.csum_errors++; +- spin_unlock(&sctx->stat_lock); +- if (__ratelimit(&rs)) +- scrub_print_warning("checksum error", sblock_to_check); +- btrfs_dev_stat_inc_and_print(dev, +- BTRFS_DEV_STAT_CORRUPTION_ERRS); +- } else if (sblock_bad->header_error) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.verify_errors++; +- spin_unlock(&sctx->stat_lock); +- if (__ratelimit(&rs)) +- scrub_print_warning("checksum/header error", +- sblock_to_check); +- if (sblock_bad->generation_error) +- btrfs_dev_stat_inc_and_print(dev, +- BTRFS_DEV_STAT_GENERATION_ERRS); +- else +- btrfs_dev_stat_inc_and_print(dev, +- BTRFS_DEV_STAT_CORRUPTION_ERRS); +- } +- +- if (sctx->readonly) { +- ASSERT(!sctx->is_dev_replace); +- goto out; +- } +- +- /* +- * now build and submit the bios for the other mirrors, check +- * checksums. 
+- * First try to pick the mirror which is completely without I/O +- * errors and also does not have a checksum error. +- * If one is found, and if a checksum is present, the full block +- * that is known to contain an error is rewritten. Afterwards +- * the block is known to be corrected. +- * If a mirror is found which is completely correct, and no +- * checksum is present, only those sectors are rewritten that had +- * an I/O error in the block to be repaired, since it cannot be +- * determined, which copy of the other sectors is better (and it +- * could happen otherwise that a correct sector would be +- * overwritten by a bad one). +- */ +- for (mirror_index = 0; ;mirror_index++) { +- struct scrub_block *sblock_other; +- +- if (mirror_index == failed_mirror_index) +- continue; +- +- /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ +- if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { +- if (mirror_index >= BTRFS_MAX_MIRRORS) +- break; +- if (!sblocks_for_recheck[mirror_index]->sector_count) +- break; +- +- sblock_other = sblocks_for_recheck[mirror_index]; +- } else { +- struct scrub_recover *r = sblock_bad->sectors[0]->recover; +- int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; +- +- if (mirror_index >= max_allowed) +- break; +- if (!sblocks_for_recheck[1]->sector_count) +- break; +- +- ASSERT(failed_mirror_index == 0); +- sblock_other = sblocks_for_recheck[1]; +- sblock_other->mirror_num = 1 + mirror_index; +- } +- +- /* build and submit the bios, check checksums */ +- scrub_recheck_block(fs_info, sblock_other, 0); +- +- if (!sblock_other->header_error && +- !sblock_other->checksum_error && +- sblock_other->no_io_error_seen) { +- if (sctx->is_dev_replace) { +- scrub_write_block_to_dev_replace(sblock_other); +- goto corrected_error; +- } else { +- ret = scrub_repair_block_from_good_copy( +- sblock_bad, sblock_other); +- if (!ret) +- goto corrected_error; +- } +- } +- } +- +- if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace) +- goto did_not_correct_error; +- +- /* +- * In case of I/O errors in the area that is supposed to be +- * repaired, continue by picking good copies of those sectors. +- * Select the good sectors from mirrors to rewrite bad sectors from +- * the area to fix. Afterwards verify the checksum of the block +- * that is supposed to be repaired. This verification step is +- * only done for the purpose of statistic counting and for the +- * final scrub report, whether errors remain. +- * A perfect algorithm could make use of the checksum and try +- * all possible combinations of sectors from the different mirrors +- * until the checksum verification succeeds. For example, when +- * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector +- * of mirror #2 is readable but the final checksum test fails, +- * then the 2nd sector of mirror #3 could be tried, whether now +- * the final checksum succeeds. But this would be a rare +- * exception and is therefore not implemented. At least it is +- * avoided that the good copy is overwritten. +- * A more useful improvement would be to pick the sectors +- * without I/O error based on sector sizes (512 bytes on legacy +- * disks) instead of on sectorsize. Then maybe 512 byte of one +- * mirror could be repaired by taking 512 byte of a different +- * mirror, even if other 512 byte sectors in the same sectorsize +- * area are unreadable. 
+- */ +- success = 1; +- for (sector_num = 0; sector_num < sblock_bad->sector_count; +- sector_num++) { +- struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; +- struct scrub_block *sblock_other = NULL; +- +- /* Skip no-io-error sectors in scrub */ +- if (!sector_bad->io_error && !sctx->is_dev_replace) +- continue; +- +- if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) { +- /* +- * In case of dev replace, if raid56 rebuild process +- * didn't work out correct data, then copy the content +- * in sblock_bad to make sure target device is identical +- * to source device, instead of writing garbage data in +- * sblock_for_recheck array to target device. +- */ +- sblock_other = NULL; +- } else if (sector_bad->io_error) { +- /* Try to find no-io-error sector in mirrors */ +- for (mirror_index = 0; +- mirror_index < BTRFS_MAX_MIRRORS && +- sblocks_for_recheck[mirror_index]->sector_count > 0; +- mirror_index++) { +- if (!sblocks_for_recheck[mirror_index]-> +- sectors[sector_num]->io_error) { +- sblock_other = sblocks_for_recheck[mirror_index]; +- break; +- } +- } +- if (!sblock_other) +- success = 0; +- } +- +- if (sctx->is_dev_replace) { +- /* +- * Did not find a mirror to fetch the sector from. +- * scrub_write_sector_to_dev_replace() handles this +- * case (sector->io_error), by filling the block with +- * zeros before submitting the write request +- */ +- if (!sblock_other) +- sblock_other = sblock_bad; +- +- if (scrub_write_sector_to_dev_replace(sblock_other, +- sector_num) != 0) { +- atomic64_inc( +- &fs_info->dev_replace.num_write_errors); +- success = 0; +- } +- } else if (sblock_other) { +- ret = scrub_repair_sector_from_good_copy(sblock_bad, +- sblock_other, +- sector_num, 0); +- if (0 == ret) +- sector_bad->io_error = 0; +- else +- success = 0; +- } +- } +- +- if (success && !sctx->is_dev_replace) { +- if (is_metadata || have_csum) { +- /* +- * need to verify the checksum now that all +- * sectors on disk are repaired (the write +- * request for data to be repaired is on its way). +- * Just be lazy and use scrub_recheck_block() +- * which re-reads the data before the checksum +- * is verified, but most likely the data comes out +- * of the page cache. 
+- */ +- scrub_recheck_block(fs_info, sblock_bad, 1); +- if (!sblock_bad->header_error && +- !sblock_bad->checksum_error && +- sblock_bad->no_io_error_seen) +- goto corrected_error; +- else +- goto did_not_correct_error; +- } else { +-corrected_error: +- spin_lock(&sctx->stat_lock); +- sctx->stat.corrected_errors++; +- sblock_to_check->data_corrected = 1; +- spin_unlock(&sctx->stat_lock); +- btrfs_err_rl_in_rcu(fs_info, +- "fixed up error at logical %llu on dev %s", +- logical, btrfs_dev_name(dev)); +- } +- } else { +-did_not_correct_error: +- spin_lock(&sctx->stat_lock); +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_err_rl_in_rcu(fs_info, +- "unable to fixup (regular) error at logical %llu on dev %s", +- logical, btrfs_dev_name(dev)); +- } +- +-out: +- for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS; mirror_index++) { +- struct scrub_block *sblock = sblocks_for_recheck[mirror_index]; +- struct scrub_recover *recover; +- int sector_index; +- +- /* Not allocated, continue checking the next mirror */ +- if (!sblock) +- continue; +- +- for (sector_index = 0; sector_index < sblock->sector_count; +- sector_index++) { +- /* +- * Here we just cleanup the recover, each sector will be +- * properly cleaned up by later scrub_block_put() +- */ +- recover = sblock->sectors[sector_index]->recover; +- if (recover) { +- scrub_put_recover(fs_info, recover); +- sblock->sectors[sector_index]->recover = NULL; +- } +- } +- scrub_block_put(sblock); +- } +- +- ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); +- memalloc_nofs_restore(nofs_flag); +- if (ret < 0) +- return ret; +- return 0; +-} +- +-static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) ++static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr) + { +- if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) +- return 2; +- else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) +- return 3; +- else +- return (int)bioc->num_stripes; +-} ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT; + +-static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, +- u64 *raid_map, +- int nstripes, int mirror, +- int *stripe_index, +- u64 *stripe_offset) +-{ +- int i; +- +- if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { +- /* RAID5/6 */ +- for (i = 0; i < nstripes; i++) { +- if (raid_map[i] == RAID6_Q_STRIPE || +- raid_map[i] == RAID5_P_STRIPE) +- continue; +- +- if (logical >= raid_map[i] && +- logical < raid_map[i] + BTRFS_STRIPE_LEN) +- break; +- } +- +- *stripe_index = i; +- *stripe_offset = logical - raid_map[i]; +- } else { +- /* The other RAID type */ +- *stripe_index = mirror; +- *stripe_offset = 0; +- } ++ return stripe->pages[page_index]; + } + +-static int scrub_setup_recheck_block(struct scrub_block *original_sblock, +- struct scrub_block *sblocks_for_recheck[]) ++static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe, ++ int sector_nr) + { +- struct scrub_ctx *sctx = original_sblock->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- u64 logical = original_sblock->logical; +- u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; +- u64 generation = original_sblock->sectors[0]->generation; +- u64 flags = original_sblock->sectors[0]->flags; +- u64 have_csum = original_sblock->sectors[0]->have_csum; +- struct scrub_recover *recover; +- struct btrfs_io_context *bioc; +- u64 sublen; +- u64 mapped_length; +- u64 stripe_offset; 
+- int stripe_index; +- int sector_index = 0; +- int mirror_index; +- int nmirrors; +- int ret; +- +- while (length > 0) { +- sublen = min_t(u64, length, fs_info->sectorsize); +- mapped_length = sublen; +- bioc = NULL; ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + +- /* +- * With a length of sectorsize, each returned stripe represents +- * one mirror +- */ +- btrfs_bio_counter_inc_blocked(fs_info); +- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, +- logical, &mapped_length, &bioc); +- if (ret || !bioc || mapped_length < sublen) { +- btrfs_put_bioc(bioc); +- btrfs_bio_counter_dec(fs_info); +- return -EIO; +- } +- +- recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL); +- if (!recover) { +- btrfs_put_bioc(bioc); +- btrfs_bio_counter_dec(fs_info); +- return -ENOMEM; +- } +- +- refcount_set(&recover->refs, 1); +- recover->bioc = bioc; +- recover->map_length = mapped_length; +- +- ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK); +- +- nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); +- +- for (mirror_index = 0; mirror_index < nmirrors; +- mirror_index++) { +- struct scrub_block *sblock; +- struct scrub_sector *sector; +- +- sblock = sblocks_for_recheck[mirror_index]; +- sblock->sctx = sctx; +- +- sector = alloc_scrub_sector(sblock, logical); +- if (!sector) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- scrub_put_recover(fs_info, recover); +- return -ENOMEM; +- } +- sector->flags = flags; +- sector->generation = generation; +- sector->have_csum = have_csum; +- if (have_csum) +- memcpy(sector->csum, +- original_sblock->sectors[0]->csum, +- sctx->fs_info->csum_size); +- +- scrub_stripe_index_and_offset(logical, +- bioc->map_type, +- bioc->raid_map, +- bioc->num_stripes - +- bioc->num_tgtdevs, +- mirror_index, +- &stripe_index, +- &stripe_offset); +- /* +- * We're at the first sector, also populate @sblock +- * physical and dev. +- */ +- if (sector_index == 0) { +- sblock->physical = +- bioc->stripes[stripe_index].physical + +- stripe_offset; +- sblock->dev = bioc->stripes[stripe_index].dev; +- sblock->physical_for_dev_replace = +- original_sblock->physical_for_dev_replace; +- } +- +- BUG_ON(sector_index >= original_sblock->sector_count); +- scrub_get_recover(recover); +- sector->recover = recover; +- } +- scrub_put_recover(fs_info, recover); +- length -= sublen; +- logical += sublen; +- sector_index++; +- } +- +- return 0; +-} +- +-static void scrub_bio_wait_endio(struct bio *bio) +-{ +- complete(bio->bi_private); ++ return offset_in_page(sector_nr << fs_info->sectorsize_bits); + } + +-static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, +- struct bio *bio, +- struct scrub_sector *sector) ++static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) + { +- DECLARE_COMPLETION_ONSTACK(done); +- +- bio->bi_iter.bi_sector = (sector->offset + sector->sblock->logical) >> +- SECTOR_SHIFT; +- bio->bi_private = &done; +- bio->bi_end_io = scrub_bio_wait_endio; +- raid56_parity_recover(bio, sector->recover->bioc, sector->sblock->mirror_num); +- +- wait_for_completion_io(&done); +- return blk_status_to_errno(bio->bi_status); +-} +- +-static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, +- struct scrub_block *sblock) +-{ +- struct scrub_sector *first_sector = sblock->sectors[0]; +- struct bio *bio; +- int i; +- +- /* All sectors in sblock belong to the same stripe on the same device. 
*/ +- ASSERT(sblock->dev); +- if (!sblock->dev->bdev) +- goto out; +- +- bio = bio_alloc(sblock->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); +- +- for (i = 0; i < sblock->sector_count; i++) { +- struct scrub_sector *sector = sblock->sectors[i]; +- +- bio_add_scrub_sector(bio, sector, fs_info->sectorsize); +- } +- +- if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { +- bio_put(bio); +- goto out; +- } +- +- bio_put(bio); +- +- scrub_recheck_block_checksum(sblock); +- +- return; +-out: +- for (i = 0; i < sblock->sector_count; i++) +- sblock->sectors[i]->io_error = 1; +- +- sblock->no_io_error_seen = 0; +-} +- +-/* +- * This function will check the on disk data for checksum errors, header errors +- * and read I/O errors. If any I/O errors happen, the exact sectors which are +- * errored are marked as being bad. The goal is to enable scrub to take those +- * sectors that are not errored from all the mirrors so that the sectors that +- * are errored in the just handled mirror can be repaired. +- */ +-static void scrub_recheck_block(struct btrfs_fs_info *fs_info, +- struct scrub_block *sblock, +- int retry_failed_mirror) +-{ +- int i; +- +- sblock->no_io_error_seen = 1; +- +- /* short cut for raid56 */ +- if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0])) +- return scrub_recheck_block_on_raid56(fs_info, sblock); +- +- for (i = 0; i < sblock->sector_count; i++) { +- struct scrub_sector *sector = sblock->sectors[i]; +- struct bio bio; +- struct bio_vec bvec; +- +- if (sblock->dev->bdev == NULL) { +- sector->io_error = 1; +- sblock->no_io_error_seen = 0; +- continue; +- } +- +- bio_init(&bio, sblock->dev->bdev, &bvec, 1, REQ_OP_READ); +- bio_add_scrub_sector(&bio, sector, fs_info->sectorsize); +- bio.bi_iter.bi_sector = (sblock->physical + sector->offset) >> +- SECTOR_SHIFT; +- +- btrfsic_check_bio(&bio); +- if (submit_bio_wait(&bio)) { +- sector->io_error = 1; +- sblock->no_io_error_seen = 0; +- } +- +- bio_uninit(&bio); +- } +- +- if (sblock->no_io_error_seen) +- scrub_recheck_block_checksum(sblock); +-} +- +-static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) +-{ +- struct btrfs_fs_devices *fs_devices = sector->sblock->dev->fs_devices; +- int ret; +- +- ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); +- return !ret; +-} +- +-static void scrub_recheck_block_checksum(struct scrub_block *sblock) +-{ +- sblock->header_error = 0; +- sblock->checksum_error = 0; +- sblock->generation_error = 0; +- +- if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA) +- scrub_checksum_data(sblock); +- else +- scrub_checksum_tree_block(sblock); +-} +- +-static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, +- struct scrub_block *sblock_good) +-{ +- int i; +- int ret = 0; +- +- for (i = 0; i < sblock_bad->sector_count; i++) { +- int ret_sub; +- +- ret_sub = scrub_repair_sector_from_good_copy(sblock_bad, +- sblock_good, i, 1); +- if (ret_sub) +- ret = ret_sub; +- } +- +- return ret; +-} +- +-static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, +- struct scrub_block *sblock_good, +- int sector_num, int force_write) +-{ +- struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; +- struct scrub_sector *sector_good = sblock_good->sectors[sector_num]; +- struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; +- const u32 sectorsize = fs_info->sectorsize; +- +- if (force_write || sblock_bad->header_error || +- sblock_bad->checksum_error || sector_bad->io_error) { +- struct bio bio; +- struct 
bio_vec bvec; +- int ret; +- +- if (!sblock_bad->dev->bdev) { +- btrfs_warn_rl(fs_info, +- "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); +- return -EIO; +- } +- +- bio_init(&bio, sblock_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); +- bio.bi_iter.bi_sector = (sblock_bad->physical + +- sector_bad->offset) >> SECTOR_SHIFT; +- ret = bio_add_scrub_sector(&bio, sector_good, sectorsize); +- +- btrfsic_check_bio(&bio); +- ret = submit_bio_wait(&bio); +- bio_uninit(&bio); +- +- if (ret) { +- btrfs_dev_stat_inc_and_print(sblock_bad->dev, +- BTRFS_DEV_STAT_WRITE_ERRS); +- atomic64_inc(&fs_info->dev_replace.num_write_errors); +- return -EIO; +- } +- } +- +- return 0; +-} +- +-static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) +-{ +- struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; +- int i; +- +- /* +- * This block is used for the check of the parity on the source device, +- * so the data needn't be written into the destination device. +- */ +- if (sblock->sparity) +- return; +- +- for (i = 0; i < sblock->sector_count; i++) { +- int ret; +- +- ret = scrub_write_sector_to_dev_replace(sblock, i); +- if (ret) +- atomic64_inc(&fs_info->dev_replace.num_write_errors); +- } +-} +- +-static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) +-{ +- const u32 sectorsize = sblock->sctx->fs_info->sectorsize; +- struct scrub_sector *sector = sblock->sectors[sector_num]; +- +- if (sector->io_error) +- memset(scrub_sector_get_kaddr(sector), 0, sectorsize); +- +- return scrub_add_sector_to_wr_bio(sblock->sctx, sector); +-} +- +-static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) +-{ +- int ret = 0; +- u64 length; +- +- if (!btrfs_is_zoned(sctx->fs_info)) +- return 0; +- +- if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) +- return 0; +- +- if (sctx->write_pointer < physical) { +- length = physical - sctx->write_pointer; +- +- ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev, +- sctx->write_pointer, length); +- if (!ret) +- sctx->write_pointer = physical; +- } +- return ret; +-} +- +-static void scrub_block_get(struct scrub_block *sblock) +-{ +- refcount_inc(&sblock->refs); +-} +- +-static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, +- struct scrub_sector *sector) +-{ +- struct scrub_block *sblock = sector->sblock; +- struct scrub_bio *sbio; +- int ret; +- const u32 sectorsize = sctx->fs_info->sectorsize; +- +- mutex_lock(&sctx->wr_lock); +-again: +- if (!sctx->wr_curr_bio) { +- sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio), +- GFP_KERNEL); +- if (!sctx->wr_curr_bio) { +- mutex_unlock(&sctx->wr_lock); +- return -ENOMEM; +- } +- sctx->wr_curr_bio->sctx = sctx; +- sctx->wr_curr_bio->sector_count = 0; +- } +- sbio = sctx->wr_curr_bio; +- if (sbio->sector_count == 0) { +- ret = fill_writer_pointer_gap(sctx, sector->offset + +- sblock->physical_for_dev_replace); +- if (ret) { +- mutex_unlock(&sctx->wr_lock); +- return ret; +- } +- +- sbio->physical = sblock->physical_for_dev_replace + sector->offset; +- sbio->logical = sblock->logical + sector->offset; +- sbio->dev = sctx->wr_tgtdev; +- if (!sbio->bio) { +- sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, +- REQ_OP_WRITE, GFP_NOFS); +- } +- sbio->bio->bi_private = sbio; +- sbio->bio->bi_end_io = scrub_wr_bio_end_io; +- sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; +- sbio->status = 0; +- } else if (sbio->physical + sbio->sector_count * sectorsize != +- sblock->physical_for_dev_replace + sector->offset || +- sbio->logical + 
sbio->sector_count * sectorsize != +- sblock->logical + sector->offset) { +- scrub_wr_submit(sctx); +- goto again; +- } +- +- ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); +- if (ret != sectorsize) { +- if (sbio->sector_count < 1) { +- bio_put(sbio->bio); +- sbio->bio = NULL; +- mutex_unlock(&sctx->wr_lock); +- return -EIO; +- } +- scrub_wr_submit(sctx); +- goto again; +- } +- +- sbio->sectors[sbio->sector_count] = sector; +- scrub_sector_get(sector); +- /* +- * Since ssector no longer holds a page, but uses sblock::pages, we +- * have to ensure the sblock had not been freed before our write bio +- * finished. +- */ +- scrub_block_get(sector->sblock); +- +- sbio->sector_count++; +- if (sbio->sector_count == sctx->sectors_per_bio) +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); +- +- return 0; +-} +- +-static void scrub_wr_submit(struct scrub_ctx *sctx) +-{ +- struct scrub_bio *sbio; +- +- if (!sctx->wr_curr_bio) +- return; +- +- sbio = sctx->wr_curr_bio; +- sctx->wr_curr_bio = NULL; +- scrub_pending_bio_inc(sctx); +- /* process all writes in a single worker thread. Then the block layer +- * orders the requests before sending them to the driver which +- * doubled the write performance on spinning disks when measured +- * with Linux 3.5 */ +- btrfsic_check_bio(sbio->bio); +- submit_bio(sbio->bio); +- +- if (btrfs_is_zoned(sctx->fs_info)) +- sctx->write_pointer = sbio->physical + sbio->sector_count * +- sctx->fs_info->sectorsize; +-} +- +-static void scrub_wr_bio_end_io(struct bio *bio) +-{ +- struct scrub_bio *sbio = bio->bi_private; +- struct btrfs_fs_info *fs_info = sbio->dev->fs_info; +- +- sbio->status = bio->bi_status; +- sbio->bio = bio; +- +- INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker); +- queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); +-} +- +-static void scrub_wr_bio_end_io_worker(struct work_struct *work) +-{ +- struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); +- struct scrub_ctx *sctx = sbio->sctx; +- int i; +- +- ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); +- if (sbio->status) { +- struct btrfs_dev_replace *dev_replace = +- &sbio->sctx->fs_info->dev_replace; +- +- for (i = 0; i < sbio->sector_count; i++) { +- struct scrub_sector *sector = sbio->sectors[i]; +- +- sector->io_error = 1; +- atomic64_inc(&dev_replace->num_write_errors); +- } +- } +- +- /* +- * In scrub_add_sector_to_wr_bio() we grab extra ref for sblock, now in +- * endio we should put the sblock. +- */ +- for (i = 0; i < sbio->sector_count; i++) { +- scrub_block_put(sbio->sectors[i]->sblock); +- scrub_sector_put(sbio->sectors[i]); +- } +- +- bio_put(sbio->bio); +- kfree(sbio); +- scrub_pending_bio_dec(sctx); +-} +- +-static int scrub_checksum(struct scrub_block *sblock) +-{ +- u64 flags; +- int ret; +- +- /* +- * No need to initialize these stats currently, +- * because this function only use return value +- * instead of these stats value. 
+- * +- * Todo: +- * always use stats +- */ +- sblock->header_error = 0; +- sblock->generation_error = 0; +- sblock->checksum_error = 0; +- +- WARN_ON(sblock->sector_count < 1); +- flags = sblock->sectors[0]->flags; +- ret = 0; +- if (flags & BTRFS_EXTENT_FLAG_DATA) +- ret = scrub_checksum_data(sblock); +- else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) +- ret = scrub_checksum_tree_block(sblock); +- else if (flags & BTRFS_EXTENT_FLAG_SUPER) +- ret = scrub_checksum_super(sblock); +- else +- WARN_ON(1); +- if (ret) +- scrub_handle_errored_block(sblock); +- +- return ret; +-} +- +-static int scrub_checksum_data(struct scrub_block *sblock) +-{ +- struct scrub_ctx *sctx = sblock->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; ++ const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); ++ const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr); ++ const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr); + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); +- u8 csum[BTRFS_CSUM_SIZE]; +- struct scrub_sector *sector; +- char *kaddr; +- +- BUG_ON(sblock->sector_count < 1); +- sector = sblock->sectors[0]; +- if (!sector->have_csum) +- return 0; +- +- kaddr = scrub_sector_get_kaddr(sector); +- +- shash->tfm = fs_info->csum_shash; +- crypto_shash_init(shash); +- +- crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); +- +- if (memcmp(csum, sector->csum, fs_info->csum_size)) +- sblock->checksum_error = 1; +- return sblock->checksum_error; +-} +- +-static int scrub_checksum_tree_block(struct scrub_block *sblock) +-{ +- struct scrub_ctx *sctx = sblock->sctx; +- struct btrfs_header *h; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); +- u8 calculated_csum[BTRFS_CSUM_SIZE]; + u8 on_disk_csum[BTRFS_CSUM_SIZE]; +- /* +- * This is done in sectorsize steps even for metadata as there's a +- * constraint for nodesize to be aligned to sectorsize. This will need +- * to change so we don't misuse data and metadata units like that. +- */ +- const u32 sectorsize = sctx->fs_info->sectorsize; +- const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; +- int i; +- struct scrub_sector *sector; +- char *kaddr; +- +- BUG_ON(sblock->sector_count < 1); +- +- /* Each member in sectors is just one sector */ +- ASSERT(sblock->sector_count == num_sectors); +- +- sector = sblock->sectors[0]; +- kaddr = scrub_sector_get_kaddr(sector); +- h = (struct btrfs_header *)kaddr; +- memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); ++ u8 calculated_csum[BTRFS_CSUM_SIZE]; ++ struct btrfs_header *header; + + /* +- * we don't use the getter functions here, as we +- * a) don't have an extent buffer and +- * b) the page is already kmapped ++ * Here we don't have a good way to attach the pages (and subpages) ++ * to a dummy extent buffer, thus we have to directly grab the members ++ * from pages. 
+ */ +- if (sblock->logical != btrfs_stack_header_bytenr(h)) { +- sblock->header_error = 1; ++ header = (struct btrfs_header *)(page_address(first_page) + first_off); ++ memcpy(on_disk_csum, header->csum, fs_info->csum_size); ++ ++ if (logical != btrfs_stack_header_bytenr(header)) { ++ bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad bytenr, has %llu want %llu", +- sblock->logical, sblock->mirror_num, +- btrfs_stack_header_bytenr(h), +- sblock->logical); +- goto out; ++ logical, stripe->mirror_num, ++ btrfs_stack_header_bytenr(header), logical); ++ return; + } +- +- if (!scrub_check_fsid(h->fsid, sector)) { +- sblock->header_error = 1; ++ if (memcmp(header->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE) != 0) { ++ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad fsid, has %pU want %pU", +- sblock->logical, sblock->mirror_num, +- h->fsid, sblock->dev->fs_devices->fsid); +- goto out; ++ logical, stripe->mirror_num, ++ header->fsid, fs_info->fs_devices->fsid); ++ return; + } +- +- if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { +- sblock->header_error = 1; ++ if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, ++ BTRFS_UUID_SIZE) != 0) { ++ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", +- sblock->logical, sblock->mirror_num, +- h->chunk_tree_uuid, fs_info->chunk_tree_uuid); +- goto out; ++ logical, stripe->mirror_num, ++ header->chunk_tree_uuid, fs_info->chunk_tree_uuid); ++ return; + } + ++ /* Now check tree block csum. 
*/ + shash->tfm = fs_info->csum_shash; + crypto_shash_init(shash); +- crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE, +- sectorsize - BTRFS_CSUM_SIZE); ++ crypto_shash_update(shash, page_address(first_page) + first_off + ++ BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE); ++ ++ for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { ++ struct page *page = scrub_stripe_get_page(stripe, i); ++ unsigned int page_off = scrub_stripe_get_page_offset(stripe, i); + +- for (i = 1; i < num_sectors; i++) { +- kaddr = scrub_sector_get_kaddr(sblock->sectors[i]); +- crypto_shash_update(shash, kaddr, sectorsize); ++ crypto_shash_update(shash, page_address(page) + page_off, ++ fs_info->sectorsize); + } + + crypto_shash_final(shash, calculated_csum); +- if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { +- sblock->checksum_error = 1; ++ if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { ++ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, +- sblock->logical, sblock->mirror_num, ++ logical, stripe->mirror_num, + CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), + CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); +- goto out; ++ return; + } +- +- if (sector->generation != btrfs_stack_header_generation(h)) { +- sblock->header_error = 1; +- sblock->generation_error = 1; ++ if (stripe->sectors[sector_nr].generation != ++ btrfs_stack_header_generation(header)) { ++ bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + btrfs_warn_rl(fs_info, + "tree block %llu mirror %u has bad generation, has %llu want %llu", +- sblock->logical, sblock->mirror_num, +- btrfs_stack_header_generation(h), +- sector->generation); +- } +- +-out: +- return sblock->header_error || sblock->checksum_error; +-} +- +-static int scrub_checksum_super(struct scrub_block *sblock) +-{ +- struct btrfs_super_block *s; +- struct scrub_ctx *sctx = sblock->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); +- u8 calculated_csum[BTRFS_CSUM_SIZE]; +- struct scrub_sector *sector; +- char *kaddr; +- int fail_gen = 0; +- int fail_cor = 0; +- +- BUG_ON(sblock->sector_count < 1); +- sector = sblock->sectors[0]; +- kaddr = scrub_sector_get_kaddr(sector); +- s = (struct btrfs_super_block *)kaddr; +- +- if (sblock->logical != btrfs_super_bytenr(s)) +- ++fail_cor; +- +- if (sector->generation != btrfs_super_generation(s)) +- ++fail_gen; +- +- if (!scrub_check_fsid(s->fsid, sector)) +- ++fail_cor; +- +- shash->tfm = fs_info->csum_shash; +- crypto_shash_init(shash); +- crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE, +- BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum); +- +- if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size)) +- ++fail_cor; +- +- return fail_cor + fail_gen; +-} +- +-static void scrub_block_put(struct scrub_block *sblock) +-{ +- if (refcount_dec_and_test(&sblock->refs)) { +- int i; +- +- if (sblock->sparity) +- scrub_parity_put(sblock->sparity); +- +- for (i = 0; i < sblock->sector_count; i++) +- scrub_sector_put(sblock->sectors[i]); +- for (i = 0; i < DIV_ROUND_UP(sblock->len, PAGE_SIZE); i++) { +- if (sblock->pages[i]) { +- detach_scrub_page_private(sblock->pages[i]); +- __free_page(sblock->pages[i]); +- } +- } +- kfree(sblock); ++ 
logical, stripe->mirror_num, ++ btrfs_stack_header_generation(header), ++ stripe->sectors[sector_nr].generation); ++ return; + } ++ bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); ++ bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + } + +-static void scrub_sector_get(struct scrub_sector *sector) +-{ +- atomic_inc(§or->refs); +-} +- +-static void scrub_sector_put(struct scrub_sector *sector) ++static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) + { +- if (atomic_dec_and_test(§or->refs)) +- kfree(sector); +-} ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; ++ const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; ++ struct page *page = scrub_stripe_get_page(stripe, sector_nr); ++ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); ++ u8 csum_buf[BTRFS_CSUM_SIZE]; ++ int ret; + +-/* +- * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 +- * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. +- */ +-static void scrub_throttle(struct scrub_ctx *sctx) +-{ +- const int time_slice = 1000; +- struct scrub_bio *sbio; +- struct btrfs_device *device; +- s64 delta; +- ktime_t now; +- u32 div; +- u64 bwlimit; ++ ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); + +- sbio = sctx->bios[sctx->curr]; +- device = sbio->dev; +- bwlimit = READ_ONCE(device->scrub_speed_max); +- if (bwlimit == 0) ++ /* Sector not utilized, skip it. */ ++ if (!test_bit(sector_nr, &stripe->extent_sector_bitmap)) + return; + +- /* +- * Slice is divided into intervals when the IO is submitted, adjust by +- * bwlimit and maximum of 64 intervals. +- */ +- div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); +- div = min_t(u32, 64, div); +- +- /* Start new epoch, set deadline */ +- now = ktime_get(); +- if (sctx->throttle_deadline == 0) { +- sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); +- sctx->throttle_sent = 0; +- } ++ /* IO error, no need to check. */ ++ if (test_bit(sector_nr, &stripe->io_error_bitmap)) ++ return; + +- /* Still in the time to send? */ +- if (ktime_before(now, sctx->throttle_deadline)) { +- /* If current bio is within the limit, send it */ +- sctx->throttle_sent += sbio->bio->bi_iter.bi_size; +- if (sctx->throttle_sent <= div_u64(bwlimit, div)) ++ /* Metadata, verify the full tree block. */ ++ if (sector->is_metadata) { ++ /* ++ * Check if the tree block crosses the stripe boudary. If ++ * crossed the boundary, we cannot verify it but only give a ++ * warning. ++ * ++ * This can only happen on a very old filesystem where chunks ++ * are not ensured to be stripe aligned. 
++ */ ++ if (unlikely(sector_nr + sectors_per_tree > stripe->nr_sectors)) { ++ btrfs_warn_rl(fs_info, ++ "tree block at %llu crosses stripe boundary %llu", ++ stripe->logical + ++ (sector_nr << fs_info->sectorsize_bits), ++ stripe->logical); + return; +- +- /* We're over the limit, sleep until the rest of the slice */ +- delta = ktime_ms_delta(sctx->throttle_deadline, now); +- } else { +- /* New request after deadline, start new epoch */ +- delta = 0; +- } +- +- if (delta) { +- long timeout; +- +- timeout = div_u64(delta * HZ, 1000); +- schedule_timeout_interruptible(timeout); +- } +- +- /* Next call will start the deadline period */ +- sctx->throttle_deadline = 0; +-} +- +-static void scrub_submit(struct scrub_ctx *sctx) +-{ +- struct scrub_bio *sbio; +- +- if (sctx->curr == -1) ++ } ++ scrub_verify_one_metadata(stripe, sector_nr); + return; ++ } + +- scrub_throttle(sctx); +- +- sbio = sctx->bios[sctx->curr]; +- sctx->curr = -1; +- scrub_pending_bio_inc(sctx); +- btrfsic_check_bio(sbio->bio); +- submit_bio(sbio->bio); +-} +- +-static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx, +- struct scrub_sector *sector) +-{ +- struct scrub_block *sblock = sector->sblock; +- struct scrub_bio *sbio; +- const u32 sectorsize = sctx->fs_info->sectorsize; +- int ret; +- +-again: + /* +- * grab a fresh bio or wait for one to become available ++ * Data is easier, we just verify the data csum (if we have it). For ++ * cases without csum, we have no other choice but to trust it. + */ +- while (sctx->curr == -1) { +- spin_lock(&sctx->list_lock); +- sctx->curr = sctx->first_free; +- if (sctx->curr != -1) { +- sctx->first_free = sctx->bios[sctx->curr]->next_free; +- sctx->bios[sctx->curr]->next_free = -1; +- sctx->bios[sctx->curr]->sector_count = 0; +- spin_unlock(&sctx->list_lock); +- } else { +- spin_unlock(&sctx->list_lock); +- wait_event(sctx->list_wait, sctx->first_free != -1); +- } +- } +- sbio = sctx->bios[sctx->curr]; +- if (sbio->sector_count == 0) { +- sbio->physical = sblock->physical + sector->offset; +- sbio->logical = sblock->logical + sector->offset; +- sbio->dev = sblock->dev; +- if (!sbio->bio) { +- sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, +- REQ_OP_READ, GFP_NOFS); +- } +- sbio->bio->bi_private = sbio; +- sbio->bio->bi_end_io = scrub_bio_end_io; +- sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; +- sbio->status = 0; +- } else if (sbio->physical + sbio->sector_count * sectorsize != +- sblock->physical + sector->offset || +- sbio->logical + sbio->sector_count * sectorsize != +- sblock->logical + sector->offset || +- sbio->dev != sblock->dev) { +- scrub_submit(sctx); +- goto again; ++ if (!sector->csum) { ++ clear_bit(sector_nr, &stripe->error_bitmap); ++ return; + } + +- sbio->sectors[sbio->sector_count] = sector; +- ret = bio_add_scrub_sector(sbio->bio, sector, sectorsize); +- if (ret != sectorsize) { +- if (sbio->sector_count < 1) { +- bio_put(sbio->bio); +- sbio->bio = NULL; +- return -EIO; +- } +- scrub_submit(sctx); +- goto again; ++ ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum); ++ if (ret < 0) { ++ set_bit(sector_nr, &stripe->csum_error_bitmap); ++ set_bit(sector_nr, &stripe->error_bitmap); ++ } else { ++ clear_bit(sector_nr, &stripe->csum_error_bitmap); ++ clear_bit(sector_nr, &stripe->error_bitmap); + } +- +- scrub_block_get(sblock); /* one for the page added to the bio */ +- atomic_inc(&sblock->outstanding_sectors); +- sbio->sector_count++; +- if (sbio->sector_count == sctx->sectors_per_bio) +- scrub_submit(sctx); +- +- 
return 0; +-} +- +-static void scrub_missing_raid56_end_io(struct bio *bio) +-{ +- struct scrub_block *sblock = bio->bi_private; +- struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; +- +- btrfs_bio_counter_dec(fs_info); +- if (bio->bi_status) +- sblock->no_io_error_seen = 0; +- +- bio_put(bio); +- +- queue_work(fs_info->scrub_workers, &sblock->work); + } + +-static void scrub_missing_raid56_worker(struct work_struct *work) ++/* Verify specified sectors of a stripe. */ ++static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long bitmap) + { +- struct scrub_block *sblock = container_of(work, struct scrub_block, work); +- struct scrub_ctx *sctx = sblock->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- u64 logical; +- struct btrfs_device *dev; +- +- logical = sblock->logical; +- dev = sblock->dev; +- +- if (sblock->no_io_error_seen) +- scrub_recheck_block_checksum(sblock); +- +- if (!sblock->no_io_error_seen) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.read_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_err_rl_in_rcu(fs_info, +- "IO error rebuilding logical %llu for dev %s", +- logical, btrfs_dev_name(dev)); +- } else if (sblock->header_error || sblock->checksum_error) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_err_rl_in_rcu(fs_info, +- "failed to rebuild valid logical %llu for dev %s", +- logical, btrfs_dev_name(dev)); +- } else { +- scrub_write_block_to_dev_replace(sblock); +- } ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; ++ int sector_nr; + +- if (sctx->is_dev_replace && sctx->flush_all_writes) { +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); ++ for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) { ++ scrub_verify_one_sector(stripe, sector_nr); ++ if (stripe->sectors[sector_nr].is_metadata) ++ sector_nr += sectors_per_tree - 1; + } +- +- scrub_block_put(sblock); +- scrub_pending_bio_dec(sctx); + } + +-static void scrub_missing_raid56_pages(struct scrub_block *sblock) ++static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first_bvec) + { +- struct scrub_ctx *sctx = sblock->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- u64 length = sblock->sector_count << fs_info->sectorsize_bits; +- u64 logical = sblock->logical; +- struct btrfs_io_context *bioc = NULL; +- struct bio *bio; +- struct btrfs_raid_bio *rbio; +- int ret; + int i; + +- btrfs_bio_counter_inc_blocked(fs_info); +- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, +- &length, &bioc); +- if (ret || !bioc || !bioc->raid_map) +- goto bioc_out; +- +- if (WARN_ON(!sctx->is_dev_replace || +- !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { +- /* +- * We shouldn't be scrubbing a missing device. Even for dev +- * replace, we should only get here for RAID 5/6. We either +- * managed to mount something with no mirrors remaining or +- * there's a bug in scrub_find_good_copy()/btrfs_map_block(). 
+- */ +- goto bioc_out; +- } +- +- bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); +- bio->bi_iter.bi_sector = logical >> 9; +- bio->bi_private = sblock; +- bio->bi_end_io = scrub_missing_raid56_end_io; +- +- rbio = raid56_alloc_missing_rbio(bio, bioc); +- if (!rbio) +- goto rbio_out; +- +- for (i = 0; i < sblock->sector_count; i++) { +- struct scrub_sector *sector = sblock->sectors[i]; +- +- raid56_add_scrub_pages(rbio, scrub_sector_get_page(sector), +- scrub_sector_get_page_offset(sector), +- sector->offset + sector->sblock->logical); ++ for (i = 0; i < stripe->nr_sectors; i++) { ++ if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page && ++ scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset) ++ break; + } +- +- INIT_WORK(&sblock->work, scrub_missing_raid56_worker); +- scrub_block_get(sblock); +- scrub_pending_bio_inc(sctx); +- raid56_submit_missing_rbio(rbio); +- btrfs_put_bioc(bioc); +- return; +- +-rbio_out: +- bio_put(bio); +-bioc_out: +- btrfs_bio_counter_dec(fs_info); +- btrfs_put_bioc(bioc); +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); ++ ASSERT(i < stripe->nr_sectors); ++ return i; + } + +-static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, +- u64 physical, struct btrfs_device *dev, u64 flags, +- u64 gen, int mirror_num, u8 *csum, +- u64 physical_for_dev_replace) ++/* ++ * Repair read is different to the regular read: ++ * ++ * - Only reads the failed sectors ++ * - May have extra blocksize limits ++ */ ++static void scrub_repair_read_endio(struct btrfs_bio *bbio) + { +- struct scrub_block *sblock; +- const u32 sectorsize = sctx->fs_info->sectorsize; +- int index; +- +- sblock = alloc_scrub_block(sctx, dev, logical, physical, +- physical_for_dev_replace, mirror_num); +- if (!sblock) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- return -ENOMEM; +- } ++ struct scrub_stripe *stripe = bbio->private; ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ struct bio_vec *bvec; ++ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); ++ u32 bio_size = 0; ++ int i; + +- for (index = 0; len > 0; index++) { +- struct scrub_sector *sector; +- /* +- * Here we will allocate one page for one sector to scrub. +- * This is fine if PAGE_SIZE == sectorsize, but will cost +- * more memory for PAGE_SIZE > sectorsize case. +- */ +- u32 l = min(sectorsize, len); ++ ASSERT(sector_nr < stripe->nr_sectors); + +- sector = alloc_scrub_sector(sblock, logical); +- if (!sector) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- scrub_block_put(sblock); +- return -ENOMEM; +- } +- sector->flags = flags; +- sector->generation = gen; +- if (csum) { +- sector->have_csum = 1; +- memcpy(sector->csum, csum, sctx->fs_info->csum_size); +- } else { +- sector->have_csum = 0; +- } +- len -= l; +- logical += l; +- physical += l; +- physical_for_dev_replace += l; +- } ++ bio_for_each_bvec_all(bvec, &bbio->bio, i) ++ bio_size += bvec->bv_len; + +- WARN_ON(sblock->sector_count == 0); +- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { +- /* +- * This case should only be hit for RAID 5/6 device replace. See +- * the comment in scrub_missing_raid56_pages() for details. 
+- */ +- scrub_missing_raid56_pages(sblock); ++ if (bbio->bio.bi_status) { ++ bitmap_set(&stripe->io_error_bitmap, sector_nr, ++ bio_size >> fs_info->sectorsize_bits); ++ bitmap_set(&stripe->error_bitmap, sector_nr, ++ bio_size >> fs_info->sectorsize_bits); + } else { +- for (index = 0; index < sblock->sector_count; index++) { +- struct scrub_sector *sector = sblock->sectors[index]; +- int ret; +- +- ret = scrub_add_sector_to_rd_bio(sctx, sector); +- if (ret) { +- scrub_block_put(sblock); +- return ret; +- } +- } +- +- if (flags & BTRFS_EXTENT_FLAG_SUPER) +- scrub_submit(sctx); ++ bitmap_clear(&stripe->io_error_bitmap, sector_nr, ++ bio_size >> fs_info->sectorsize_bits); + } +- +- /* last one frees, either here or in bio completion for last page */ +- scrub_block_put(sblock); +- return 0; ++ bio_put(&bbio->bio); ++ if (atomic_dec_and_test(&stripe->pending_io)) ++ wake_up(&stripe->io_wait); + } + +-static void scrub_bio_end_io(struct bio *bio) ++static int calc_next_mirror(int mirror, int num_copies) + { +- struct scrub_bio *sbio = bio->bi_private; +- struct btrfs_fs_info *fs_info = sbio->dev->fs_info; +- +- sbio->status = bio->bi_status; +- sbio->bio = bio; +- +- queue_work(fs_info->scrub_workers, &sbio->work); ++ ASSERT(mirror <= num_copies); ++ return (mirror + 1 > num_copies) ? 1 : mirror + 1; + } + +-static void scrub_bio_end_io_worker(struct work_struct *work) ++static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, ++ int mirror, int blocksize, bool wait) + { +- struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); +- struct scrub_ctx *sctx = sbio->sctx; ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ struct btrfs_bio *bbio = NULL; ++ const unsigned long old_error_bitmap = stripe->error_bitmap; + int i; + +- ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); +- if (sbio->status) { +- for (i = 0; i < sbio->sector_count; i++) { +- struct scrub_sector *sector = sbio->sectors[i]; ++ ASSERT(stripe->mirror_num >= 1); ++ ASSERT(atomic_read(&stripe->pending_io) == 0); ++ ++ for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { ++ struct page *page; ++ int pgoff; ++ int ret; + +- sector->io_error = 1; +- sector->sblock->no_io_error_seen = 0; ++ page = scrub_stripe_get_page(stripe, i); ++ pgoff = scrub_stripe_get_page_offset(stripe, i); ++ ++ /* The current sector cannot be merged, submit the bio. 
*/ ++ if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) || ++ bbio->bio.bi_iter.bi_size >= blocksize)) { ++ ASSERT(bbio->bio.bi_iter.bi_size); ++ atomic_inc(&stripe->pending_io); ++ btrfs_submit_bio(bbio, mirror); ++ if (wait) ++ wait_scrub_stripe_io(stripe); ++ bbio = NULL; + } +- } + +- /* Now complete the scrub_block items that have all pages completed */ +- for (i = 0; i < sbio->sector_count; i++) { +- struct scrub_sector *sector = sbio->sectors[i]; +- struct scrub_block *sblock = sector->sblock; ++ if (!bbio) { ++ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, ++ fs_info, scrub_repair_read_endio, stripe); ++ bbio->bio.bi_iter.bi_sector = (stripe->logical + ++ (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; ++ } + +- if (atomic_dec_and_test(&sblock->outstanding_sectors)) +- scrub_block_complete(sblock); +- scrub_block_put(sblock); ++ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); ++ ASSERT(ret == fs_info->sectorsize); + } +- +- bio_put(sbio->bio); +- sbio->bio = NULL; +- spin_lock(&sctx->list_lock); +- sbio->next_free = sctx->first_free; +- sctx->first_free = sbio->index; +- spin_unlock(&sctx->list_lock); +- +- if (sctx->is_dev_replace && sctx->flush_all_writes) { +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); ++ if (bbio) { ++ ASSERT(bbio->bio.bi_iter.bi_size); ++ atomic_inc(&stripe->pending_io); ++ btrfs_submit_bio(bbio, mirror); ++ if (wait) ++ wait_scrub_stripe_io(stripe); + } +- +- scrub_pending_bio_dec(sctx); + } + +-static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, +- unsigned long *bitmap, +- u64 start, u32 len) ++static void scrub_stripe_report_errors(struct scrub_ctx *sctx, ++ struct scrub_stripe *stripe) + { +- u64 offset; +- u32 nsectors; +- u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits; +- +- if (len >= sparity->stripe_len) { +- bitmap_set(bitmap, 0, sparity->nsectors); ++ static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, ++ DEFAULT_RATELIMIT_BURST); ++ struct btrfs_fs_info *fs_info = sctx->fs_info; ++ struct btrfs_device *dev = NULL; ++ u64 physical = 0; ++ int nr_data_sectors = 0; ++ int nr_meta_sectors = 0; ++ int nr_nodatacsum_sectors = 0; ++ int nr_repaired_sectors = 0; ++ int sector_nr; ++ ++ if (test_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state)) + return; +- } + +- start -= sparity->logic_start; +- start = div64_u64_rem(start, sparity->stripe_len, &offset); +- offset = offset >> sectorsize_bits; +- nsectors = len >> sectorsize_bits; ++ /* ++ * Init needed infos for error reporting. ++ * ++ * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio() ++ * thus no need for dev/physical, error reporting still needs dev and physical. ++ */ ++ if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { ++ u64 mapped_len = fs_info->sectorsize; ++ struct btrfs_io_context *bioc = NULL; ++ int stripe_index = stripe->mirror_num - 1; ++ int ret; + +- if (offset + nsectors <= sparity->nsectors) { +- bitmap_set(bitmap, offset, nsectors); +- return; ++ /* For scrub, our mirror_num should always start at 1. */ ++ ASSERT(stripe->mirror_num >= 1); ++ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, ++ stripe->logical, &mapped_len, &bioc); ++ /* ++ * If we failed, dev will be NULL, and later detailed reports ++ * will just be skipped. 
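++ *
++ * For illustration, when dev stays NULL the per-sector messages
++ * below fall back to the "on mirror %u" form instead of the
++ * "on dev %s physical %llu" form, so nothing is lost beyond the
++ * device name and physical offset.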
++ */ ++ if (ret < 0) ++ goto skip; ++ physical = bioc->stripes[stripe_index].physical; ++ dev = bioc->stripes[stripe_index].dev; ++ btrfs_put_bioc(bioc); + } + +- bitmap_set(bitmap, offset, sparity->nsectors - offset); +- bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset)); +-} ++skip: ++ for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { ++ bool repaired = false; + +-static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, +- u64 start, u32 len) +-{ +- __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len); +-} ++ if (stripe->sectors[sector_nr].is_metadata) { ++ nr_meta_sectors++; ++ } else { ++ nr_data_sectors++; ++ if (!stripe->sectors[sector_nr].csum) ++ nr_nodatacsum_sectors++; ++ } + +-static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, +- u64 start, u32 len) +-{ +- __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len); +-} ++ if (test_bit(sector_nr, &stripe->init_error_bitmap) && ++ !test_bit(sector_nr, &stripe->error_bitmap)) { ++ nr_repaired_sectors++; ++ repaired = true; ++ } + +-static void scrub_block_complete(struct scrub_block *sblock) +-{ +- int corrupted = 0; ++ /* Good sector from the beginning, nothing need to be done. */ ++ if (!test_bit(sector_nr, &stripe->init_error_bitmap)) ++ continue; + +- if (!sblock->no_io_error_seen) { +- corrupted = 1; +- scrub_handle_errored_block(sblock); +- } else { + /* +- * if has checksum error, write via repair mechanism in +- * dev replace case, otherwise write here in dev replace +- * case. ++ * Report error for the corrupted sectors. If repaired, just ++ * output the message of repaired message. + */ +- corrupted = scrub_checksum(sblock); +- if (!corrupted && sblock->sctx->is_dev_replace) +- scrub_write_block_to_dev_replace(sblock); +- } ++ if (repaired) { ++ if (dev) { ++ btrfs_err_rl_in_rcu(fs_info, ++ "fixed up error at logical %llu on dev %s physical %llu", ++ stripe->logical, btrfs_dev_name(dev), ++ physical); ++ } else { ++ btrfs_err_rl_in_rcu(fs_info, ++ "fixed up error at logical %llu on mirror %u", ++ stripe->logical, stripe->mirror_num); ++ } ++ continue; ++ } + +- if (sblock->sparity && corrupted && !sblock->data_corrected) { +- u64 start = sblock->logical; +- u64 end = sblock->logical + +- sblock->sectors[sblock->sector_count - 1]->offset + +- sblock->sctx->fs_info->sectorsize; ++ /* The remaining are all for unrepaired. 
*/ ++ if (dev) { ++ btrfs_err_rl_in_rcu(fs_info, ++ "unable to fixup (regular) error at logical %llu on dev %s physical %llu", ++ stripe->logical, btrfs_dev_name(dev), ++ physical); ++ } else { ++ btrfs_err_rl_in_rcu(fs_info, ++ "unable to fixup (regular) error at logical %llu on mirror %u", ++ stripe->logical, stripe->mirror_num); ++ } + +- ASSERT(end - start <= U32_MAX); +- scrub_parity_mark_sectors_error(sblock->sparity, +- start, end - start); ++ if (test_bit(sector_nr, &stripe->io_error_bitmap)) ++ if (__ratelimit(&rs) && dev) ++ scrub_print_common_warning("i/o error", dev, false, ++ stripe->logical, physical); ++ if (test_bit(sector_nr, &stripe->csum_error_bitmap)) ++ if (__ratelimit(&rs) && dev) ++ scrub_print_common_warning("checksum error", dev, false, ++ stripe->logical, physical); ++ if (test_bit(sector_nr, &stripe->meta_error_bitmap)) ++ if (__ratelimit(&rs) && dev) ++ scrub_print_common_warning("header error", dev, false, ++ stripe->logical, physical); + } +-} + +-static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum) +-{ +- sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits; +- list_del(&sum->list); +- kfree(sum); ++ spin_lock(&sctx->stat_lock); ++ sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; ++ sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; ++ sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; ++ sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; ++ sctx->stat.no_csum += nr_nodatacsum_sectors; ++ sctx->stat.read_errors += ++ bitmap_weight(&stripe->io_error_bitmap, stripe->nr_sectors); ++ sctx->stat.csum_errors += ++ bitmap_weight(&stripe->csum_error_bitmap, stripe->nr_sectors); ++ sctx->stat.verify_errors += ++ bitmap_weight(&stripe->meta_error_bitmap, stripe->nr_sectors); ++ sctx->stat.uncorrectable_errors += ++ bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); ++ sctx->stat.corrected_errors += nr_repaired_sectors; ++ spin_unlock(&sctx->stat_lock); + } + + /* +- * Find the desired csum for range [logical, logical + sectorsize), and store +- * the csum into @csum. ++ * The main entrance for all read related scrub work, including: + * +- * The search source is sctx->csum_list, which is a pre-populated list +- * storing bytenr ordered csum ranges. We're responsible to cleanup any range +- * that is before @logical. ++ * - Wait for the initial read to finish ++ * - Verify and locate any bad sectors ++ * - Go through the remaining mirrors and try to read as large blocksize as ++ * possible ++ * - Go through all mirrors (including the failed mirror) sector-by-sector + * +- * Return 0 if there is no csum for the range. +- * Return 1 if there is csum for the range and copied to @csum. ++ * Writeback does not happen here, it needs extra synchronization. 
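++ *
++ * As a rough illustration (assuming a two-copy chunk whose initial
++ * read used mirror 1):
++ *
++ * 1) Verify the data returned by the initial mirror 1 read
++ * 2) Re-read only the failed sectors from mirror 2, still at
++ *    BTRFS_STRIPE_LEN granularity, and verify again
++ * 3) As a last resort, re-read mirror 1 and then mirror 2
++ *    sector-by-sector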
+ */ +-static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) ++static void scrub_stripe_read_repair_worker(struct work_struct *work) + { +- bool found = false; +- +- while (!list_empty(&sctx->csum_list)) { +- struct btrfs_ordered_sum *sum = NULL; +- unsigned long index; +- unsigned long num_sectors; +- +- sum = list_first_entry(&sctx->csum_list, +- struct btrfs_ordered_sum, list); +- /* The current csum range is beyond our range, no csum found */ +- if (sum->bytenr > logical) +- break; +- +- /* +- * The current sum is before our bytenr, since scrub is always +- * done in bytenr order, the csum will never be used anymore, +- * clean it up so that later calls won't bother with the range, +- * and continue search the next range. +- */ +- if (sum->bytenr + sum->len <= logical) { +- drop_csum_range(sctx, sum); +- continue; +- } +- +- /* Now the csum range covers our bytenr, copy the csum */ +- found = true; +- index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits; +- num_sectors = sum->len >> sctx->fs_info->sectorsize_bits; ++ struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, ++ stripe->bg->length); ++ int mirror; ++ int i; + +- memcpy(csum, sum->sums + index * sctx->fs_info->csum_size, +- sctx->fs_info->csum_size); ++ ASSERT(stripe->mirror_num > 0); + +- /* Cleanup the range if we're at the end of the csum range */ +- if (index == num_sectors - 1) +- drop_csum_range(sctx, sum); +- break; +- } +- if (!found) +- return 0; +- return 1; +-} ++ wait_scrub_stripe_io(stripe); ++ scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); ++ /* Save the initial failed bitmap for later repair and report usage. */ ++ stripe->init_error_bitmap = stripe->error_bitmap; + +-/* scrub extent tries to collect up to 64 kB for each bio */ +-static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, +- u64 logical, u32 len, +- u64 physical, struct btrfs_device *dev, u64 flags, +- u64 gen, int mirror_num) +-{ +- struct btrfs_device *src_dev = dev; +- u64 src_physical = physical; +- int src_mirror = mirror_num; +- int ret; +- u8 csum[BTRFS_CSUM_SIZE]; +- u32 blocksize; ++ if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) ++ goto out; + +- if (flags & BTRFS_EXTENT_FLAG_DATA) { +- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) +- blocksize = map->stripe_len; +- else +- blocksize = sctx->fs_info->sectorsize; +- spin_lock(&sctx->stat_lock); +- sctx->stat.data_extents_scrubbed++; +- sctx->stat.data_bytes_scrubbed += len; +- spin_unlock(&sctx->stat_lock); +- } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { +- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) +- blocksize = map->stripe_len; +- else +- blocksize = sctx->fs_info->nodesize; +- spin_lock(&sctx->stat_lock); +- sctx->stat.tree_extents_scrubbed++; +- sctx->stat.tree_bytes_scrubbed += len; +- spin_unlock(&sctx->stat_lock); +- } else { +- blocksize = sctx->fs_info->sectorsize; +- WARN_ON(1); ++ /* ++ * Try all remaining mirrors. ++ * ++ * Here we still try to read as large block as possible, as this is ++ * faster and we have extra safety nets to rely on. 
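++ *
++ * The rotation is provided by calc_next_mirror() above, e.g. with
++ * num_copies == 3 and mirror_num == 2 the loop below visits mirror 3
++ * and then mirror 1 before falling through to the per-sector pass.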
++ */ ++ for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); ++ mirror != stripe->mirror_num; ++ mirror = calc_next_mirror(mirror, num_copies)) { ++ const unsigned long old_error_bitmap = stripe->error_bitmap; ++ ++ scrub_stripe_submit_repair_read(stripe, mirror, ++ BTRFS_STRIPE_LEN, false); ++ wait_scrub_stripe_io(stripe); ++ scrub_verify_one_stripe(stripe, old_error_bitmap); ++ if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) ++ goto out; + } + + /* +- * For dev-replace case, we can have @dev being a missing device. +- * Regular scrub will avoid its execution on missing device at all, +- * as that would trigger tons of read error. ++ * Last safety net, try re-checking all mirrors, including the failed ++ * one, sector-by-sector. + * +- * Reading from missing device will cause read error counts to +- * increase unnecessarily. +- * So here we change the read source to a good mirror. ++ * As if one sector failed the drive's internal csum, the whole read ++ * containing the offending sector would be marked as error. ++ * Thus here we do sector-by-sector read. ++ * ++ * This can be slow, thus we only try it as the last resort. + */ +- if (sctx->is_dev_replace && !dev->bdev) +- scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical, +- &src_dev, &src_mirror); +- while (len) { +- u32 l = min(len, blocksize); +- int have_csum = 0; +- +- if (flags & BTRFS_EXTENT_FLAG_DATA) { +- /* push csums to sbio */ +- have_csum = scrub_find_csum(sctx, logical, csum); +- if (have_csum == 0) +- ++sctx->stat.no_csum; +- } +- ret = scrub_sectors(sctx, logical, l, src_physical, src_dev, +- flags, gen, src_mirror, +- have_csum ? csum : NULL, physical); +- if (ret) +- return ret; +- len -= l; +- logical += l; +- physical += l; +- src_physical += l; ++ ++ for (i = 0, mirror = stripe->mirror_num; ++ i < num_copies; ++ i++, mirror = calc_next_mirror(mirror, num_copies)) { ++ const unsigned long old_error_bitmap = stripe->error_bitmap; ++ ++ scrub_stripe_submit_repair_read(stripe, mirror, ++ fs_info->sectorsize, true); ++ wait_scrub_stripe_io(stripe); ++ scrub_verify_one_stripe(stripe, old_error_bitmap); ++ if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) ++ goto out; + } +- return 0; ++out: ++ scrub_stripe_report_errors(stripe->sctx, stripe); ++ set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); ++ wake_up(&stripe->repair_wait); + } + +-static int scrub_sectors_for_parity(struct scrub_parity *sparity, +- u64 logical, u32 len, +- u64 physical, struct btrfs_device *dev, +- u64 flags, u64 gen, int mirror_num, u8 *csum) ++static void scrub_read_endio(struct btrfs_bio *bbio) + { +- struct scrub_ctx *sctx = sparity->sctx; +- struct scrub_block *sblock; +- const u32 sectorsize = sctx->fs_info->sectorsize; +- int index; +- +- ASSERT(IS_ALIGNED(len, sectorsize)); ++ struct scrub_stripe *stripe = bbio->private; + +- sblock = alloc_scrub_block(sctx, dev, logical, physical, physical, mirror_num); +- if (!sblock) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- return -ENOMEM; ++ if (bbio->bio.bi_status) { ++ bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); ++ bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); ++ } else { ++ bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); ++ } ++ bio_put(&bbio->bio); ++ if (atomic_dec_and_test(&stripe->pending_io)) { ++ wake_up(&stripe->io_wait); ++ INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); ++ queue_work(stripe->bg->fs_info->scrub_workers, 
&stripe->work); + } ++} + +- sblock->sparity = sparity; +- scrub_parity_get(sparity); ++static void scrub_write_endio(struct btrfs_bio *bbio) ++{ ++ struct scrub_stripe *stripe = bbio->private; ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ struct bio_vec *bvec; ++ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); ++ u32 bio_size = 0; ++ int i; + +- for (index = 0; len > 0; index++) { +- struct scrub_sector *sector; ++ bio_for_each_bvec_all(bvec, &bbio->bio, i) ++ bio_size += bvec->bv_len; + +- sector = alloc_scrub_sector(sblock, logical); +- if (!sector) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- scrub_block_put(sblock); +- return -ENOMEM; +- } +- sblock->sectors[index] = sector; +- /* For scrub parity */ +- scrub_sector_get(sector); +- list_add_tail(§or->list, &sparity->sectors_list); +- sector->flags = flags; +- sector->generation = gen; +- if (csum) { +- sector->have_csum = 1; +- memcpy(sector->csum, csum, sctx->fs_info->csum_size); +- } else { +- sector->have_csum = 0; +- } ++ if (bbio->bio.bi_status) { ++ unsigned long flags; + +- /* Iterate over the stripe range in sectorsize steps */ +- len -= sectorsize; +- logical += sectorsize; +- physical += sectorsize; ++ spin_lock_irqsave(&stripe->write_error_lock, flags); ++ bitmap_set(&stripe->write_error_bitmap, sector_nr, ++ bio_size >> fs_info->sectorsize_bits); ++ spin_unlock_irqrestore(&stripe->write_error_lock, flags); + } ++ bio_put(&bbio->bio); ++ ++ if (atomic_dec_and_test(&stripe->pending_io)) ++ wake_up(&stripe->io_wait); ++} ++ ++/* ++ * Submit the write bio(s) for the sectors specified by @write_bitmap. ++ * ++ * Here we utilize btrfs_submit_repair_write(), which has some extra benefits: ++ * ++ * - Only needs logical bytenr and mirror_num ++ * Just like the scrub read path ++ * ++ * - Would only result in writes to the specified mirror ++ * Unlike the regular writeback path, which would write back to all stripes ++ * ++ * - Handle dev-replace and read-repair writeback differently ++ */ ++static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, ++ unsigned long write_bitmap, bool dev_replace) ++{ ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ struct btrfs_bio *bbio = NULL; ++ const bool zoned = btrfs_is_zoned(fs_info); ++ int sector_nr; + +- WARN_ON(sblock->sector_count == 0); +- for (index = 0; index < sblock->sector_count; index++) { +- struct scrub_sector *sector = sblock->sectors[index]; ++ for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { ++ struct page *page = scrub_stripe_get_page(stripe, sector_nr); ++ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + int ret; + +- ret = scrub_add_sector_to_rd_bio(sctx, sector); +- if (ret) { +- scrub_block_put(sblock); +- return ret; ++ /* We should only writeback sectors covered by an extent. */ ++ ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap)); ++ ++ /* Cannot merge with previous sector, submit the current one. */ ++ if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { ++ fill_writer_pointer_gap(sctx, stripe->physical + ++ (sector_nr << fs_info->sectorsize_bits)); ++ atomic_inc(&stripe->pending_io); ++ btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); ++ /* For zoned writeback, queue depth must be 1. 
*/ ++ if (zoned) ++ wait_scrub_stripe_io(stripe); ++ bbio = NULL; + } ++ if (!bbio) { ++ bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_WRITE, ++ fs_info, scrub_write_endio, stripe); ++ bbio->bio.bi_iter.bi_sector = (stripe->logical + ++ (sector_nr << fs_info->sectorsize_bits)) >> ++ SECTOR_SHIFT; ++ } ++ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); ++ ASSERT(ret == fs_info->sectorsize); ++ } ++ if (bbio) { ++ fill_writer_pointer_gap(sctx, bbio->bio.bi_iter.bi_sector << ++ SECTOR_SHIFT); ++ atomic_inc(&stripe->pending_io); ++ btrfs_submit_repair_write(bbio, stripe->mirror_num, dev_replace); ++ if (zoned) ++ wait_scrub_stripe_io(stripe); + } +- +- /* Last one frees, either here or in bio completion for last sector */ +- scrub_block_put(sblock); +- return 0; + } + +-static int scrub_extent_for_parity(struct scrub_parity *sparity, +- u64 logical, u32 len, +- u64 physical, struct btrfs_device *dev, +- u64 flags, u64 gen, int mirror_num) ++/* ++ * Throttling of IO submission, bandwidth-limit based, the timeslice is 1 ++ * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max. ++ */ ++static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *device, ++ unsigned int bio_size) + { +- struct scrub_ctx *sctx = sparity->sctx; +- int ret; +- u8 csum[BTRFS_CSUM_SIZE]; +- u32 blocksize; ++ const int time_slice = 1000; ++ s64 delta; ++ ktime_t now; ++ u32 div; ++ u64 bwlimit; + +- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { +- scrub_parity_mark_sectors_error(sparity, logical, len); +- return 0; ++ bwlimit = READ_ONCE(device->scrub_speed_max); ++ if (bwlimit == 0) ++ return; ++ ++ /* ++ * Slice is divided into intervals when the IO is submitted, adjust by ++ * bwlimit and maximum of 64 intervals. ++ */ ++ div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024))); ++ div = min_t(u32, 64, div); ++ ++ /* Start new epoch, set deadline */ ++ now = ktime_get(); ++ if (sctx->throttle_deadline == 0) { ++ sctx->throttle_deadline = ktime_add_ms(now, time_slice / div); ++ sctx->throttle_sent = 0; + } + +- if (flags & BTRFS_EXTENT_FLAG_DATA) { +- blocksize = sparity->stripe_len; +- } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { +- blocksize = sparity->stripe_len; ++ /* Still in the time to send? */ ++ if (ktime_before(now, sctx->throttle_deadline)) { ++ /* If current bio is within the limit, send it */ ++ sctx->throttle_sent += bio_size; ++ if (sctx->throttle_sent <= div_u64(bwlimit, div)) ++ return; ++ ++ /* We're over the limit, sleep until the rest of the slice */ ++ delta = ktime_ms_delta(sctx->throttle_deadline, now); + } else { +- blocksize = sctx->fs_info->sectorsize; +- WARN_ON(1); ++ /* New request after deadline, start new epoch */ ++ delta = 0; + } + +- while (len) { +- u32 l = min(len, blocksize); +- int have_csum = 0; ++ if (delta) { ++ long timeout; + +- if (flags & BTRFS_EXTENT_FLAG_DATA) { +- /* push csums to sbio */ +- have_csum = scrub_find_csum(sctx, logical, csum); +- if (have_csum == 0) +- goto skip; +- } +- ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev, +- flags, gen, mirror_num, +- have_csum ? 
csum : NULL); +- if (ret) +- return ret; +-skip: +- len -= l; +- logical += l; +- physical += l; ++ timeout = div_u64(delta * HZ, 1000); ++ schedule_timeout_interruptible(timeout); + } +- return 0; ++ ++ /* Next call will start the deadline period */ ++ sctx->throttle_deadline = 0; + } + + /* +@@ -2908,10 +1266,7 @@ static int get_raid56_logic_offset(u64 physical, int num, + { + int i; + int j = 0; +- u64 stripe_nr; + u64 last_offset; +- u32 stripe_index; +- u32 rot; + const int data_stripes = nr_data_stripes(map); + + last_offset = (physical - map->stripes[num].physical) * data_stripes; +@@ -2920,13 +1275,17 @@ static int get_raid56_logic_offset(u64 physical, int num, + + *offset = last_offset; + for (i = 0; i < data_stripes; i++) { +- *offset = last_offset + i * map->stripe_len; ++ u32 stripe_nr; ++ u32 stripe_index; ++ u32 rot; + +- stripe_nr = div64_u64(*offset, map->stripe_len); +- stripe_nr = div_u64(stripe_nr, data_stripes); ++ *offset = last_offset + (i << BTRFS_STRIPE_LEN_SHIFT); ++ ++ stripe_nr = (u32)(*offset >> BTRFS_STRIPE_LEN_SHIFT) / data_stripes; + + /* Work out the disk rotation on this stripe-set */ +- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); ++ rot = stripe_nr % map->num_stripes; ++ stripe_nr /= map->num_stripes; + /* calculate which stripe this data locates */ + rot += i; + stripe_index = rot % map->num_stripes; +@@ -2935,123 +1294,10 @@ static int get_raid56_logic_offset(u64 physical, int num, + if (stripe_index < num) + j++; + } +- *offset = last_offset + j * map->stripe_len; ++ *offset = last_offset + (j << BTRFS_STRIPE_LEN_SHIFT); + return 1; + } + +-static void scrub_free_parity(struct scrub_parity *sparity) +-{ +- struct scrub_ctx *sctx = sparity->sctx; +- struct scrub_sector *curr, *next; +- int nbits; +- +- nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors); +- if (nbits) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.read_errors += nbits; +- sctx->stat.uncorrectable_errors += nbits; +- spin_unlock(&sctx->stat_lock); +- } +- +- list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) { +- list_del_init(&curr->list); +- scrub_sector_put(curr); +- } +- +- kfree(sparity); +-} +- +-static void scrub_parity_bio_endio_worker(struct work_struct *work) +-{ +- struct scrub_parity *sparity = container_of(work, struct scrub_parity, +- work); +- struct scrub_ctx *sctx = sparity->sctx; +- +- btrfs_bio_counter_dec(sctx->fs_info); +- scrub_free_parity(sparity); +- scrub_pending_bio_dec(sctx); +-} +- +-static void scrub_parity_bio_endio(struct bio *bio) +-{ +- struct scrub_parity *sparity = bio->bi_private; +- struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; +- +- if (bio->bi_status) +- bitmap_or(&sparity->ebitmap, &sparity->ebitmap, +- &sparity->dbitmap, sparity->nsectors); +- +- bio_put(bio); +- +- INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker); +- queue_work(fs_info->scrub_parity_workers, &sparity->work); +-} +- +-static void scrub_parity_check_and_repair(struct scrub_parity *sparity) +-{ +- struct scrub_ctx *sctx = sparity->sctx; +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- struct bio *bio; +- struct btrfs_raid_bio *rbio; +- struct btrfs_io_context *bioc = NULL; +- u64 length; +- int ret; +- +- if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap, +- &sparity->ebitmap, sparity->nsectors)) +- goto out; +- +- length = sparity->logic_end - sparity->logic_start; +- +- btrfs_bio_counter_inc_blocked(fs_info); +- ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, +- &length, &bioc); +- if (ret 
|| !bioc || !bioc->raid_map) +- goto bioc_out; +- +- bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); +- bio->bi_iter.bi_sector = sparity->logic_start >> 9; +- bio->bi_private = sparity; +- bio->bi_end_io = scrub_parity_bio_endio; +- +- rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, +- sparity->scrub_dev, +- &sparity->dbitmap, +- sparity->nsectors); +- btrfs_put_bioc(bioc); +- if (!rbio) +- goto rbio_out; +- +- scrub_pending_bio_inc(sctx); +- raid56_parity_submit_scrub_rbio(rbio); +- return; +- +-rbio_out: +- bio_put(bio); +-bioc_out: +- btrfs_bio_counter_dec(fs_info); +- bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, +- sparity->nsectors); +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +-out: +- scrub_free_parity(sparity); +-} +- +-static void scrub_parity_get(struct scrub_parity *sparity) +-{ +- refcount_inc(&sparity->refs); +-} +- +-static void scrub_parity_put(struct scrub_parity *sparity) +-{ +- if (!refcount_dec_and_test(&sparity->refs)) +- return; +- +- scrub_parity_check_and_repair(sparity); +-} +- + /* + * Return 0 if the extent item range covers any byte of the range. + * Return <0 if the extent item is before @search_start. +@@ -3178,226 +1424,533 @@ static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, + *generation_ret = btrfs_extent_generation(path->nodes[0], ei); + } + +-static bool does_range_cross_boundary(u64 extent_start, u64 extent_len, +- u64 boundary_start, u64 boudary_len) ++static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, ++ u64 physical, u64 physical_end) ++{ ++ struct btrfs_fs_info *fs_info = sctx->fs_info; ++ int ret = 0; ++ ++ if (!btrfs_is_zoned(fs_info)) ++ return 0; ++ ++ mutex_lock(&sctx->wr_lock); ++ if (sctx->write_pointer < physical_end) { ++ ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, ++ physical, ++ sctx->write_pointer); ++ if (ret) ++ btrfs_err(fs_info, ++ "zoned: failed to recover write pointer"); ++ } ++ mutex_unlock(&sctx->wr_lock); ++ btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); ++ ++ return ret; ++} ++ ++static void fill_one_extent_info(struct btrfs_fs_info *fs_info, ++ struct scrub_stripe *stripe, ++ u64 extent_start, u64 extent_len, ++ u64 extent_flags, u64 extent_gen) ++{ ++ for (u64 cur_logical = max(stripe->logical, extent_start); ++ cur_logical < min(stripe->logical + BTRFS_STRIPE_LEN, ++ extent_start + extent_len); ++ cur_logical += fs_info->sectorsize) { ++ const int nr_sector = (cur_logical - stripe->logical) >> ++ fs_info->sectorsize_bits; ++ struct scrub_sector_verification *sector = ++ &stripe->sectors[nr_sector]; ++ ++ set_bit(nr_sector, &stripe->extent_sector_bitmap); ++ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ++ sector->is_metadata = true; ++ sector->generation = extent_gen; ++ } ++ } ++} ++ ++static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) ++{ ++ stripe->extent_sector_bitmap = 0; ++ stripe->init_error_bitmap = 0; ++ stripe->error_bitmap = 0; ++ stripe->io_error_bitmap = 0; ++ stripe->csum_error_bitmap = 0; ++ stripe->meta_error_bitmap = 0; ++} ++ ++/* ++ * Locate one stripe which has at least one extent in its range. ++ * ++ * Return 0 if found such stripe, and store its info into @stripe. ++ * Return >0 if there is no such stripe in the specified range. ++ * Return <0 for error. 
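++ *
++ * Note the resulting stripe is aligned to BTRFS_STRIPE_LEN relative
++ * to bg->start, not to the absolute logical address. For example,
++ * with bg->start at 1M and the first extent found at 1M + 80K, the
++ * filled stripe covers [1M + 64K, 1M + 128K), assuming the usual
++ * 64K BTRFS_STRIPE_LEN.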
++ */ ++static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, ++ struct btrfs_device *dev, u64 physical, ++ int mirror_num, u64 logical_start, ++ u32 logical_len, ++ struct scrub_stripe *stripe) ++{ ++ struct btrfs_fs_info *fs_info = bg->fs_info; ++ struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); ++ struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); ++ const u64 logical_end = logical_start + logical_len; ++ struct btrfs_path path = { 0 }; ++ u64 cur_logical = logical_start; ++ u64 stripe_end; ++ u64 extent_start; ++ u64 extent_len; ++ u64 extent_flags; ++ u64 extent_gen; ++ int ret; ++ ++ memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * ++ stripe->nr_sectors); ++ scrub_stripe_reset_bitmaps(stripe); ++ ++ /* The range must be inside the bg. */ ++ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); ++ ++ path.search_commit_root = 1; ++ path.skip_locking = 1; ++ ++ ret = find_first_extent_item(extent_root, &path, logical_start, logical_len); ++ /* Either error or not found. */ ++ if (ret) ++ goto out; ++ get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen); ++ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) ++ stripe->nr_meta_extents++; ++ if (extent_flags & BTRFS_EXTENT_FLAG_DATA) ++ stripe->nr_data_extents++; ++ cur_logical = max(extent_start, cur_logical); ++ ++ /* ++ * Round down to stripe boundary. ++ * ++ * The extra calculation against bg->start is to handle block groups ++ * whose logical bytenr is not BTRFS_STRIPE_LEN aligned. ++ */ ++ stripe->logical = round_down(cur_logical - bg->start, BTRFS_STRIPE_LEN) + ++ bg->start; ++ stripe->physical = physical + stripe->logical - logical_start; ++ stripe->dev = dev; ++ stripe->bg = bg; ++ stripe->mirror_num = mirror_num; ++ stripe_end = stripe->logical + BTRFS_STRIPE_LEN - 1; ++ ++ /* Fill the first extent info into stripe->sectors[] array. */ ++ fill_one_extent_info(fs_info, stripe, extent_start, extent_len, ++ extent_flags, extent_gen); ++ cur_logical = extent_start + extent_len; ++ ++ /* Fill the extent info for the remaining sectors. */ ++ while (cur_logical <= stripe_end) { ++ ret = find_first_extent_item(extent_root, &path, cur_logical, ++ stripe_end - cur_logical + 1); ++ if (ret < 0) ++ goto out; ++ if (ret > 0) { ++ ret = 0; ++ break; ++ } ++ get_extent_info(&path, &extent_start, &extent_len, ++ &extent_flags, &extent_gen); ++ if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) ++ stripe->nr_meta_extents++; ++ if (extent_flags & BTRFS_EXTENT_FLAG_DATA) ++ stripe->nr_data_extents++; ++ fill_one_extent_info(fs_info, stripe, extent_start, extent_len, ++ extent_flags, extent_gen); ++ cur_logical = extent_start + extent_len; ++ } ++ ++ /* Now fill the data csum. */ ++ if (bg->flags & BTRFS_BLOCK_GROUP_DATA) { ++ int sector_nr; ++ unsigned long csum_bitmap = 0; ++ ++ /* Csum space should have already been allocated. */ ++ ASSERT(stripe->csums); ++ ++ /* ++ * Our csum bitmap should be large enough, as BTRFS_STRIPE_LEN ++ * should contain at most 16 sectors. 
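++ *
++ * E.g. a 64K stripe with the minimum 4K sectorsize needs 16 bits,
++ * which fits into an unsigned long even on 32-bit targets, hence
++ * the BITS_PER_LONG assertion below.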
++ */ ++ ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); ++ ++ ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical, ++ stripe_end, stripe->csums, ++ &csum_bitmap, true); ++ if (ret < 0) ++ goto out; ++ if (ret > 0) ++ ret = 0; ++ ++ for_each_set_bit(sector_nr, &csum_bitmap, stripe->nr_sectors) { ++ stripe->sectors[sector_nr].csum = stripe->csums + ++ sector_nr * fs_info->csum_size; ++ } ++ } ++ set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); ++out: ++ btrfs_release_path(&path); ++ return ret; ++} ++ ++static void scrub_reset_stripe(struct scrub_stripe *stripe) ++{ ++ scrub_stripe_reset_bitmaps(stripe); ++ ++ stripe->nr_meta_extents = 0; ++ stripe->nr_data_extents = 0; ++ stripe->state = 0; ++ ++ for (int i = 0; i < stripe->nr_sectors; i++) { ++ stripe->sectors[i].is_metadata = false; ++ stripe->sectors[i].csum = NULL; ++ stripe->sectors[i].generation = 0; ++ } ++} ++ ++static void scrub_submit_initial_read(struct scrub_ctx *sctx, ++ struct scrub_stripe *stripe) ++{ ++ struct btrfs_fs_info *fs_info = sctx->fs_info; ++ struct btrfs_bio *bbio; ++ int mirror = stripe->mirror_num; ++ ++ ASSERT(stripe->bg); ++ ASSERT(stripe->mirror_num > 0); ++ ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); ++ ++ bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, ++ scrub_read_endio, stripe); ++ ++ /* Read the whole stripe. */ ++ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; ++ for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { ++ int ret; ++ ++ ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); ++ /* We should have allocated enough bio vectors. */ ++ ASSERT(ret == PAGE_SIZE); ++ } ++ atomic_inc(&stripe->pending_io); ++ ++ /* ++ * For dev-replace, either user asks to avoid the source dev, or ++ * the device is missing, we try the next mirror instead. 
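++ *
++ * E.g. when replacing a missing device in a two-copy chunk, the
++ * initial read that would have used mirror 1 is redirected to
++ * mirror 2 rather than hitting the absent source device.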
++ */ ++ if (sctx->is_dev_replace && ++ (fs_info->dev_replace.cont_reading_from_srcdev_mode == ++ BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID || ++ !stripe->dev->bdev)) { ++ int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, ++ stripe->bg->length); ++ ++ mirror = calc_next_mirror(mirror, num_copies); ++ } ++ btrfs_submit_bio(bbio, mirror); ++} ++ ++static bool stripe_has_metadata_error(struct scrub_stripe *stripe) + { +- return (extent_start < boundary_start && +- extent_start + extent_len > boundary_start) || +- (extent_start < boundary_start + boudary_len && +- extent_start + extent_len > boundary_start + boudary_len); ++ int i; ++ ++ for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) { ++ if (stripe->sectors[i].is_metadata) { ++ struct btrfs_fs_info *fs_info = stripe->bg->fs_info; ++ ++ btrfs_err(fs_info, ++ "stripe %llu has unrepaired metadata sector at %llu", ++ stripe->logical, ++ stripe->logical + (i << fs_info->sectorsize_bits)); ++ return true; ++ } ++ } ++ return false; + } + +-static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, +- struct scrub_parity *sparity, +- struct map_lookup *map, +- struct btrfs_device *sdev, +- struct btrfs_path *path, +- u64 logical) ++static int flush_scrub_stripes(struct scrub_ctx *sctx) + { + struct btrfs_fs_info *fs_info = sctx->fs_info; +- struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); +- struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical); +- u64 cur_logical = logical; +- int ret; ++ struct scrub_stripe *stripe; ++ const int nr_stripes = sctx->cur_stripe; ++ int ret = 0; + +- ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); ++ if (!nr_stripes) ++ return 0; + +- /* Path must not be populated */ +- ASSERT(!path->nodes[0]); ++ ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); + +- while (cur_logical < logical + map->stripe_len) { +- struct btrfs_io_context *bioc = NULL; +- struct btrfs_device *extent_dev; +- u64 extent_start; +- u64 extent_size; +- u64 mapped_length; +- u64 extent_flags; +- u64 extent_gen; +- u64 extent_physical; +- u64 extent_mirror_num; +- +- ret = find_first_extent_item(extent_root, path, cur_logical, +- logical + map->stripe_len - cur_logical); +- /* No more extent item in this data stripe */ +- if (ret > 0) { +- ret = 0; +- break; +- } +- if (ret < 0) +- break; +- get_extent_info(path, &extent_start, &extent_size, &extent_flags, +- &extent_gen); ++ scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, ++ nr_stripes << BTRFS_STRIPE_LEN_SHIFT); ++ for (int i = 0; i < nr_stripes; i++) { ++ stripe = &sctx->stripes[i]; ++ scrub_submit_initial_read(sctx, stripe); ++ } + +- /* Metadata should not cross stripe boundaries */ +- if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && +- does_range_cross_boundary(extent_start, extent_size, +- logical, map->stripe_len)) { +- btrfs_err(fs_info, +- "scrub: tree block %llu spanning stripes, ignored. logical=%llu", +- extent_start, logical); +- spin_lock(&sctx->stat_lock); +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- cur_logical += extent_size; +- continue; +- } ++ for (int i = 0; i < nr_stripes; i++) { ++ stripe = &sctx->stripes[i]; ++ ++ wait_event(stripe->repair_wait, ++ test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); ++ } + +- /* Skip hole range which doesn't have any extent */ +- cur_logical = max(extent_start, cur_logical); ++ /* ++ * Submit the repaired sectors. 
For zoned case, we cannot do repair ++ * in-place, but queue the bg to be relocated. ++ */ ++ if (btrfs_is_zoned(fs_info)) { ++ for (int i = 0; i < nr_stripes; i++) { ++ stripe = &sctx->stripes[i]; + +- /* Truncate the range inside this data stripe */ +- extent_size = min(extent_start + extent_size, +- logical + map->stripe_len) - cur_logical; +- extent_start = cur_logical; +- ASSERT(extent_size <= U32_MAX); ++ if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) { ++ btrfs_repair_one_zone(fs_info, ++ sctx->stripes[0].bg->start); ++ break; ++ } ++ } ++ } else { ++ for (int i = 0; i < nr_stripes; i++) { ++ unsigned long repaired; + +- scrub_parity_mark_sectors_data(sparity, extent_start, extent_size); ++ stripe = &sctx->stripes[i]; + +- mapped_length = extent_size; +- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start, +- &mapped_length, &bioc, 0); +- if (!ret && (!bioc || mapped_length < extent_size)) +- ret = -EIO; +- if (ret) { +- btrfs_put_bioc(bioc); +- scrub_parity_mark_sectors_error(sparity, extent_start, +- extent_size); +- break; ++ bitmap_andnot(&repaired, &stripe->init_error_bitmap, ++ &stripe->error_bitmap, stripe->nr_sectors); ++ scrub_write_sectors(sctx, stripe, repaired, false); + } +- extent_physical = bioc->stripes[0].physical; +- extent_mirror_num = bioc->mirror_num; +- extent_dev = bioc->stripes[0].dev; +- btrfs_put_bioc(bioc); ++ } + +- ret = btrfs_lookup_csums_list(csum_root, extent_start, +- extent_start + extent_size - 1, +- &sctx->csum_list, 1, false); +- if (ret) { +- scrub_parity_mark_sectors_error(sparity, extent_start, +- extent_size); +- break; ++ /* Submit for dev-replace. */ ++ if (sctx->is_dev_replace) { ++ /* ++ * For dev-replace, if we know there is something wrong with ++ * metadata, we should immedately abort. ++ */ ++ for (int i = 0; i < nr_stripes; i++) { ++ if (stripe_has_metadata_error(&sctx->stripes[i])) { ++ ret = -EIO; ++ goto out; ++ } + } ++ for (int i = 0; i < nr_stripes; i++) { ++ unsigned long good; + +- ret = scrub_extent_for_parity(sparity, extent_start, +- extent_size, extent_physical, +- extent_dev, extent_flags, +- extent_gen, extent_mirror_num); +- scrub_free_csums(sctx); ++ stripe = &sctx->stripes[i]; + +- if (ret) { +- scrub_parity_mark_sectors_error(sparity, extent_start, +- extent_size); +- break; ++ ASSERT(stripe->dev == fs_info->dev_replace.srcdev); ++ ++ bitmap_andnot(&good, &stripe->extent_sector_bitmap, ++ &stripe->error_bitmap, stripe->nr_sectors); ++ scrub_write_sectors(sctx, stripe, good, true); + } ++ } + +- cond_resched(); +- cur_logical += extent_size; ++ /* Wait for the above writebacks to finish. 
*/ ++ for (int i = 0; i < nr_stripes; i++) { ++ stripe = &sctx->stripes[i]; ++ ++ wait_scrub_stripe_io(stripe); ++ scrub_reset_stripe(stripe); + } +- btrfs_release_path(path); ++out: ++ sctx->cur_stripe = 0; + return ret; + } + +-static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, +- struct map_lookup *map, +- struct btrfs_device *sdev, +- u64 logic_start, +- u64 logic_end) ++static void raid56_scrub_wait_endio(struct bio *bio) + { +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- struct btrfs_path *path; +- u64 cur_logical; ++ complete(bio->bi_private); ++} ++ ++static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, ++ struct btrfs_device *dev, int mirror_num, ++ u64 logical, u32 length, u64 physical) ++{ ++ struct scrub_stripe *stripe; + int ret; +- struct scrub_parity *sparity; +- int nsectors; + +- path = btrfs_alloc_path(); +- if (!path) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- return -ENOMEM; ++ /* No available slot, submit all stripes and wait for them. */ ++ if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) { ++ ret = flush_scrub_stripes(sctx); ++ if (ret < 0) ++ return ret; + } +- path->search_commit_root = 1; +- path->skip_locking = 1; + +- ASSERT(map->stripe_len <= U32_MAX); +- nsectors = map->stripe_len >> fs_info->sectorsize_bits; +- ASSERT(nsectors <= BITS_PER_LONG); +- sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS); +- if (!sparity) { +- spin_lock(&sctx->stat_lock); +- sctx->stat.malloc_errors++; +- spin_unlock(&sctx->stat_lock); +- btrfs_free_path(path); +- return -ENOMEM; +- } ++ stripe = &sctx->stripes[sctx->cur_stripe]; ++ ++ /* We can queue one stripe using the remaining slot. */ ++ scrub_reset_stripe(stripe); ++ ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num, ++ logical, length, stripe); ++ /* Either >0 as no more extents or <0 for error. 
*/ ++ if (ret) ++ return ret; ++ sctx->cur_stripe++; ++ return 0; ++} + +- ASSERT(map->stripe_len <= U32_MAX); +- sparity->stripe_len = map->stripe_len; +- sparity->nsectors = nsectors; +- sparity->sctx = sctx; +- sparity->scrub_dev = sdev; +- sparity->logic_start = logic_start; +- sparity->logic_end = logic_end; +- refcount_set(&sparity->refs, 1); +- INIT_LIST_HEAD(&sparity->sectors_list); ++static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ++ struct btrfs_device *scrub_dev, ++ struct btrfs_block_group *bg, ++ struct map_lookup *map, ++ u64 full_stripe_start) ++{ ++ DECLARE_COMPLETION_ONSTACK(io_done); ++ struct btrfs_fs_info *fs_info = sctx->fs_info; ++ struct btrfs_raid_bio *rbio; ++ struct btrfs_io_context *bioc = NULL; ++ struct bio *bio; ++ struct scrub_stripe *stripe; ++ bool all_empty = true; ++ const int data_stripes = nr_data_stripes(map); ++ unsigned long extent_bitmap = 0; ++ u64 length = data_stripes << BTRFS_STRIPE_LEN_SHIFT; ++ int ret; + +- ret = 0; +- for (cur_logical = logic_start; cur_logical < logic_end; +- cur_logical += map->stripe_len) { +- ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map, +- sdev, path, cur_logical); ++ ASSERT(sctx->raid56_data_stripes); ++ ++ for (int i = 0; i < data_stripes; i++) { ++ int stripe_index; ++ int rot; ++ u64 physical; ++ ++ stripe = &sctx->raid56_data_stripes[i]; ++ rot = div_u64(full_stripe_start - bg->start, ++ data_stripes) >> BTRFS_STRIPE_LEN_SHIFT; ++ stripe_index = (i + rot) % map->num_stripes; ++ physical = map->stripes[stripe_index].physical + ++ (rot << BTRFS_STRIPE_LEN_SHIFT); ++ ++ scrub_reset_stripe(stripe); ++ set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); ++ ret = scrub_find_fill_first_stripe(bg, ++ map->stripes[stripe_index].dev, physical, 1, ++ full_stripe_start + (i << BTRFS_STRIPE_LEN_SHIFT), ++ BTRFS_STRIPE_LEN, stripe); + if (ret < 0) ++ goto out; ++ /* ++ * No extent in this data stripe, need to manually mark them ++ * initialized to make later read submission happy. ++ */ ++ if (ret > 0) { ++ stripe->logical = full_stripe_start + ++ (i << BTRFS_STRIPE_LEN_SHIFT); ++ stripe->dev = map->stripes[stripe_index].dev; ++ stripe->mirror_num = 1; ++ set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); ++ } ++ } ++ ++ /* Check if all data stripes are empty. */ ++ for (int i = 0; i < data_stripes; i++) { ++ stripe = &sctx->raid56_data_stripes[i]; ++ if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) { ++ all_empty = false; + break; ++ } ++ } ++ if (all_empty) { ++ ret = 0; ++ goto out; + } + +- scrub_parity_put(sparity); +- scrub_submit(sctx); +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); ++ for (int i = 0; i < data_stripes; i++) { ++ stripe = &sctx->raid56_data_stripes[i]; ++ scrub_submit_initial_read(sctx, stripe); ++ } ++ for (int i = 0; i < data_stripes; i++) { ++ stripe = &sctx->raid56_data_stripes[i]; + +- btrfs_free_path(path); +- return ret < 0 ? ret : 0; +-} ++ wait_event(stripe->repair_wait, ++ test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); ++ } ++ /* For now, no zoned support for RAID56. */ ++ ASSERT(!btrfs_is_zoned(sctx->fs_info)); + +-static void sync_replace_for_zoned(struct scrub_ctx *sctx) +-{ +- if (!btrfs_is_zoned(sctx->fs_info)) +- return; ++ /* Writeback for the repaired sectors. 
*/ ++ for (int i = 0; i < data_stripes; i++) { ++ unsigned long repaired; + +- sctx->flush_all_writes = true; +- scrub_submit(sctx); +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); ++ stripe = &sctx->raid56_data_stripes[i]; + +- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); +-} ++ bitmap_andnot(&repaired, &stripe->init_error_bitmap, ++ &stripe->error_bitmap, stripe->nr_sectors); ++ scrub_write_sectors(sctx, stripe, repaired, false); ++ } + +-static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, +- u64 physical, u64 physical_end) +-{ +- struct btrfs_fs_info *fs_info = sctx->fs_info; +- int ret = 0; ++ /* Wait for the above writebacks to finish. */ ++ for (int i = 0; i < data_stripes; i++) { ++ stripe = &sctx->raid56_data_stripes[i]; + +- if (!btrfs_is_zoned(fs_info)) +- return 0; ++ wait_scrub_stripe_io(stripe); ++ } + +- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); ++ /* ++ * Now all data stripes are properly verified. Check if we have any ++ * unrepaired, if so abort immediately or we could further corrupt the ++ * P/Q stripes. ++ * ++ * During the loop, also populate extent_bitmap. ++ */ ++ for (int i = 0; i < data_stripes; i++) { ++ unsigned long error; + +- mutex_lock(&sctx->wr_lock); +- if (sctx->write_pointer < physical_end) { +- ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical, +- physical, +- sctx->write_pointer); +- if (ret) ++ stripe = &sctx->raid56_data_stripes[i]; ++ ++ /* ++ * We should only check the errors where there is an extent. ++ * As we may hit an empty data stripe while it's missing. ++ */ ++ bitmap_and(&error, &stripe->error_bitmap, ++ &stripe->extent_sector_bitmap, stripe->nr_sectors); ++ if (!bitmap_empty(&error, stripe->nr_sectors)) { + btrfs_err(fs_info, +- "zoned: failed to recover write pointer"); ++"unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", ++ full_stripe_start, i, stripe->nr_sectors, ++ &error); ++ ret = -EIO; ++ goto out; ++ } ++ bitmap_or(&extent_bitmap, &extent_bitmap, ++ &stripe->extent_sector_bitmap, stripe->nr_sectors); + } +- mutex_unlock(&sctx->wr_lock); +- btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical); + ++ /* Now we can check and regenerate the P/Q stripe. */ ++ bio = bio_alloc(NULL, 1, REQ_OP_READ, GFP_NOFS); ++ bio->bi_iter.bi_sector = full_stripe_start >> SECTOR_SHIFT; ++ bio->bi_private = &io_done; ++ bio->bi_end_io = raid56_scrub_wait_endio; ++ ++ btrfs_bio_counter_inc_blocked(fs_info); ++ ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, full_stripe_start, ++ &length, &bioc); ++ if (ret < 0) { ++ btrfs_put_bioc(bioc); ++ btrfs_bio_counter_dec(fs_info); ++ goto out; ++ } ++ rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, scrub_dev, &extent_bitmap, ++ BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); ++ btrfs_put_bioc(bioc); ++ if (!rbio) { ++ ret = -ENOMEM; ++ btrfs_bio_counter_dec(fs_info); ++ goto out; ++ } ++ raid56_parity_submit_scrub_rbio(rbio); ++ wait_for_completion_io(&io_done); ++ ret = blk_status_to_errno(bio->bi_status); ++ bio_put(bio); ++ btrfs_bio_counter_dec(fs_info); ++ ++out: + return ret; + } + +@@ -3410,8 +1963,6 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, + * and @logical_length parameter. 
+ */ + static int scrub_simple_mirror(struct scrub_ctx *sctx, +- struct btrfs_root *extent_root, +- struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + u64 logical_start, u64 logical_length, +@@ -3421,7 +1972,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, + struct btrfs_fs_info *fs_info = sctx->fs_info; + const u64 logical_end = logical_start + logical_length; + /* An artificial limit, inherit from old scrub behavior */ +- const u32 max_length = SZ_64K; + struct btrfs_path path = { 0 }; + u64 cur_logical = logical_start; + int ret; +@@ -3433,11 +1983,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, + path.skip_locking = 1; + /* Go through each extent items inside the logical range */ + while (cur_logical < logical_end) { +- u64 extent_start; +- u64 extent_len; +- u64 extent_flags; +- u64 extent_gen; +- u64 scrub_len; ++ u64 cur_physical = physical + cur_logical - logical_start; + + /* Canceled? */ + if (atomic_read(&fs_info->scrub_cancel_req) || +@@ -3448,14 +1994,6 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, + /* Paused? */ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* Push queued extents */ +- sctx->flush_all_writes = true; +- scrub_submit(sctx); +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); +- wait_event(sctx->list_wait, +- atomic_read(&sctx->bios_in_flight) == 0); +- sctx->flush_all_writes = false; + scrub_blocked_if_needed(fs_info); + } + /* Block group removed? */ +@@ -3467,8 +2005,9 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, + } + spin_unlock(&bg->lock); + +- ret = find_first_extent_item(extent_root, &path, cur_logical, +- logical_end - cur_logical); ++ ret = queue_scrub_stripe(sctx, bg, device, mirror_num, ++ cur_logical, logical_end - cur_logical, ++ cur_physical); + if (ret > 0) { + /* No more extent, just update the accounting */ + sctx->stat.last_physical = physical + logical_length; +@@ -3477,52 +2016,11 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, + } + if (ret < 0) + break; +- get_extent_info(&path, &extent_start, &extent_len, +- &extent_flags, &extent_gen); +- /* Skip hole range which doesn't have any extent */ +- cur_logical = max(extent_start, cur_logical); + +- /* +- * Scrub len has three limits: +- * - Extent size limit +- * - Scrub range limit +- * This is especially imporatant for RAID0/RAID10 to reuse +- * this function +- * - Max scrub size limit +- */ +- scrub_len = min(min(extent_start + extent_len, +- logical_end), cur_logical + max_length) - +- cur_logical; +- +- if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { +- ret = btrfs_lookup_csums_list(csum_root, cur_logical, +- cur_logical + scrub_len - 1, +- &sctx->csum_list, 1, false); +- if (ret) +- break; +- } +- if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && +- does_range_cross_boundary(extent_start, extent_len, +- logical_start, logical_length)) { +- btrfs_err(fs_info, +-"scrub: tree block %llu spanning boundaries, ignored. 
boundary=[%llu, %llu)", +- extent_start, logical_start, logical_end); +- spin_lock(&sctx->stat_lock); +- sctx->stat.uncorrectable_errors++; +- spin_unlock(&sctx->stat_lock); +- cur_logical += scrub_len; +- continue; +- } +- ret = scrub_extent(sctx, map, cur_logical, scrub_len, +- cur_logical - logical_start + physical, +- device, extent_flags, extent_gen, +- mirror_num); +- scrub_free_csums(sctx); +- if (ret) +- break; +- if (sctx->is_dev_replace) +- sync_replace_for_zoned(sctx); +- cur_logical += scrub_len; ++ ASSERT(sctx->cur_stripe > 0); ++ cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical ++ + BTRFS_STRIPE_LEN; ++ + /* Don't hold CPU for too long time */ + cond_resched(); + } +@@ -3536,7 +2034,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + +- return map->num_stripes / map->sub_stripes * map->stripe_len; ++ return (map->num_stripes / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; + } + + /* Get the logical bytenr for the stripe */ +@@ -3552,7 +2050,8 @@ static u64 simple_stripe_get_logical(struct map_lookup *map, + * (stripe_index / sub_stripes) gives how many data stripes we need to + * skip. + */ +- return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start; ++ return ((stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT) + ++ bg->start; + } + + /* Get the mirror number for the stripe */ +@@ -3567,8 +2066,6 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) + } + + static int scrub_simple_stripe(struct scrub_ctx *sctx, +- struct btrfs_root *extent_root, +- struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + struct btrfs_device *device, +@@ -3588,15 +2085,15 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, + * just RAID1, so we can reuse scrub_simple_mirror() to scrub + * this stripe. + */ +- ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map, +- cur_logical, map->stripe_len, device, +- cur_physical, mirror_num); ++ ret = scrub_simple_mirror(sctx, bg, map, cur_logical, ++ BTRFS_STRIPE_LEN, device, cur_physical, ++ mirror_num); + if (ret) + return ret; + /* Skip to next stripe which belongs to the target device */ + cur_logical += logical_increment; + /* For physical offset, we just go to next stripe */ +- cur_physical += map->stripe_len; ++ cur_physical += BTRFS_STRIPE_LEN; + } + return ret; + } +@@ -3607,15 +2104,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev, + int stripe_index) + { +- struct btrfs_path *path; + struct btrfs_fs_info *fs_info = sctx->fs_info; +- struct btrfs_root *root; +- struct btrfs_root *csum_root; +- struct blk_plug plug; + struct map_lookup *map = em->map_lookup; + const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + const u64 chunk_logical = bg->start; + int ret; ++ int ret2; + u64 physical = map->stripes[stripe_index].physical; + const u64 dev_stripe_len = btrfs_calc_stripe_length(em); + const u64 physical_end = physical + dev_stripe_len; +@@ -3626,43 +2120,37 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + /* Offset inside the chunk */ + u64 offset; + u64 stripe_logical; +- u64 stripe_end; + int stop_loop = 0; + +- path = btrfs_alloc_path(); +- if (!path) +- return -ENOMEM; +- +- /* +- * work on commit root. The related disk blocks are static as +- * long as COW is applied. 
This means, it is save to rewrite +- * them to repair disk errors without any race conditions +- */ +- path->search_commit_root = 1; +- path->skip_locking = 1; +- path->reada = READA_FORWARD; +- +- wait_event(sctx->list_wait, +- atomic_read(&sctx->bios_in_flight) == 0); + scrub_blocked_if_needed(fs_info); + +- root = btrfs_extent_root(fs_info, bg->start); +- csum_root = btrfs_csum_root(fs_info, bg->start); +- +- /* +- * collect all data csums for the stripe to avoid seeking during +- * the scrub. This might currently (crc32) end up to be about 1MB +- */ +- blk_start_plug(&plug); +- + if (sctx->is_dev_replace && + btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) { + mutex_lock(&sctx->wr_lock); + sctx->write_pointer = physical; + mutex_unlock(&sctx->wr_lock); +- sctx->flush_all_writes = true; + } + ++ /* Prepare the extra data stripes used by RAID56. */ ++ if (profile & BTRFS_BLOCK_GROUP_RAID56_MASK) { ++ ASSERT(sctx->raid56_data_stripes == NULL); ++ ++ sctx->raid56_data_stripes = kcalloc(nr_data_stripes(map), ++ sizeof(struct scrub_stripe), ++ GFP_KERNEL); ++ if (!sctx->raid56_data_stripes) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ for (int i = 0; i < nr_data_stripes(map); i++) { ++ ret = init_scrub_stripe(fs_info, ++ &sctx->raid56_data_stripes[i]); ++ if (ret < 0) ++ goto out; ++ sctx->raid56_data_stripes[i].bg = bg; ++ sctx->raid56_data_stripes[i].sctx = sctx; ++ } ++ } + /* + * There used to be a big double loop to handle all profiles using the + * same routine, which grows larger and more gross over time. +@@ -3680,17 +2168,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + * Only @physical and @mirror_num needs to calculated using + * @stripe_index. + */ +- ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, +- bg->start, bg->length, scrub_dev, +- map->stripes[stripe_index].physical, ++ ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length, ++ scrub_dev, map->stripes[stripe_index].physical, + stripe_index + 1); + offset = 0; + goto out; + } + if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { +- ret = scrub_simple_stripe(sctx, root, csum_root, bg, map, +- scrub_dev, stripe_index); +- offset = map->stripe_len * (stripe_index / map->sub_stripes); ++ ret = scrub_simple_stripe(sctx, bg, map, scrub_dev, stripe_index); ++ offset = (stripe_index / map->sub_stripes) << BTRFS_STRIPE_LEN_SHIFT; + goto out; + } + +@@ -3705,7 +2191,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + + /* Initialize @offset in case we need to go to out: label */ + get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); +- increment = map->stripe_len * nr_data_stripes(map); ++ increment = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + + /* + * Due to the rotation, for RAID56 it's better to iterate each stripe +@@ -3718,10 +2204,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + if (ret) { + /* it is parity strip */ + stripe_logical += chunk_logical; +- stripe_end = stripe_logical + increment; +- ret = scrub_raid56_parity(sctx, map, scrub_dev, +- stripe_logical, +- stripe_end); ++ ret = scrub_raid56_parity_stripe(sctx, scrub_dev, bg, ++ map, stripe_logical); + if (ret) + goto out; + goto next; +@@ -3735,14 +2219,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + * We can reuse scrub_simple_mirror() here, as the repair part + * is still based on @mirror_num. 
+ */ +- ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, +- logical, map->stripe_len, ++ ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN, + scrub_dev, physical, 1); + if (ret < 0) + goto out; + next: + logical += increment; +- physical += map->stripe_len; ++ physical += BTRFS_STRIPE_LEN; + spin_lock(&sctx->stat_lock); + if (stop_loop) + sctx->stat.last_physical = +@@ -3754,14 +2237,15 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, + break; + } + out: +- /* push queued extents */ +- scrub_submit(sctx); +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); +- +- blk_finish_plug(&plug); +- btrfs_free_path(path); ++ ret2 = flush_scrub_stripes(sctx); ++ if (!ret2) ++ ret = ret2; ++ if (sctx->raid56_data_stripes) { ++ for (int i = 0; i < nr_data_stripes(map); i++) ++ release_scrub_stripe(&sctx->raid56_data_stripes[i]); ++ kfree(sctx->raid56_data_stripes); ++ sctx->raid56_data_stripes = NULL; ++ } + + if (sctx->is_dev_replace && ret >= 0) { + int ret2; +@@ -4079,39 +2563,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, + + ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset, + dev_extent_len); +- +- /* +- * flush, submit all pending read and write bios, afterwards +- * wait for them. +- * Note that in the dev replace case, a read request causes +- * write requests that are submitted in the read completion +- * worker. Therefore in the current situation, it is required +- * that all write requests are flushed, so that all read and +- * write requests are really completed when bios_in_flight +- * changes to 0. +- */ +- sctx->flush_all_writes = true; +- scrub_submit(sctx); +- mutex_lock(&sctx->wr_lock); +- scrub_wr_submit(sctx); +- mutex_unlock(&sctx->wr_lock); +- +- wait_event(sctx->list_wait, +- atomic_read(&sctx->bios_in_flight) == 0); +- +- scrub_pause_on(fs_info); +- +- /* +- * must be called before we decrease @scrub_paused. +- * make sure we don't block transaction commit while +- * we are waiting pending workers finished. 
+- */ +- wait_event(sctx->list_wait, +- atomic_read(&sctx->workers_pending) == 0); +- sctx->flush_all_writes = false; +- +- scrub_pause_off(fs_info); +- + if (sctx->is_dev_replace && + !btrfs_finish_block_group_to_copy(dev_replace->srcdev, + cache, found_key.offset)) +@@ -4168,18 +2619,62 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, + return ret; + } + ++static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, ++ struct page *page, u64 physical, u64 generation) ++{ ++ struct btrfs_fs_info *fs_info = sctx->fs_info; ++ struct bio_vec bvec; ++ struct bio bio; ++ struct btrfs_super_block *sb = page_address(page); ++ int ret; ++ ++ bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ); ++ bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT; ++ __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0); ++ ret = submit_bio_wait(&bio); ++ bio_uninit(&bio); ++ ++ if (ret < 0) ++ return ret; ++ ret = btrfs_check_super_csum(fs_info, sb); ++ if (ret != 0) { ++ btrfs_err_rl(fs_info, ++ "super block at physical %llu devid %llu has bad csum", ++ physical, dev->devid); ++ return -EIO; ++ } ++ if (btrfs_super_generation(sb) != generation) { ++ btrfs_err_rl(fs_info, ++"super block at physical %llu devid %llu has bad generation %llu expect %llu", ++ physical, dev->devid, ++ btrfs_super_generation(sb), generation); ++ return -EUCLEAN; ++ } ++ ++ return btrfs_validate_super(fs_info, sb, -1); ++} ++ + static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, + struct btrfs_device *scrub_dev) + { + int i; + u64 bytenr; + u64 gen; +- int ret; ++ int ret = 0; ++ struct page *page; + struct btrfs_fs_info *fs_info = sctx->fs_info; + + if (BTRFS_FS_ERROR(fs_info)) + return -EROFS; + ++ page = alloc_page(GFP_KERNEL); ++ if (!page) { ++ spin_lock(&sctx->stat_lock); ++ sctx->stat.malloc_errors++; ++ spin_unlock(&sctx->stat_lock); ++ return -ENOMEM; ++ } ++ + /* Seed devices of a new filesystem has their own generation. 
*/ + if (scrub_dev->fs_devices != fs_info->fs_devices) + gen = scrub_dev->generation; +@@ -4194,14 +2689,14 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, + if (!btrfs_check_super_location(scrub_dev, bytenr)) + continue; + +- ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, +- scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, +- NULL, bytenr); +- if (ret) +- return ret; ++ ret = scrub_one_super(sctx, scrub_dev, page, bytenr, gen); ++ if (ret) { ++ spin_lock(&sctx->stat_lock); ++ sctx->stat.super_errors++; ++ spin_unlock(&sctx->stat_lock); ++ } + } +- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); +- ++ __free_page(page); + return 0; + } + +@@ -4212,20 +2707,15 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) + struct workqueue_struct *scrub_workers = fs_info->scrub_workers; + struct workqueue_struct *scrub_wr_comp = + fs_info->scrub_wr_completion_workers; +- struct workqueue_struct *scrub_parity = +- fs_info->scrub_parity_workers; + + fs_info->scrub_workers = NULL; + fs_info->scrub_wr_completion_workers = NULL; +- fs_info->scrub_parity_workers = NULL; + mutex_unlock(&fs_info->scrub_lock); + + if (scrub_workers) + destroy_workqueue(scrub_workers); + if (scrub_wr_comp) + destroy_workqueue(scrub_wr_comp); +- if (scrub_parity) +- destroy_workqueue(scrub_parity); + } + } + +@@ -4237,7 +2727,6 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, + { + struct workqueue_struct *scrub_workers = NULL; + struct workqueue_struct *scrub_wr_comp = NULL; +- struct workqueue_struct *scrub_parity = NULL; + unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; + int max_active = fs_info->thread_pool_size; + int ret = -ENOMEM; +@@ -4254,18 +2743,12 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, + if (!scrub_wr_comp) + goto fail_scrub_wr_completion_workers; + +- scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active); +- if (!scrub_parity) +- goto fail_scrub_parity_workers; +- + mutex_lock(&fs_info->scrub_lock); + if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { + ASSERT(fs_info->scrub_workers == NULL && +- fs_info->scrub_wr_completion_workers == NULL && +- fs_info->scrub_parity_workers == NULL); ++ fs_info->scrub_wr_completion_workers == NULL); + fs_info->scrub_workers = scrub_workers; + fs_info->scrub_wr_completion_workers = scrub_wr_comp; +- fs_info->scrub_parity_workers = scrub_parity; + refcount_set(&fs_info->scrub_workers_refcnt, 1); + mutex_unlock(&fs_info->scrub_lock); + return 0; +@@ -4275,8 +2758,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, + mutex_unlock(&fs_info->scrub_lock); + + ret = 0; +- destroy_workqueue(scrub_parity); +-fail_scrub_parity_workers: ++ + destroy_workqueue(scrub_wr_comp); + fail_scrub_wr_completion_workers: + destroy_workqueue(scrub_workers); +@@ -4411,12 +2893,9 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + ret = scrub_enumerate_chunks(sctx, dev, start, end); + memalloc_nofs_restore(nofs_flag); + +- wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); + atomic_dec(&fs_info->scrubs_running); + wake_up(&fs_info->scrub_pause_wait); + +- wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); +- + if (progress) + memcpy(progress, &sctx->stat, sizeof(*progress)); + +@@ -4541,28 +3020,3 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, + + return dev ? (sctx ? 
0 : -ENOTCONN) : -ENODEV; + } +- +-static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, +- u64 extent_logical, u32 extent_len, +- u64 *extent_physical, +- struct btrfs_device **extent_dev, +- int *extent_mirror_num) +-{ +- u64 mapped_length; +- struct btrfs_io_context *bioc = NULL; +- int ret; +- +- mapped_length = extent_len; +- ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical, +- &mapped_length, &bioc, 0); +- if (ret || !bioc || mapped_length < extent_len || +- !bioc->stripes[0].dev->bdev) { +- btrfs_put_bioc(bioc); +- return; +- } +- +- *extent_physical = bioc->stripes[0].physical; +- *extent_mirror_num = bioc->mirror_num; +- *extent_dev = bioc->stripes[0].dev; +- btrfs_put_bioc(bioc); +-} +diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c +index e5c963bb873d..af2e153543a5 100644 +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -1875,7 +1875,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, + int left_ret; + int right_ret; + u64 left_gen; +- u64 right_gen; ++ u64 right_gen = 0; + struct btrfs_inode_info info; + + ret = get_inode_info(sctx->send_root, ino, &info); +diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c +index 3eecce86f63f..75e7fa337e66 100644 +--- a/fs/btrfs/space-info.c ++++ b/fs/btrfs/space-info.c +@@ -537,7 +537,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, + up_read(&info->groups_sem); + } + +-static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, ++static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, + u64 to_reclaim) + { + u64 bytes; +@@ -550,6 +550,18 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, + return nr; + } + ++static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info, ++ u64 to_reclaim) ++{ ++ const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1); ++ u64 nr; ++ ++ nr = div64_u64(to_reclaim, bytes); ++ if (!nr) ++ nr = 1; ++ return nr; ++} ++ + #define EXTENT_SIZE_PER_ITEM SZ_256K + + /* +@@ -727,7 +739,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, + break; + } + if (state == FLUSH_DELAYED_REFS_NR) +- nr = calc_reclaim_items_nr(fs_info, num_bytes); ++ nr = calc_delayed_refs_nr(fs_info, num_bytes); + else + nr = 0; + btrfs_run_delayed_refs(trans, nr); +@@ -1599,11 +1611,22 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, + struct reserve_ticket ticket; + u64 start_ns = 0; + u64 used; +- int ret = 0; ++ int ret = -ENOSPC; + bool pending_tickets; + + ASSERT(orig_bytes); +- ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); ++ /* ++ * If have a transaction handle (current->journal_info != NULL), then ++ * the flush method can not be neither BTRFS_RESERVE_FLUSH_ALL* nor ++ * BTRFS_RESERVE_FLUSH_EVICT, as we could deadlock because those ++ * flushing methods can trigger transaction commits. ++ */ ++ if (current->journal_info) { ++ /* One assert per line for easier debugging. 
*/ ++ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL); ++ ASSERT(flush != BTRFS_RESERVE_FLUSH_ALL_STEAL); ++ ASSERT(flush != BTRFS_RESERVE_FLUSH_EVICT); ++ } + + if (flush == BTRFS_RESERVE_FLUSH_DATA) + async_work = &fs_info->async_data_reclaim_work; +@@ -1611,7 +1634,6 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, + async_work = &fs_info->async_reclaim_work; + + spin_lock(&space_info->lock); +- ret = -ENOSPC; + used = btrfs_space_info_used(space_info, true); + + /* +diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h +index 2033b71b18ce..0bb9d14e60a8 100644 +--- a/fs/btrfs/space-info.h ++++ b/fs/btrfs/space-info.h +@@ -27,6 +27,7 @@ enum btrfs_reserve_flush_enum { + * - Running delayed refs + * - Running delalloc and waiting for ordered extents + * - Allocating a new chunk ++ * - Committing transaction + */ + BTRFS_RESERVE_FLUSH_EVICT, + +diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c +index 366fb4cde145..6cb97efee976 100644 +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -1158,6 +1158,7 @@ static int btrfs_fill_super(struct super_block *sb, + inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); ++ btrfs_handle_fs_error(fs_info, err, NULL); + goto fail_close; + } + +@@ -2412,7 +2413,7 @@ static int __init btrfs_print_mod_info(void) + ", fsverity=no" + #endif + ; +- pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); ++ pr_info("Btrfs loaded%s\n", options); + return 0; + } + +diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c +index 37fc58a7f27e..25294e624851 100644 +--- a/fs/btrfs/sysfs.c ++++ b/fs/btrfs/sysfs.c +@@ -1262,8 +1262,13 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, + if (ret) + return ret; + ++#ifdef CONFIG_BTRFS_DEBUG ++ if (thresh != 0 && (thresh > 100)) ++ return -EINVAL; ++#else + if (thresh != 0 && (thresh <= 50 || thresh > 100)) + return -EINVAL; ++#endif + + WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh); + +diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c +index f2f2e11dac4c..ed0f36ae5346 100644 +--- a/fs/btrfs/tests/extent-map-tests.c ++++ b/fs/btrfs/tests/extent-map-tests.c +@@ -486,7 +486,6 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, + em->map_lookup = map; + + map->num_stripes = test->num_stripes; +- map->stripe_len = BTRFS_STRIPE_LEN; + map->type = test->raid_type; + + for (i = 0; i < map->num_stripes; i++) { +diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c +index b8d5b1fa9a03..8b6a99b8d7f6 100644 +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -601,15 +601,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, + /* + * We want to reserve all the bytes we may need all at once, so + * we only do 1 enospc flushing cycle per transaction start. We +- * accomplish this by simply assuming we'll do 2 x num_items +- * worth of delayed refs updates in this trans handle, and +- * refill that amount for whatever is missing in the reserve. ++ * accomplish this by simply assuming we'll do num_items worth ++ * of delayed refs updates in this trans handle, and refill that ++ * amount for whatever is missing in the reserve. 
+ */ + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); + if (flush == BTRFS_RESERVE_FLUSH_ALL && +- btrfs_block_rsv_full(delayed_refs_rsv) == 0) { +- delayed_refs_bytes = num_bytes; +- num_bytes <<= 1; ++ !btrfs_block_rsv_full(delayed_refs_rsv)) { ++ delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, ++ num_items); ++ num_bytes += delayed_refs_bytes; + } + + /* +@@ -942,16 +943,6 @@ void btrfs_throttle(struct btrfs_fs_info *fs_info) + wait_current_trans(fs_info); + } + +-static bool should_end_transaction(struct btrfs_trans_handle *trans) +-{ +- struct btrfs_fs_info *fs_info = trans->fs_info; +- +- if (btrfs_check_space_for_delayed_refs(fs_info)) +- return true; +- +- return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50); +-} +- + bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) + { + struct btrfs_transaction *cur_trans = trans->transaction; +@@ -960,7 +951,10 @@ bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) + test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) + return true; + +- return should_end_transaction(trans); ++ if (btrfs_check_space_for_delayed_refs(trans->fs_info)) ++ return true; ++ ++ return !!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50); + } + + static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) +diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c +index baad1ed7e111..e2b54793bf0c 100644 +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -849,6 +849,20 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, + stripe_len); + return -EUCLEAN; + } ++ /* ++ * We artificially limit the chunk size, so that the number of stripes ++ * inside a chunk can be fit into a U32. The current limit (256G) is ++ * way too large for real world usage anyway, and it's also much larger ++ * than our existing limit (10G). ++ * ++ * Thus it should be a good way to catch obvious bitflips. 
++ */ ++ if (unlikely(length >= ((u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT))) { ++ chunk_err(leaf, chunk, logical, ++ "chunk length too large: have %llu limit %llu", ++ length, (u64)U32_MAX << BTRFS_STRIPE_LEN_SHIFT); ++ return -EUCLEAN; ++ } + if (unlikely(type & ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK))) { + chunk_err(leaf, chunk, logical, +diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c +index 200cea6e49e5..9b212e8c70cc 100644 +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -2563,6 +2563,28 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) + btrfs_put_block_group(cache); + } + ++static int clean_log_buffer(struct btrfs_trans_handle *trans, ++ struct extent_buffer *eb) ++{ ++ int ret; ++ ++ btrfs_tree_lock(eb); ++ btrfs_clear_buffer_dirty(trans, eb); ++ wait_on_extent_buffer_writeback(eb); ++ btrfs_tree_unlock(eb); ++ ++ if (trans) { ++ ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len); ++ if (ret) ++ return ret; ++ btrfs_redirty_list_add(trans->transaction, eb); ++ } else { ++ unaccount_log_buffer(eb->fs_info, eb->start); ++ } ++ ++ return 0; ++} ++ + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, +@@ -2573,7 +2595,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; +- u32 blocksize; + int ret = 0; + + while (*level > 0) { +@@ -2593,7 +2614,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + check.level = *level - 1; + check.has_first_key = true; + btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); +- blocksize = fs_info->nodesize; + + next = btrfs_find_create_tree_block(fs_info, bytenr, + btrfs_header_owner(cur), +@@ -2617,22 +2637,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + return ret; + } + +- btrfs_tree_lock(next); +- btrfs_clear_buffer_dirty(trans, next); +- wait_on_extent_buffer_writeback(next); +- btrfs_tree_unlock(next); +- +- if (trans) { +- ret = btrfs_pin_reserved_extent(trans, +- bytenr, blocksize); +- if (ret) { +- free_extent_buffer(next); +- return ret; +- } +- btrfs_redirty_list_add( +- trans->transaction, next); +- } else { +- unaccount_log_buffer(fs_info, bytenr); ++ ret = clean_log_buffer(trans, next); ++ if (ret) { ++ free_extent_buffer(next); ++ return ret; + } + } + free_extent_buffer(next); +@@ -2662,7 +2670,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_path *path, int *level, + struct walk_control *wc) + { +- struct btrfs_fs_info *fs_info = root->fs_info; + int i; + int slot; + int ret; +@@ -2682,27 +2689,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + return ret; + + if (wc->free) { +- struct extent_buffer *next; +- +- next = path->nodes[*level]; +- +- btrfs_tree_lock(next); +- btrfs_clear_buffer_dirty(trans, next); +- wait_on_extent_buffer_writeback(next); +- btrfs_tree_unlock(next); +- +- if (trans) { +- ret = btrfs_pin_reserved_extent(trans, +- path->nodes[*level]->start, +- path->nodes[*level]->len); +- if (ret) +- return ret; +- btrfs_redirty_list_add(trans->transaction, +- next); +- } else { +- unaccount_log_buffer(fs_info, +- path->nodes[*level]->start); +- } ++ ret = clean_log_buffer(trans, path->nodes[*level]); ++ if (ret) ++ return ret; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; +@@ -2720,7 +2709,6 @@ 
static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + static int walk_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct walk_control *wc) + { +- struct btrfs_fs_info *fs_info = log->fs_info; + int ret = 0; + int wret; + int level; +@@ -2762,26 +2750,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, + orig_level); + if (ret) + goto out; +- if (wc->free) { +- struct extent_buffer *next; +- +- next = path->nodes[orig_level]; +- +- btrfs_tree_lock(next); +- btrfs_clear_buffer_dirty(trans, next); +- wait_on_extent_buffer_writeback(next); +- btrfs_tree_unlock(next); +- +- if (trans) { +- ret = btrfs_pin_reserved_extent(trans, +- next->start, next->len); +- if (ret) +- goto out; +- btrfs_redirty_list_add(trans->transaction, next); +- } else { +- unaccount_log_buffer(fs_info, next->start); +- } +- } ++ if (wc->free) ++ ret = clean_log_buffer(trans, path->nodes[orig_level]); + } + + out: +@@ -3648,6 +3618,9 @@ static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + ret = BTRFS_LOG_FORCE_COMMIT; + else + inode->last_dir_index_offset = last_index; ++ ++ if (btrfs_get_first_dir_index_to_log(inode) == 0) ++ btrfs_set_first_dir_index_to_log(inode, batch.keys[0].offset); + out: + kfree(ins_data); + +@@ -4099,7 +4072,7 @@ static int drop_inode_items(struct btrfs_trans_handle *trans, + + found_key.offset = 0; + found_key.type = 0; +- ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot); ++ ret = btrfs_bin_search(path->nodes[0], 0, &found_key, &start_slot); + if (ret < 0) + break; + +@@ -5406,6 +5379,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + u64 ino = btrfs_ino(start_inode); ++ struct btrfs_inode *curr_inode = start_inode; + int ret = 0; + + /* +@@ -5420,43 +5394,39 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + if (!path) + return -ENOMEM; + ++ /* Pairs with btrfs_add_delayed_iput below. 
*/ ++ ihold(&curr_inode->vfs_inode); ++ + while (true) { +- struct extent_buffer *leaf; +- struct btrfs_key min_key; ++ struct inode *vfs_inode; ++ struct btrfs_key key; ++ struct btrfs_key found_key; ++ u64 next_index; + bool continue_curr_inode = true; +- int nritems; +- int i; ++ int iter_ret; + +- min_key.objectid = ino; +- min_key.type = BTRFS_DIR_INDEX_KEY; +- min_key.offset = 0; ++ key.objectid = ino; ++ key.type = BTRFS_DIR_INDEX_KEY; ++ key.offset = btrfs_get_first_dir_index_to_log(curr_inode); ++ next_index = key.offset; + again: +- btrfs_release_path(path); +- ret = btrfs_search_forward(root, &min_key, path, trans->transid); +- if (ret < 0) { +- break; +- } else if (ret > 0) { +- ret = 0; +- goto next; +- } +- +- leaf = path->nodes[0]; +- nritems = btrfs_header_nritems(leaf); +- for (i = path->slots[0]; i < nritems; i++) { ++ btrfs_for_each_slot(root->log_root, &key, &found_key, path, iter_ret) { ++ struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + int log_mode = LOG_INODE_EXISTS; + int type; + +- btrfs_item_key_to_cpu(leaf, &min_key, i); +- if (min_key.objectid != ino || +- min_key.type != BTRFS_DIR_INDEX_KEY) { ++ if (found_key.objectid != ino || ++ found_key.type != BTRFS_DIR_INDEX_KEY) { + continue_curr_inode = false; + break; + } + +- di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); ++ next_index = found_key.offset + 1; ++ ++ di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + type = btrfs_dir_ftype(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid) + continue; +@@ -5496,12 +5466,24 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + break; + } + +- if (continue_curr_inode && min_key.offset < (u64)-1) { +- min_key.offset++; ++ btrfs_release_path(path); ++ ++ if (iter_ret < 0) { ++ ret = iter_ret; ++ goto out; ++ } else if (iter_ret > 0) { ++ continue_curr_inode = false; ++ } else { ++ key = found_key; ++ } ++ ++ if (continue_curr_inode && key.offset < (u64)-1) { ++ key.offset++; + goto again; + } + +-next: ++ btrfs_set_first_dir_index_to_log(curr_inode, next_index); ++ + if (list_empty(&dir_list)) + break; + +@@ -5509,9 +5491,22 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + ino = dir_elem->ino; + list_del(&dir_elem->list); + kfree(dir_elem); ++ ++ btrfs_add_delayed_iput(curr_inode); ++ curr_inode = NULL; ++ ++ vfs_inode = btrfs_iget(fs_info->sb, ino, root); ++ if (IS_ERR(vfs_inode)) { ++ ret = PTR_ERR(vfs_inode); ++ break; ++ } ++ curr_inode = BTRFS_I(vfs_inode); + } + out: + btrfs_free_path(path); ++ if (curr_inode) ++ btrfs_add_delayed_iput(curr_inode); ++ + if (ret) { + struct btrfs_dir_list *next; + +diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c +index c6d592870400..03f52e4a20aa 100644 +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -395,7 +395,6 @@ void btrfs_free_device(struct btrfs_device *device) + { + WARN_ON(!list_empty(&device->post_commit_list)); + rcu_string_free(device->name); +- extent_io_tree_release(&device->alloc_state); + btrfs_destroy_dev_zone_info(device); + kfree(device); + } +@@ -1150,10 +1149,10 @@ static void btrfs_close_one_device(struct btrfs_device *device) + device->last_flush_error = 0; + + /* Verify the device is back in a pristine state */ +- ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); +- ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); +- ASSERT(list_empty(&device->dev_alloc_list)); +- ASSERT(list_empty(&device->post_commit_list)); 
++ WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); ++ WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); ++ WARN_ON(!list_empty(&device->dev_alloc_list)); ++ WARN_ON(!list_empty(&device->post_commit_list)); + } + + static void close_fs_devices(struct btrfs_fs_devices *fs_devices) +@@ -2618,7 +2617,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path + struct block_device *bdev; + struct super_block *sb = fs_info->sb; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; +- struct btrfs_fs_devices *seed_devices; ++ struct btrfs_fs_devices *seed_devices = NULL; + u64 orig_super_total_bytes; + u64 orig_super_num_devices; + int ret = 0; +@@ -5125,7 +5124,7 @@ static void init_alloc_chunk_ctl_policy_regular( + /* We don't want a chunk larger than 10% of writable space */ + ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10), + ctl->max_chunk_size); +- ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes; ++ ctl->dev_extent_min = ctl->dev_stripes << BTRFS_STRIPE_LEN_SHIFT; + } + + static void init_alloc_chunk_ctl_policy_zoned( +@@ -5407,7 +5406,6 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, + j * ctl->stripe_size; + } + } +- map->stripe_len = BTRFS_STRIPE_LEN; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; + map->type = type; +@@ -5438,7 +5436,7 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, + } + write_unlock(&em_tree->lock); + +- block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size); ++ block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); + if (IS_ERR(block_group)) + goto error_del_extent; + +@@ -5615,11 +5613,11 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, + + btrfs_set_stack_chunk_length(chunk, bg->length); + btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID); +- btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); ++ btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN); + btrfs_set_stack_chunk_type(chunk, map->type); + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); +- btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); +- btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); ++ btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN); ++ btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN); + btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize); + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); + +@@ -5784,13 +5782,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) + */ + ret = map->num_stripes; + free_extent_map(em); +- +- down_read(&fs_info->dev_replace.rwsem); +- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && +- fs_info->dev_replace.tgtdev) +- ret++; +- up_read(&fs_info->dev_replace.rwsem); +- + return ret; + } + +@@ -5809,7 +5800,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, + if (!WARN_ON(IS_ERR(em))) { + map = em->map_lookup; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) +- len = map->stripe_len * nr_data_stripes(map); ++ len = nr_data_stripes(map) << BTRFS_STRIPE_LEN_SHIFT; + free_extent_map(em); + } + return len; +@@ -5895,41 +5886,16 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, + return preferred_mirror; + } + +-/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ +-static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) +-{ 
+- int i; +- int again = 1; +- +- while (again) { +- again = 0; +- for (i = 0; i < num_stripes - 1; i++) { +- /* Swap if parity is on a smaller index */ +- if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { +- swap(bioc->stripes[i], bioc->stripes[i + 1]); +- swap(bioc->raid_map[i], bioc->raid_map[i + 1]); +- again = 1; +- } +- } +- } +-} +- + static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, +- int total_stripes, +- int real_stripes) ++ u16 total_stripes) + { +- struct btrfs_io_context *bioc = kzalloc( ++ struct btrfs_io_context *bioc; ++ ++ bioc = kzalloc( + /* The size of btrfs_io_context */ + sizeof(struct btrfs_io_context) + + /* Plus the variable array for the stripes */ +- sizeof(struct btrfs_io_stripe) * (total_stripes) + +- /* Plus the variable array for the tgt dev */ +- sizeof(int) * (real_stripes) + +- /* +- * Plus the raid_map, which includes both the tgt dev +- * and the stripes. +- */ +- sizeof(u64) * (total_stripes), ++ sizeof(struct btrfs_io_stripe) * (total_stripes), + GFP_NOFS); + + if (!bioc) +@@ -5938,8 +5904,8 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ + refcount_set(&bioc->refs, 1); + + bioc->fs_info = fs_info; +- bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); +- bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); ++ bioc->replace_stripe_src = -1; ++ bioc->full_stripe_logical = (u64)-1; + + return bioc; + } +@@ -5971,16 +5937,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + struct btrfs_discard_stripe *stripes; + u64 length = *length_ret; + u64 offset; +- u64 stripe_nr; +- u64 stripe_nr_end; ++ u32 stripe_nr; ++ u32 stripe_nr_end; ++ u32 stripe_cnt; + u64 stripe_end_offset; +- u64 stripe_cnt; +- u64 stripe_len; + u64 stripe_offset; + u32 stripe_index; + u32 factor = 0; + u32 sub_stripes = 0; +- u64 stripes_per_dev = 0; ++ u32 stripes_per_dev = 0; + u32 remaining_stripes = 0; + u32 last_stripe = 0; + int ret; +@@ -5996,26 +5961,25 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EOPNOTSUPP; + goto out_free_map; +-} ++ } + + offset = logical - em->start; + length = min_t(u64, em->start + em->len - logical, length); + *length_ret = length; + +- stripe_len = map->stripe_len; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ +- stripe_nr = div64_u64(offset, stripe_len); ++ stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; + + /* stripe_offset is the offset of this block in its stripe */ +- stripe_offset = offset - stripe_nr * stripe_len; ++ stripe_offset = offset - (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + +- stripe_nr_end = round_up(offset + length, map->stripe_len); +- stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); ++ stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >> ++ BTRFS_STRIPE_LEN_SHIFT; + stripe_cnt = stripe_nr_end - stripe_nr; +- stripe_end_offset = stripe_nr_end * map->stripe_len - ++ stripe_end_offset = (stripe_nr_end << BTRFS_STRIPE_LEN_SHIFT) - + (offset + length); + /* + * after this, stripe_nr is the number of stripes on this +@@ -6034,18 +5998,19 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + factor = map->num_stripes / sub_stripes; + *num_stripes = min_t(u64, map->num_stripes, + sub_stripes * stripe_cnt); +- stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); ++ stripe_index = stripe_nr % factor; ++ stripe_nr /= factor; + 
stripe_index *= sub_stripes; +- stripes_per_dev = div_u64_rem(stripe_cnt, factor, +- &remaining_stripes); +- div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); +- last_stripe *= sub_stripes; ++ ++ remaining_stripes = stripe_cnt % factor; ++ stripes_per_dev = stripe_cnt / factor; ++ last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | + BTRFS_BLOCK_GROUP_DUP)) { + *num_stripes = map->num_stripes; + } else { +- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, +- &stripe_index); ++ stripe_index = stripe_nr % map->num_stripes; ++ stripe_nr /= map->num_stripes; + } + + stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); +@@ -6057,15 +6022,15 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + for (i = 0; i < *num_stripes; i++) { + stripes[i].physical = + map->stripes[stripe_index].physical + +- stripe_offset + stripe_nr * map->stripe_len; ++ stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { +- stripes[i].length = stripes_per_dev * map->stripe_len; ++ stripes[i].length = stripes_per_dev << BTRFS_STRIPE_LEN_SHIFT; + + if (i / sub_stripes < remaining_stripes) +- stripes[i].length += map->stripe_len; ++ stripes[i].length += BTRFS_STRIPE_LEN; + + /* + * Special for the first stripe and +@@ -6103,83 +6068,6 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + return ERR_PTR(ret); + } + +-/* +- * In dev-replace case, for repair case (that's the only case where the mirror +- * is selected explicitly when calling btrfs_map_block), blocks left of the +- * left cursor can also be read from the target drive. +- * +- * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the +- * array of stripes. +- * For READ, it also needs to be supported using the same mirror number. +- * +- * If the requested block is not left of the left cursor, EIO is returned. This +- * can happen because btrfs_num_copies() returns one more in the dev-replace +- * case. +- */ +-static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, +- u64 logical, u64 length, +- u64 srcdev_devid, int *mirror_num, +- u64 *physical) +-{ +- struct btrfs_io_context *bioc = NULL; +- int num_stripes; +- int index_srcdev = 0; +- int found = 0; +- u64 physical_of_found = 0; +- int i; +- int ret = 0; +- +- ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, +- logical, &length, &bioc, NULL, NULL, 0); +- if (ret) { +- ASSERT(bioc == NULL); +- return ret; +- } +- +- num_stripes = bioc->num_stripes; +- if (*mirror_num > num_stripes) { +- /* +- * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, +- * that means that the requested area is not left of the left +- * cursor +- */ +- btrfs_put_bioc(bioc); +- return -EIO; +- } +- +- /* +- * process the rest of the function using the mirror_num of the source +- * drive. Therefore look it up first. At the end, patch the device +- * pointer to the one of the target drive. 
+- */ +- for (i = 0; i < num_stripes; i++) { +- if (bioc->stripes[i].dev->devid != srcdev_devid) +- continue; +- +- /* +- * In case of DUP, in order to keep it simple, only add the +- * mirror with the lowest physical address +- */ +- if (found && +- physical_of_found <= bioc->stripes[i].physical) +- continue; +- +- index_srcdev = i; +- found = 1; +- physical_of_found = bioc->stripes[i].physical; +- } +- +- btrfs_put_bioc(bioc); +- +- ASSERT(found); +- if (!found) +- return -EIO; +- +- *mirror_num = index_srcdev + 1; +- *physical = physical_of_found; +- return ret; +-} +- + static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) + { + struct btrfs_block_group *cache; +@@ -6198,101 +6086,80 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) + } + + static void handle_ops_on_dev_replace(enum btrfs_map_op op, +- struct btrfs_io_context **bioc_ret, ++ struct btrfs_io_context *bioc, + struct btrfs_dev_replace *dev_replace, + u64 logical, + int *num_stripes_ret, int *max_errors_ret) + { +- struct btrfs_io_context *bioc = *bioc_ret; + u64 srcdev_devid = dev_replace->srcdev->devid; +- int tgtdev_indexes = 0; ++ /* ++ * At this stage, num_stripes is still the real number of stripes, ++ * excluding the duplicated stripes. ++ */ + int num_stripes = *num_stripes_ret; ++ int nr_extra_stripes = 0; + int max_errors = *max_errors_ret; + int i; + +- if (op == BTRFS_MAP_WRITE) { +- int index_where_to_add; ++ /* ++ * A block group which has "to_copy" set will eventually be copied by ++ * the dev-replace process. We can avoid cloning IO here. ++ */ ++ if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) ++ return; + +- /* +- * A block group which have "to_copy" set will eventually +- * copied by dev-replace process. We can avoid cloning IO here. +- */ +- if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical)) +- return; ++ /* ++ * Duplicate the write operations while the dev-replace procedure is ++ * running. Since the copying of the old disk to the new disk takes ++ * place at run time while the filesystem is mounted writable, the ++ * regular write operations to the old disk have to be duplicated to go ++ * to the new disk as well. ++ * ++ * Note that device->missing is handled by the caller, and that the ++ * write to the old disk is already set up in the stripes array. ++ */ ++ for (i = 0; i < num_stripes; i++) { ++ struct btrfs_io_stripe *old = &bioc->stripes[i]; ++ struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes]; + +- /* +- * duplicate the write operations while the dev replace +- * procedure is running. Since the copying of the old disk to +- * the new disk takes place at run time while the filesystem is +- * mounted writable, the regular write operations to the old +- * disk have to be duplicated to go to the new disk as well. +- * +- * Note that device->missing is handled by the caller, and that +- * the write to the old disk is already set up in the stripes +- * array. 
+- */ +- index_where_to_add = num_stripes; +- for (i = 0; i < num_stripes; i++) { +- if (bioc->stripes[i].dev->devid == srcdev_devid) { +- /* write to new disk, too */ +- struct btrfs_io_stripe *new = +- bioc->stripes + index_where_to_add; +- struct btrfs_io_stripe *old = +- bioc->stripes + i; +- +- new->physical = old->physical; +- new->dev = dev_replace->tgtdev; +- bioc->tgtdev_map[i] = index_where_to_add; +- index_where_to_add++; +- max_errors++; +- tgtdev_indexes++; +- } +- } +- num_stripes = index_where_to_add; +- } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { +- int index_srcdev = 0; +- int found = 0; +- u64 physical_of_found = 0; ++ if (old->dev->devid != srcdev_devid) ++ continue; + +- /* +- * During the dev-replace procedure, the target drive can also +- * be used to read data in case it is needed to repair a corrupt +- * block elsewhere. This is possible if the requested area is +- * left of the left cursor. In this area, the target drive is a +- * full copy of the source drive. +- */ +- for (i = 0; i < num_stripes; i++) { +- if (bioc->stripes[i].dev->devid == srcdev_devid) { +- /* +- * In case of DUP, in order to keep it simple, +- * only add the mirror with the lowest physical +- * address +- */ +- if (found && +- physical_of_found <= bioc->stripes[i].physical) +- continue; +- index_srcdev = i; +- found = 1; +- physical_of_found = bioc->stripes[i].physical; +- } +- } +- if (found) { +- struct btrfs_io_stripe *tgtdev_stripe = +- bioc->stripes + num_stripes; ++ new->physical = old->physical; ++ new->dev = dev_replace->tgtdev; ++ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) ++ bioc->replace_stripe_src = i; ++ nr_extra_stripes++; ++ } ++ ++ /* We can only have at most 2 extra nr_stripes (for DUP). */ ++ ASSERT(nr_extra_stripes <= 2); ++ /* ++ * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for ++ * replace. ++ * If we have 2 extra stripes, only choose the one with smaller physical. ++ */ ++ if (op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) { ++ struct btrfs_io_stripe *first = &bioc->stripes[num_stripes]; ++ struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; + +- tgtdev_stripe->physical = physical_of_found; +- tgtdev_stripe->dev = dev_replace->tgtdev; +- bioc->tgtdev_map[index_srcdev] = num_stripes; ++ /* Only DUP can have two extra stripes. */ ++ ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); + +- tgtdev_indexes++; +- num_stripes++; ++ /* ++ * Swap the last stripe stripes and reduce @nr_extra_stripes. ++ * The extra stripe would still be there, but won't be accessed. ++ */ ++ if (first->physical > second->physical) { ++ swap(second->physical, first->physical); ++ swap(second->dev, first->dev); ++ nr_extra_stripes--; + } + } + +- *num_stripes_ret = num_stripes; +- *max_errors_ret = max_errors; +- bioc->num_tgtdevs = tgtdev_indexes; +- *bioc_ret = bioc; ++ *num_stripes_ret = num_stripes + nr_extra_stripes; ++ *max_errors_ret = max_errors + nr_extra_stripes; ++ bioc->replace_nr_stripes = nr_extra_stripes; + } + + static bool need_full_stripe(enum btrfs_map_op op) +@@ -6301,25 +6168,35 @@ static bool need_full_stripe(enum btrfs_map_op op) + } + + static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, +- u64 offset, u64 *stripe_nr, u64 *stripe_offset, ++ u64 offset, u32 *stripe_nr, u64 *stripe_offset, + u64 *full_stripe_start) + { +- u32 stripe_len = map->stripe_len; +- + ASSERT(op != BTRFS_MAP_DISCARD); + + /* + * Stripe_nr is the stripe where this block falls. 
stripe_offset is + * the offset of this block in its stripe. + */ +- *stripe_nr = div64_u64_rem(offset, stripe_len, stripe_offset); ++ *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; ++ *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; + ASSERT(*stripe_offset < U32_MAX); + + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { +- unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); ++ unsigned long full_stripe_len = nr_data_stripes(map) << ++ BTRFS_STRIPE_LEN_SHIFT; + ++ /* ++ * For full stripe start, we use previously calculated ++ * @stripe_nr. Align it to nr_data_stripes, then multiply with ++ * STRIPE_LEN. ++ * ++ * By this we can avoid u64 division completely. And we have ++ * to go rounddown(), not round_down(), as nr_data_stripes is ++ * not ensured to be power of 2. ++ */ + *full_stripe_start = +- div64_u64(offset, full_stripe_len) * full_stripe_len; ++ rounddown(*stripe_nr, nr_data_stripes(map)) << ++ BTRFS_STRIPE_LEN_SHIFT; + + /* + * For writes to RAID56, allow to write a full stripe set, but +@@ -6334,16 +6211,16 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, + * a single disk). + */ + if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) +- return stripe_len - *stripe_offset; ++ return BTRFS_STRIPE_LEN - *stripe_offset; + return U64_MAX; + } + + static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, +- u32 stripe_index, u64 stripe_offset, u64 stripe_nr) ++ u32 stripe_index, u64 stripe_offset, u32 stripe_nr) + { + dst->dev = map->stripes[stripe_index].dev; + dst->physical = map->stripes[stripe_index].physical + +- stripe_offset + stripe_nr * map->stripe_len; ++ stripe_offset + (stripe_nr << BTRFS_STRIPE_LEN_SHIFT); + } + + int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, +@@ -6356,35 +6233,35 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + struct map_lookup *map; + u64 map_offset; + u64 stripe_offset; +- u64 stripe_nr; +- u64 stripe_len; ++ u32 stripe_nr; + u32 stripe_index; + int data_stripes; + int i; + int ret = 0; + int mirror_num = (mirror_num_ret ? 
*mirror_num_ret : 0); + int num_stripes; ++ int num_copies; + int max_errors = 0; +- int tgtdev_indexes = 0; + struct btrfs_io_context *bioc = NULL; + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + int dev_replace_is_ongoing = 0; +- int num_alloc_stripes; +- int patch_the_first_stripe_for_dev_replace = 0; +- u64 physical_to_patch_in_first_stripe = 0; ++ u16 num_alloc_stripes; + u64 raid56_full_stripe_start = (u64)-1; + u64 max_len; + + ASSERT(bioc_ret); + ASSERT(op != BTRFS_MAP_DISCARD); + ++ num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); ++ if (mirror_num > num_copies) ++ return -EINVAL; ++ + em = btrfs_get_chunk_map(fs_info, logical, *length); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + data_stripes = nr_data_stripes(map); +- stripe_len = map->stripe_len; + + map_offset = logical - em->start; + max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, +@@ -6400,25 +6277,11 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + if (!dev_replace_is_ongoing) + up_read(&dev_replace->rwsem); + +- if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && +- !need_full_stripe(op) && dev_replace->tgtdev != NULL) { +- ret = get_extra_mirror_from_replace(fs_info, logical, *length, +- dev_replace->srcdev->devid, +- &mirror_num, +- &physical_to_patch_in_first_stripe); +- if (ret) +- goto out; +- else +- patch_the_first_stripe_for_dev_replace = 1; +- } else if (mirror_num > map->num_stripes) { +- mirror_num = 0; +- } +- + num_stripes = 1; + stripe_index = 0; + if (map->type & BTRFS_BLOCK_GROUP_RAID0) { +- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, +- &stripe_index); ++ stripe_index = stripe_nr % map->num_stripes; ++ stripe_nr /= map->num_stripes; + if (!need_full_stripe(op)) + mirror_num = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { +@@ -6444,8 +6307,8 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + u32 factor = map->num_stripes / map->sub_stripes; + +- stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); +- stripe_index *= map->sub_stripes; ++ stripe_index = (stripe_nr % factor) * map->sub_stripes; ++ stripe_nr /= factor; + + if (need_full_stripe(op)) + num_stripes = map->sub_stripes; +@@ -6460,11 +6323,17 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + } + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { +- ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); + if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { +- /* push stripe_nr back to the start of the full stripe */ +- stripe_nr = div64_u64(raid56_full_stripe_start, +- stripe_len * data_stripes); ++ /* ++ * Push stripe_nr back to the start of the full stripe ++ * For those cases needing a full stripe, @stripe_nr ++ * is the full stripe number. ++ * ++ * Originally we go raid56_full_stripe_start / full_stripe_len, ++ * but that can be expensive. Here we just divide ++ * @stripe_nr with @data_stripes. ++ */ ++ stripe_nr /= data_stripes; + + /* RAID[56] write or recovery. 
Return all stripes */ + num_stripes = map->num_stripes; +@@ -6473,7 +6342,7 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + /* Return the length to the full stripe end */ + *length = min(logical + *length, + raid56_full_stripe_start + em->start + +- data_stripes * stripe_len) - logical; ++ (data_stripes << BTRFS_STRIPE_LEN_SHIFT)) - logical; + stripe_index = 0; + stripe_offset = 0; + } else { +@@ -6482,25 +6351,24 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + * Mirror #2 is RAID5 parity block. + * Mirror #3 is RAID6 Q block. + */ +- stripe_nr = div_u64_rem(stripe_nr, +- data_stripes, &stripe_index); ++ stripe_index = stripe_nr % data_stripes; ++ stripe_nr /= data_stripes; + if (mirror_num > 1) + stripe_index = data_stripes + mirror_num - 2; + + /* We distribute the parity blocks across stripes */ +- div_u64_rem(stripe_nr + stripe_index, map->num_stripes, +- &stripe_index); ++ stripe_index = (stripe_nr + stripe_index) % map->num_stripes; + if (!need_full_stripe(op) && mirror_num <= 1) + mirror_num = 1; + } + } else { + /* +- * after this, stripe_nr is the number of stripes on this ++ * After this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array + */ +- stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, +- &stripe_index); ++ stripe_index = stripe_nr % map->num_stripes; ++ stripe_nr /= map->num_stripes; + mirror_num = stripe_index + 1; + } + if (stripe_index >= map->num_stripes) { +@@ -6512,13 +6380,16 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + } + + num_alloc_stripes = num_stripes; +- if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { +- if (op == BTRFS_MAP_WRITE) +- num_alloc_stripes <<= 1; +- if (op == BTRFS_MAP_GET_READ_MIRRORS) +- num_alloc_stripes++; +- tgtdev_indexes = num_stripes; +- } ++ if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && ++ op != BTRFS_MAP_READ) ++ /* ++ * For replace case, we need to add extra stripes for extra ++ * duplicated stripes. ++ * ++ * For both WRITE and GET_READ_MIRRORS, we may have at most ++ * 2 more stripes (DUP types, otherwise 1). 
++ */ ++ num_alloc_stripes += 2; + + /* + * If this I/O maps to a single device, try to return the device and +@@ -6529,53 +6400,53 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && + (!need_full_stripe(op) || !dev_replace_is_ongoing || + !dev_replace->tgtdev)) { +- if (patch_the_first_stripe_for_dev_replace) { +- smap->dev = dev_replace->tgtdev; +- smap->physical = physical_to_patch_in_first_stripe; +- *mirror_num_ret = map->num_stripes + 1; +- } else { +- set_io_stripe(smap, map, stripe_index, stripe_offset, +- stripe_nr); +- *mirror_num_ret = mirror_num; +- } ++ set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); ++ *mirror_num_ret = mirror_num; + *bioc_ret = NULL; + ret = 0; + goto out; + } + +- bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); ++ bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes); + if (!bioc) { + ret = -ENOMEM; + goto out; + } ++ bioc->map_type = map->type; + +- for (i = 0; i < num_stripes; i++) { +- set_io_stripe(&bioc->stripes[i], map, stripe_index, stripe_offset, +- stripe_nr); +- stripe_index++; +- } +- +- /* Build raid_map */ ++ /* ++ * For RAID56 full map, we need to make sure the stripes[] follows the ++ * rule that data stripes are all ordered, then followed with P and Q ++ * (if we have). ++ * ++ * It's still mostly the same as other profiles, just with extra rotation. ++ */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && + (need_full_stripe(op) || mirror_num > 1)) { +- u64 tmp; +- unsigned rot; +- +- /* Work out the disk rotation on this stripe-set */ +- div_u64_rem(stripe_nr, num_stripes, &rot); +- +- /* Fill in the logical address of each stripe */ +- tmp = stripe_nr * data_stripes; +- for (i = 0; i < data_stripes; i++) +- bioc->raid_map[(i + rot) % num_stripes] = +- em->start + (tmp + i) * map->stripe_len; +- +- bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; +- if (map->type & BTRFS_BLOCK_GROUP_RAID6) +- bioc->raid_map[(i + rot + 1) % num_stripes] = +- RAID6_Q_STRIPE; +- +- sort_parity_stripes(bioc, num_stripes); ++ /* ++ * For RAID56 @stripe_nr is already the number of full stripes ++ * before us, which is also the rotation value (needs to modulo ++ * with num_stripes). ++ * ++ * In this case, we just add @stripe_nr with @i, then do the ++ * modulo, to reduce one modulo call. ++ */ ++ bioc->full_stripe_logical = em->start + ++ ((stripe_nr * data_stripes) << BTRFS_STRIPE_LEN_SHIFT); ++ for (i = 0; i < num_stripes; i++) ++ set_io_stripe(&bioc->stripes[i], map, ++ (i + stripe_nr) % num_stripes, ++ stripe_offset, stripe_nr); ++ } else { ++ /* ++ * For all other non-RAID56 profiles, just copy the target ++ * stripe into the bioc. 
++ */ ++ for (i = 0; i < num_stripes; i++) { ++ set_io_stripe(&bioc->stripes[i], map, stripe_index, ++ stripe_offset, stripe_nr); ++ stripe_index++; ++ } + } + + if (need_full_stripe(op)) +@@ -6583,27 +6454,15 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + need_full_stripe(op)) { +- handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, ++ handle_ops_on_dev_replace(op, bioc, dev_replace, logical, + &num_stripes, &max_errors); + } + + *bioc_ret = bioc; +- bioc->map_type = map->type; + bioc->num_stripes = num_stripes; + bioc->max_errors = max_errors; + bioc->mirror_num = mirror_num; + +- /* +- * this is the case that REQ_READ && dev_replace_is_ongoing && +- * mirror_num == num_stripes + 1 && dev_replace target drive is +- * available as a mirror +- */ +- if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { +- WARN_ON(num_stripes > 1); +- bioc->stripes[0].dev = dev_replace->tgtdev; +- bioc->stripes[0].physical = physical_to_patch_in_first_stripe; +- bioc->mirror_num = map->num_stripes + 1; +- } + out: + if (dev_replace_is_ongoing) { + lockdep_assert_held(&dev_replace->rwsem); +@@ -6941,7 +6800,6 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, + map->num_stripes = num_stripes; + map->io_width = btrfs_chunk_io_width(leaf, chunk); + map->io_align = btrfs_chunk_io_align(leaf, chunk); +- map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + map->type = type; + /* + * We can't use the sub_stripes value, as for profiles other than +@@ -8161,3 +8019,76 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) + + return true; + } ++ ++static void map_raid56_repair_block(struct btrfs_io_context *bioc, ++ struct btrfs_io_stripe *smap, ++ u64 logical) ++{ ++ int data_stripes = nr_bioc_data_stripes(bioc); ++ int i; ++ ++ for (i = 0; i < data_stripes; i++) { ++ u64 stripe_start = bioc->full_stripe_logical + ++ (i << BTRFS_STRIPE_LEN_SHIFT); ++ ++ if (logical >= stripe_start && ++ logical < stripe_start + BTRFS_STRIPE_LEN) ++ break; ++ } ++ ASSERT(i < data_stripes); ++ smap->dev = bioc->stripes[i].dev; ++ smap->physical = bioc->stripes[i].physical + ++ ((logical - bioc->full_stripe_logical) & ++ BTRFS_STRIPE_LEN_MASK); ++} ++ ++/* ++ * Map a repair write into a single device. ++ * ++ * A repair write is triggered by read time repair or scrub, which would only ++ * update the contents of a single device. ++ * Not update any other mirrors nor go through RMW path. ++ * ++ * Callers should ensure: ++ * ++ * - Call btrfs_bio_counter_inc_blocked() first ++ * - The range does not cross stripe boundary ++ * - Has a valid @mirror_num passed in. ++ */ ++int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, ++ struct btrfs_io_stripe *smap, u64 logical, ++ u32 length, int mirror_num) ++{ ++ struct btrfs_io_context *bioc = NULL; ++ u64 map_length = length; ++ int mirror_ret = mirror_num; ++ int ret; ++ ++ ASSERT(mirror_num > 0); ++ ++ ret = __btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, ++ &bioc, smap, &mirror_ret, true); ++ if (ret < 0) ++ return ret; ++ ++ /* The map range should not cross stripe boundary. */ ++ ASSERT(map_length >= length); ++ ++ /* Already mapped to single stripe. */ ++ if (!bioc) ++ goto out; ++ ++ /* Map the RAID56 multi-stripe writes to a single one. 
*/ ++ if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ++ map_raid56_repair_block(bioc, smap, logical); ++ goto out; ++ } ++ ++ ASSERT(mirror_num <= bioc->num_stripes); ++ smap->dev = bioc->stripes[mirror_num - 1].dev; ++ smap->physical = bioc->stripes[mirror_num - 1].physical; ++out: ++ btrfs_put_bioc(bioc); ++ ASSERT(smap->dev); ++ return 0; ++} +diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h +index 7e51f2238f72..bf47a1a70813 100644 +--- a/fs/btrfs/volumes.h ++++ b/fs/btrfs/volumes.h +@@ -17,7 +17,11 @@ + + extern struct mutex uuid_mutex; + +-#define BTRFS_STRIPE_LEN SZ_64K ++#define BTRFS_STRIPE_LEN SZ_64K ++#define BTRFS_STRIPE_LEN_SHIFT (16) ++#define BTRFS_STRIPE_LEN_MASK (BTRFS_STRIPE_LEN - 1) ++ ++static_assert(const_ilog2(BTRFS_STRIPE_LEN) == BTRFS_STRIPE_LEN_SHIFT); + + /* Used by sanity check for btrfs_raid_types. */ + #define const_ffs(n) (__builtin_ctzll(n) + 1) +@@ -404,17 +408,74 @@ struct btrfs_io_context { + u64 map_type; /* get from map_lookup->type */ + struct bio *orig_bio; + atomic_t error; +- int max_errors; +- int num_stripes; +- int mirror_num; +- int num_tgtdevs; +- int *tgtdev_map; ++ u16 max_errors; ++ ++ /* ++ * The total number of stripes, including the extra duplicated ++ * stripe for replace. ++ */ ++ u16 num_stripes; ++ ++ /* ++ * The mirror_num of this bioc. ++ * ++ * This is for reads which use 0 as mirror_num, thus we should return a ++ * valid mirror_num (>0) for the reader. ++ */ ++ u16 mirror_num; ++ ++ /* ++ * The following two members are for dev-replace case only. ++ * ++ * @replace_nr_stripes: Number of duplicated stripes which need to be ++ * written to replace target. ++ * Should be <= 2 (2 for DUP, otherwise <= 1). ++ * @replace_stripe_src: The array indicates where the duplicated stripes ++ * are from. ++ * ++ * The @replace_stripe_src[] array is mostly for RAID56 cases. ++ * As non-RAID56 stripes share the same contents of the mapped range, ++ * thus no need to bother where the duplicated ones are from. ++ * ++ * But for RAID56 case, all stripes contain different contents, thus ++ * we need a way to know the mapping. ++ * ++ * There is an example for the two members, using a RAID5 write: ++ * ++ * num_stripes: 4 (3 + 1 duplicated write) ++ * stripes[0]: dev = devid 1, physical = X ++ * stripes[1]: dev = devid 2, physical = Y ++ * stripes[2]: dev = devid 3, physical = Z ++ * stripes[3]: dev = devid 0, physical = Y ++ * ++ * replace_nr_stripes = 1 ++ * replace_stripe_src = 1 <- Means stripes[1] is involved in replace. ++ * The duplicated stripe index would be ++ * (@num_stripes - 1). ++ * ++ * Note, that we can still have cases replace_nr_stripes = 2 for DUP. ++ * In that case, all stripes share the same content, thus we don't ++ * need to bother @replace_stripe_src value at all. ++ */ ++ u16 replace_nr_stripes; ++ s16 replace_stripe_src; + /* +- * logical block numbers for the start of each stripe +- * The last one or two are p/q. These are sorted, +- * so raid_map[0] is the start of our full stripe ++ * Logical bytenr of the full stripe start, only for RAID56 cases. ++ * ++ * When this value is set to other than (u64)-1, the stripes[] should ++ * follow this pattern: ++ * ++ * (real_stripes = num_stripes - replace_nr_stripes) ++ * (data_stripes = (is_raid6) ? (real_stripes - 2) : (real_stripes - 1)) ++ * ++ * stripes[0]: The first data stripe ++ * stripes[1]: The second data stripe ++ * ... 
++ * stripes[data_stripes - 1]: The last data stripe ++ * stripes[data_stripes]: The P stripe ++ * stripes[data_stripes + 1]: The Q stripe (only for RAID6). + */ +- u64 *raid_map; ++ u64 full_stripe_logical; + struct btrfs_io_stripe stripes[]; + }; + +@@ -446,7 +507,6 @@ struct map_lookup { + u64 type; + int io_align; + int io_width; +- u32 stripe_len; + int num_stripes; + int sub_stripes; + int verified_stripes; /* For mount time dev extent verification */ +@@ -527,6 +587,9 @@ int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map); ++int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, ++ struct btrfs_io_stripe *smap, u64 logical, ++ u32 length, int mirror_num); + struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes); +diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c +index da7bb9187b68..8acb05e176c5 100644 +--- a/fs/btrfs/zlib.c ++++ b/fs/btrfs/zlib.c +@@ -350,8 +350,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + zlib_inflateEnd(&workspace->strm); + if (data_in) + kunmap_local(data_in); +- if (!ret) +- zero_fill_bio(cb->orig_bio); + return ret; + } + +diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c +index 45d04092f2f8..a9b32ba6b2ce 100644 +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -1640,14 +1640,14 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio) + { + u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); + struct btrfs_inode *inode = bbio->inode; +- struct btrfs_fs_info *fs_info = inode->root->fs_info; ++ struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_block_group *cache; + bool ret = false; + + if (!btrfs_is_zoned(fs_info)) + return false; + +- if (!is_data_inode(&inode->vfs_inode)) ++ if (!inode || !is_data_inode(&inode->vfs_inode)) + return false; + + if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE) +diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c +index e34f1ab99d56..f798da267590 100644 +--- a/fs/btrfs/zstd.c ++++ b/fs/btrfs/zstd.c +@@ -609,7 +609,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) + } + } + ret = 0; +- zero_fill_bio(cb->orig_bio); + done: + if (workspace->in_buf.src) + kunmap_local(workspace->in_buf.src); +diff --git a/include/linux/bio.h b/include/linux/bio.h +index d766be7152e1..b3e7529ff55e 100644 +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -500,6 +500,7 @@ void bio_associate_blkg(struct bio *bio); + void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css); + void bio_clone_blkg_association(struct bio *dst, struct bio *src); ++void blkcg_punt_bio_submit(struct bio *bio); + #else /* CONFIG_BLK_CGROUP */ + static inline void bio_associate_blkg(struct bio *bio) { } + static inline void bio_associate_blkg_from_css(struct bio *bio, +@@ -507,6 +508,10 @@ static inline void bio_associate_blkg_from_css(struct bio *bio, + { } + static inline void bio_clone_blkg_association(struct bio *dst, + struct bio *src) { } ++static inline void blkcg_punt_bio_submit(struct bio *bio) ++{ ++ submit_bio(bio); ++} + #endif /* CONFIG_BLK_CGROUP */ + + static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) +diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h +index 99be590f952f..fb8843990d28 100644 +--- a/include/linux/blk_types.h ++++ b/include/linux/blk_types.h +@@ -404,18 +404,11 @@ enum req_flag_bits { + 
__REQ_RAHEAD, /* read ahead, can fail anytime */ + __REQ_BACKGROUND, /* background IO */ + __REQ_NOWAIT, /* Don't wait if request will block */ +- /* +- * When a shared kthread needs to issue a bio for a cgroup, doing +- * so synchronously can lead to priority inversions as the kthread +- * can be trapped waiting for that cgroup. CGROUP_PUNT flag makes +- * submit_bio() punt the actual issuing to a dedicated per-blkcg +- * work item to avoid such priority inversions. +- */ +- __REQ_CGROUP_PUNT, + __REQ_POLLED, /* caller polls for completion using bio_poll */ + __REQ_ALLOC_CACHE, /* allocate IO from cache if available */ + __REQ_SWAP, /* swap I/O */ + __REQ_DRV, /* for driver use */ ++ __REQ_FS_PRIVATE, /* for file system (submitter) use */ + + /* + * Command specific flags, keep last: +@@ -443,14 +436,13 @@ enum req_flag_bits { + #define REQ_RAHEAD (__force blk_opf_t)(1ULL << __REQ_RAHEAD) + #define REQ_BACKGROUND (__force blk_opf_t)(1ULL << __REQ_BACKGROUND) + #define REQ_NOWAIT (__force blk_opf_t)(1ULL << __REQ_NOWAIT) +-#define REQ_CGROUP_PUNT (__force blk_opf_t)(1ULL << __REQ_CGROUP_PUNT) +- +-#define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) + #define REQ_POLLED (__force blk_opf_t)(1ULL << __REQ_POLLED) + #define REQ_ALLOC_CACHE (__force blk_opf_t)(1ULL << __REQ_ALLOC_CACHE) +- +-#define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) + #define REQ_SWAP (__force blk_opf_t)(1ULL << __REQ_SWAP) ++#define REQ_DRV (__force blk_opf_t)(1ULL << __REQ_DRV) ++#define REQ_FS_PRIVATE (__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE) ++ ++#define REQ_NOUNMAP (__force blk_opf_t)(1ULL << __REQ_NOUNMAP) + + #define REQ_FAILFAST_MASK \ + (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) +diff --git a/include/linux/crc32c.h b/include/linux/crc32c.h +index bd21af828ff6..357ae4611a45 100644 +--- a/include/linux/crc32c.h ++++ b/include/linux/crc32c.h +@@ -5,7 +5,6 @@ + #include + + extern u32 crc32c(u32 crc, const void *address, unsigned int length); +-extern const char *crc32c_impl(void); + + /* This macro exists for backwards-compatibility. */ + #define crc32c_le crc32c +diff --git a/include/linux/writeback.h b/include/linux/writeback.h +index 46020373e155..fba937999fbf 100644 +--- a/include/linux/writeback.h ++++ b/include/linux/writeback.h +@@ -70,8 +70,6 @@ struct writeback_control { + */ + unsigned no_cgroup_owner:1; + +- unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */ +- + /* To enable batching of swap writes to non-block-device backends, + * "plug" can be set point to a 'struct swap_iocb *'. 
When all swap + * writes have been submitted, if with swap_iocb is not NULL, +@@ -97,9 +95,6 @@ static inline blk_opf_t wbc_to_write_flags(struct writeback_control *wbc) + { + blk_opf_t flags = 0; + +- if (wbc->punt_to_cgroup) +- flags = REQ_CGROUP_PUNT; +- + if (wbc->sync_mode == WB_SYNC_ALL) + flags |= REQ_SYNC; + else if (wbc->for_kupdate || wbc->for_background) +diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h +index 75d7d22c3a27..8ea9cea9bfeb 100644 +--- a/include/trace/events/btrfs.h ++++ b/include/trace/events/btrfs.h +@@ -2422,7 +2422,7 @@ DECLARE_EVENT_CLASS(btrfs_raid56_bio, + ), + + TP_fast_assign_btrfs(rbio->bioc->fs_info, +- __entry->full_stripe = rbio->bioc->raid_map[0]; ++ __entry->full_stripe = rbio->bioc->full_stripe_logical; + __entry->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + __entry->len = bio->bi_iter.bi_size; + __entry->opf = bio_op(bio); +diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h +index ada0a489bf2b..dbb8b96da50d 100644 +--- a/include/uapi/linux/btrfs.h ++++ b/include/uapi/linux/btrfs.h +@@ -187,6 +187,7 @@ struct btrfs_scrub_progress { + }; + + #define BTRFS_SCRUB_READONLY 1 ++#define BTRFS_SCRUB_SUPPORTED_FLAGS (BTRFS_SCRUB_READONLY) + struct btrfs_ioctl_scrub_args { + __u64 devid; /* in */ + __u64 start; /* in */ +diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c +index 5ca0d815a95d..649e687413a0 100644 +--- a/lib/libcrc32c.c ++++ b/lib/libcrc32c.c +@@ -65,12 +65,6 @@ static void __exit libcrc32c_mod_fini(void) + crypto_free_shash(tfm); + } + +-const char *crc32c_impl(void) +-{ +- return crypto_shash_driver_name(tfm); +-} +-EXPORT_SYMBOL(crc32c_impl); +- + module_init(libcrc32c_mod_init); + module_exit(libcrc32c_mod_fini); + +diff --git a/tools/objtool/check.c b/tools/objtool/check.c +index f937be1afe65..060032cfb046 100644 +--- a/tools/objtool/check.c ++++ b/tools/objtool/check.c +@@ -202,6 +202,7 @@ static bool __dead_end_function(struct objtool_file *file, struct symbol *func, + "__reiserfs_panic", + "__stack_chk_fail", + "__ubsan_handle_builtin_unreachable", ++ "btrfs_assertfail", + "cpu_bringup_and_idle", + "cpu_startup_entry", + "do_exit", +-- +2.40.1 + +From 0ad50219edceae27eb649c5fb76f2b8aebe27e3f Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 17 Apr 2023 18:32:06 +0200 -Subject: [PATCH 5/8] Implement amd-pstate guided driver +Subject: [PATCH 06/10] Implement amd-pstate guided driver Signed-off-by: Peter Jung --- @@ -9873,10 +24825,10 @@ index f5f22418e64b..c10ebf8c42e6 100644 -- 2.40.1 -From bf906393dd0d9e24858f3cfd6a9a5d890817cbf6 Mon Sep 17 00:00:00 2001 +From 3162c47812c5d8dac222403897b3c8f424648c6e Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 17 Apr 2023 18:28:52 +0200 -Subject: [PATCH 6/8] ksm +Subject: [PATCH 07/10] ksm Signed-off-by: Peter Jung --- @@ -10373,10 +25325,1492 @@ index 340125d08c03..36e756355f04 100644 -- 2.40.1 -From 2f73f41267f19f290a306fde77bc648cc321f8d6 Mon Sep 17 00:00:00 2001 +From 26780b606ac659096b0e1a9a2bba12aa747cbf66 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 28 Apr 2023 20:00:54 +0200 +Subject: [PATCH 08/10] Per-VMA locks +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +Previous versions: +v3: https://lore.kernel.org/all/20230216051750.3125598-1-surenb@google.com/ +v2: https://lore.kernel.org/lkml/20230127194110.533103-1-surenb@google.com/ +v1: https://lore.kernel.org/all/20230109205336.3665937-1-surenb@google.com/ +RFC: https://lore.kernel.org/all/20220901173516.702122-1-surenb@google.com/ + 
+LWN article describing the feature: +https://lwn.net/Articles/906852/ + +Per-vma locks idea that was discussed during SPF [1] discussion at LSF/MM +last year [2], which concluded with suggestion that “a reader/writer +semaphore could be put into the VMA itself; that would have the effect of +using the VMA as a sort of range lock. There would still be contention at +the VMA level, but it would be an improvement.” This patchset implements +this suggested approach. + +When handling page faults we lookup the VMA that contains the faulting +page under RCU protection and try to acquire its lock. If that fails we +fall back to using mmap_lock, similar to how SPF handled this situation. + +One notable way the implementation deviates from the proposal is the way +VMAs are read-locked. During some of mm updates, multiple VMAs need to be +locked until the end of the update (e.g. vma_merge, split_vma, etc). +Tracking all the locked VMAs, avoiding recursive locks, figuring out when +it's safe to unlock previously locked VMAs would make the code more +complex. So, instead of the usual lock/unlock pattern, the proposed +solution marks a VMA as locked and provides an efficient way to: +1. Identify locked VMAs. +2. Unlock all locked VMAs in bulk. +We also postpone unlocking the locked VMAs until the end of the update, +when we do mmap_write_unlock. Potentially this keeps a VMA locked for +longer than is absolutely necessary but it results in a big reduction of +code complexity. +Read-locking a VMA is done using two sequence numbers - one in the +vm_area_struct and one in the mm_struct. VMA is considered read-locked +when these sequence numbers are equal. To read-lock a VMA we set the +sequence number in vm_area_struct to be equal to the sequence number in +mm_struct. To unlock all VMAs we increment mm_struct's seq number. This +allows for an efficient way to track locked VMAs and to drop the locks on +all VMAs at the end of the update. + +The patchset implements per-VMA locking only for anonymous pages which +are not in swap and avoids userfaultfs as their implementation is more +complex. Additional support for file-back page faults, swapped and user +pages can be added incrementally. + +Performance benchmarks show similar although slightly smaller benefits as +with SPF patchset (~75% of SPF benefits). Still, with lower complexity +this approach might be more desirable. + +Since RFC was posted in September 2022, two separate Google teams outside +of Android evaluated the patchset and confirmed positive results. Here are +the known usecases when per-VMA locks show benefits: + +Android: +Apps with high number of threads (~100) launch times improve by up to 20%. +Each thread mmaps several areas upon startup (Stack and Thread-local +storage (TLS), thread signal stack, indirect ref table), which requires +taking mmap_lock in write mode. Page faults take mmap_lock in read mode. +During app launch, both thread creation and page faults establishing the +active workinget are happening in parallel and that causes lock contention +between mm writers and readers even if updates and page faults are +happening in different VMAs. Per-vma locks prevent this contention by +providing more granular lock. + +Google Fibers: +We have several dynamically sized thread pools that spawn new threads +under increased load and reduce their number when idling. For example, +Google's in-process scheduling/threading framework, UMCG/Fibers, is backed +by such a thread pool. 
When idling, only a small number of idle worker +threads are available; when a spike of incoming requests arrive, each +request is handled in its own "fiber", which is a work item posted onto a +UMCG worker thread; quite often these spikes lead to a number of new +threads spawning. Each new thread needs to allocate and register an RSEQ +section on its TLS, then register itself with the kernel as a UMCG worker +thread, and only after that it can be considered by the in-process +UMCG/Fiber scheduler as available to do useful work. In short, during an +incoming workload spike new threads have to be spawned, and they perform +several syscalls (RSEQ registration, UMCG worker registration, memory +allocations) before they can actually start doing useful work. Removing +any bottlenecks on this thread startup path will greatly improve our +services' latencies when faced with request/workload spikes. +At high scale, mmap_lock contention during thread creation and stack page +faults leads to user-visible multi-second serving latencies in a similar +pattern to Android app startup. Per-VMA locking patchset has been run +successfully in limited experiments with user-facing production workloads. +In these experiments, we observed that the peak thread creation rate was +high enough that thread creation is no longer a bottleneck. + +TCP zerocopy receive: +From the point of view of TCP zerocopy receive, the per-vma lock patch is +massively beneficial. +In today's implementation, a process with N threads where N - 1 are +performing zerocopy receive and 1 thread is performing madvise() with the +write lock taken (e.g. needs to change vm_flags) will result in all N -1 +receive threads blocking until the madvise is done. Conversely, on a busy +process receiving a lot of data, an madvise operation that does need to +take the mmap lock in write mode will need to wait for all of the receives +to be done - a lose:lose proposition. Per-VMA locking _removes_ by +definition this source of contention entirely. +There are other benefits for receive as well, chiefly a reduction in +cacheline bouncing across receiving threads for locking/unlocking the +single mmap lock. On an RPC style synthetic workload with 4KB RPCs: +1a) The find+lock+unlock VMA path in the base case, without the per-vma +lock patchset, is about 0.7% of cycles as measured by perf. +1b) mmap_read_lock + mmap_read_unlock in the base case is about 0.5% +cycles overall - most of this is within the TCP read hotpath (a small +fraction is 'other' usage in the system). +2a) The find+lock+unlock VMA path, with the per-vma patchset and a trivial +patch written to take advantage of it in TCP, is about 0.4% of cycles +(down from 0.7% above) +2b) mmap_read_lock + mmap_read_unlock in the per-vma patchset is < 0.1% +cycles and is out of the TCP read hotpath entirely (down from 0.5% before, +the remaining usage is the 'other' usage in the system). +So, in addition to entirely removing an onerous source of contention, it +also reduces the CPU cycles of TCP receive zerocopy by about 0.5%+ +(compared to overall cycles in perf) for the 'small' RPC scenario. 
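+
+To make the locking scheme described above concrete, here is a condensed
+user-space sketch of the sequence-number trick in plain C with pthreads.
+The toy_* names are illustrative only, and the sketch deliberately ignores
+RCU and memory ordering; see the real vma_start_read()/vma_start_write()/
+vma_end_write_all() added further down in this patch for the kernel version:
+
+/* Toy model of the per-VMA lock scheme (user space, not kernel code). */
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+struct toy_mm {
+	pthread_rwlock_t mmap_lock;	/* stands in for mm->mmap_lock */
+	int mm_lock_seq;		/* stands in for mm->mm_lock_seq */
+};
+
+struct toy_vma {
+	struct toy_mm *mm;
+	pthread_rwlock_t lock;		/* stands in for vma->vm_lock->lock */
+	int vm_lock_seq;		/* stands in for vma->vm_lock_seq */
+};
+
+/* Page-fault side: try to read-lock a single VMA, never touch mmap_lock. */
+static bool toy_vma_start_read(struct toy_vma *vma)
+{
+	if (vma->vm_lock_seq == vma->mm->mm_lock_seq)
+		return false;		/* marked write-locked, fall back */
+	if (pthread_rwlock_tryrdlock(&vma->lock) != 0)
+		return false;
+	if (vma->vm_lock_seq == vma->mm->mm_lock_seq) {
+		pthread_rwlock_unlock(&vma->lock);	/* raced with a writer */
+		return false;
+	}
+	return true;
+}
+
+static void toy_vma_end_read(struct toy_vma *vma)
+{
+	pthread_rwlock_unlock(&vma->lock);
+}
+
+/* Update side: caller holds mm->mmap_lock for writing. */
+static void toy_vma_start_write(struct toy_vma *vma)
+{
+	if (vma->vm_lock_seq == vma->mm->mm_lock_seq)
+		return;			/* already marked in this cycle */
+	pthread_rwlock_wrlock(&vma->lock);	/* wait for readers to drain */
+	vma->vm_lock_seq = vma->mm->mm_lock_seq;
+	pthread_rwlock_unlock(&vma->lock);
+}
+
+/* mmap_write_unlock() side: one increment unlocks all marked VMAs in bulk. */
+static void toy_vma_end_write_all(struct toy_mm *mm)
+{
+	mm->mm_lock_seq++;
+}
+
+int main(void)
+{
+	struct toy_mm mm = { .mm_lock_seq = 0 };
+	struct toy_vma vma = { .mm = &mm, .vm_lock_seq = -1 };
+
+	pthread_rwlock_init(&mm.mmap_lock, NULL);
+	pthread_rwlock_init(&vma.lock, NULL);
+
+	printf("fault before update: %d\n", toy_vma_start_read(&vma)); /* 1 */
+	toy_vma_end_read(&vma);
+
+	pthread_rwlock_wrlock(&mm.mmap_lock);		/* mmap_write_lock() */
+	toy_vma_start_write(&vma);
+	printf("fault during update: %d\n", toy_vma_start_read(&vma)); /* 0 */
+	toy_vma_end_write_all(&mm);
+	pthread_rwlock_unlock(&mm.mmap_lock);		/* mmap_write_unlock() */
+
+	printf("fault after update: %d\n", toy_vma_start_read(&vma)); /* 1 */
+	toy_vma_end_read(&vma);
+	return 0;
+}
+
+The real implementation additionally defers freeing of vm_area_structs via
+call_rcu() so that lock_vma_under_rcu() can walk the maple tree without
+holding mmap_lock; that part is not modelled in the sketch above.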
+ +The patchset structure is: +0001-0008: Enable maple-tree RCU mode +0009-0031: Main per-vma locks patchset +0032-0033: Performance optimizations + +Changes since v3: +- Changed patch [3] to move vma_prepare before vma_adjust_trans_huge +- Dropped patch [4] from the set as unnecessary, per Hyeonggon Yoo +- Changed patch [5] to do VMA locking inside vma_prepare, per Liam Howlett +- Dropped patch [6] from the set as unnecessary, per Liam Howlett + +[1] https://lore.kernel.org/all/20220128131006.67712-1-michel@lespinasse.org/ +[2] https://lwn.net/Articles/893906/ +[3] https://lore.kernel.org/all/20230216051750.3125598-15-surenb@google.com/ +[4] https://lore.kernel.org/all/20230216051750.3125598-17-surenb@google.com/ +[5] https://lore.kernel.org/all/20230216051750.3125598-18-surenb@google.com/ +[6] https://lore.kernel.org/all/20230216051750.3125598-22-surenb@google.com/ + +The patchset applies cleanly over mm-unstable branch. + +Laurent Dufour (1): + powerc/mm: try VMA lock-based page fault handling first + +Liam Howlett (4): + maple_tree: Be more cautious about dead nodes + maple_tree: Detect dead nodes in mas_start() + maple_tree: Fix freeing of nodes in rcu mode + maple_tree: remove extra smp_wmb() from mas_dead_leaves() + +Liam R. Howlett (4): + maple_tree: Fix write memory barrier of nodes once dead for RCU mode + maple_tree: Add smp_rmb() to dead node detection + maple_tree: Add RCU lock checking to rcu callback functions + mm: Enable maple tree RCU mode by default. + +Michel Lespinasse (1): + mm: rcu safe VMA freeing + +Suren Baghdasaryan (23): + mm: introduce CONFIG_PER_VMA_LOCK + mm: move mmap_lock assert function definitions + mm: add per-VMA lock and helper functions to control it + mm: mark VMA as being written when changing vm_flags + mm/mmap: move vma_prepare before vma_adjust_trans_huge + mm/khugepaged: write-lock VMA while collapsing a huge page + mm/mmap: write-lock VMAs in vma_prepare before modifying them + mm/mremap: write-lock VMA while remapping it to a new address range + mm: write-lock VMAs before removing them from VMA tree + mm: conditionally write-lock VMA in free_pgtables + kernel/fork: assert no VMA readers during its destruction + mm/mmap: prevent pagefault handler from racing with mmu_notifier + registration + mm: introduce vma detached flag + mm: introduce lock_vma_under_rcu to be used from arch-specific code + mm: fall back to mmap_lock if vma->anon_vma is not yet set + mm: add FAULT_FLAG_VMA_LOCK flag + mm: prevent do_swap_page from handling page faults under VMA lock + mm: prevent userfaults to be handled under per-vma lock + mm: introduce per-VMA lock statistics + x86/mm: try VMA lock-based page fault handling first + arm64/mm: try VMA lock-based page fault handling first + mm/mmap: free vm_area_struct without call_rcu in exit_mmap + mm: separate vma->lock from vm_area_struct + +Signed-off-by: Peter Jung +--- + arch/arm64/Kconfig | 1 + + arch/arm64/mm/fault.c | 36 +++++++ + arch/powerpc/mm/fault.c | 37 +++++++ + arch/powerpc/platforms/powernv/Kconfig | 1 + + arch/powerpc/platforms/pseries/Kconfig | 1 + + arch/s390/Kconfig | 1 + + arch/s390/mm/fault.c | 24 +++++ + arch/x86/Kconfig | 1 + + arch/x86/mm/fault.c | 36 +++++++ + include/linux/mm.h | 127 +++++++++++++++++++++++-- + include/linux/mm_types.h | 30 +++++- + include/linux/mmap_lock.h | 37 ++++--- + include/linux/vm_event_item.h | 6 ++ + include/linux/vmstat.h | 6 ++ + kernel/fork.c | 96 ++++++++++++++++--- + mm/Kconfig | 12 +++ + mm/Kconfig.debug | 6 ++ + mm/init-mm.c | 3 + + mm/internal.h | 2 +- + 
mm/khugepaged.c | 8 ++ + mm/memory.c | 72 +++++++++++++- + mm/mmap.c | 48 +++++++--- + mm/mremap.c | 1 + + mm/rmap.c | 31 +++--- + mm/vmstat.c | 6 ++ + 25 files changed, 567 insertions(+), 62 deletions(-) + +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 1023e896d46b..6f104c829731 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -95,6 +95,7 @@ config ARM64 + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_PAGE_TABLE_CHECK ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c +index f4cb0f85ccf4..9e0db5c387e3 100644 +--- a/arch/arm64/mm/fault.c ++++ b/arch/arm64/mm/fault.c +@@ -535,6 +535,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + unsigned long vm_flags; + unsigned int mm_flags = FAULT_FLAG_DEFAULT; + unsigned long addr = untagged_addr(far); ++#ifdef CONFIG_PER_VMA_LOCK ++ struct vm_area_struct *vma; ++#endif + + if (kprobe_page_fault(regs, esr)) + return 0; +@@ -585,6 +588,36 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr); + ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(mm_flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ ++ vma = lock_vma_under_rcu(mm, addr); ++ if (!vma) ++ goto lock_mmap; ++ ++ if (!(vma->vm_flags & vm_flags)) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, addr & PAGE_MASK, ++ mm_flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ if (!user_mode(regs)) ++ goto no_context; ++ return 0; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ + /* + * As per x86, we may deadlock here. However, since the kernel only + * validly references user space from well defined areas of the code, +@@ -628,6 +661,9 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr, + } + mmap_read_unlock(mm); + ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + /* + * Handle the "normal" (no error) case first. + */ +diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c +index af46aa88422b..531177a4ee08 100644 +--- a/arch/powerpc/mm/fault.c ++++ b/arch/powerpc/mm/fault.c +@@ -474,6 +474,40 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, + if (is_exec) + flags |= FAULT_FLAG_INSTRUCTION; + ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; ++ ++ if (unlikely(access_pkey_error(is_write, is_exec, ++ (error_code & DSISR_KEYFAULT), vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ ++ if (unlikely(access_error(is_write, is_exec, vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ ++ if (fault_signal_pending(fault, regs)) ++ return user_mode(regs) ? 
0 : SIGBUS; ++ ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + /* When running in the kernel we expect faults to occur only to + * addresses in user space. All other faults represent errors in the + * kernel and should generate an OOPS. Unfortunately, in the case of an +@@ -550,6 +584,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address, + + mmap_read_unlock(current->mm); + ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + if (unlikely(fault & VM_FAULT_ERROR)) + return mm_fault_error(regs, address, fault); + +diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig +index ae248a161b43..70a46acc70d6 100644 +--- a/arch/powerpc/platforms/powernv/Kconfig ++++ b/arch/powerpc/platforms/powernv/Kconfig +@@ -16,6 +16,7 @@ config PPC_POWERNV + select PPC_DOORBELL + select MMU_NOTIFIER + select FORCE_SMP ++ select ARCH_SUPPORTS_PER_VMA_LOCK + default y + + config OPAL_PRD +diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig +index 21b22bf16ce6..4ebf2ef2845d 100644 +--- a/arch/powerpc/platforms/pseries/Kconfig ++++ b/arch/powerpc/platforms/pseries/Kconfig +@@ -22,6 +22,7 @@ config PPC_PSERIES + select HOTPLUG_CPU + select FORCE_SMP + select SWIOTLB ++ select ARCH_SUPPORTS_PER_VMA_LOCK + default y + + config PARAVIRT +diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig +index 9809c74e1240..548b5b587003 100644 +--- a/arch/s390/Kconfig ++++ b/arch/s390/Kconfig +@@ -120,6 +120,7 @@ config S390 + select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_HUGETLBFS + select ARCH_SUPPORTS_NUMA_BALANCING ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_WANTS_DYNAMIC_TASK_STRUCT +diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c +index a2632fd97d00..b65144c392b0 100644 +--- a/arch/s390/mm/fault.c ++++ b/arch/s390/mm/fault.c +@@ -407,6 +407,30 @@ static inline vm_fault_t do_exception(struct pt_regs *regs, int access) + access = VM_WRITE; + if (access == VM_WRITE) + flags |= FAULT_FLAG_WRITE; ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; ++ if (!(vma->vm_flags & access)) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto out; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ fault = VM_FAULT_SIGNAL; ++ goto out; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ + mmap_read_lock(mm); + + gmap = NULL; +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index a825bf031f49..df21fba77db1 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -27,6 +27,7 @@ config X86_64 + # Options that are inherently 64-bit kernel only: + select ARCH_HAS_GIGANTIC_PAGE + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 ++ select ARCH_SUPPORTS_PER_VMA_LOCK + select ARCH_USE_CMPXCHG_LOCKREF + select HAVE_ARCH_SOFT_DIRTY + select MODULES_USE_ELF_RELA +diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c +index a498ae1fbe66..e4399983c50c 100644 +--- a/arch/x86/mm/fault.c ++++ b/arch/x86/mm/fault.c +@@ -19,6 +19,7 @@ + #include /* faulthandler_disabled() */ + #include /* efi_crash_gracefully_on_page_fault()*/ + #include ++#include /* find_and_lock_vma() */ + + #include /* 
boot_cpu_has, ... */ + #include /* dotraplinkage, ... */ +@@ -1333,6 +1334,38 @@ void do_user_addr_fault(struct pt_regs *regs, + } + #endif + ++#ifdef CONFIG_PER_VMA_LOCK ++ if (!(flags & FAULT_FLAG_USER)) ++ goto lock_mmap; ++ ++ vma = lock_vma_under_rcu(mm, address); ++ if (!vma) ++ goto lock_mmap; ++ ++ if (unlikely(access_error(error_code, vma))) { ++ vma_end_read(vma); ++ goto lock_mmap; ++ } ++ fault = handle_mm_fault(vma, address, flags | FAULT_FLAG_VMA_LOCK, regs); ++ vma_end_read(vma); ++ ++ if (!(fault & VM_FAULT_RETRY)) { ++ count_vm_vma_lock_event(VMA_LOCK_SUCCESS); ++ goto done; ++ } ++ count_vm_vma_lock_event(VMA_LOCK_RETRY); ++ ++ /* Quick path to respond to signals */ ++ if (fault_signal_pending(fault, regs)) { ++ if (!user_mode(regs)) ++ kernelmode_fixup_or_oops(regs, error_code, address, ++ SIGBUS, BUS_ADRERR, ++ ARCH_DEFAULT_PKEY); ++ return; ++ } ++lock_mmap: ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + /* + * Kernel-mode access to the user address space should only occur + * on well-defined single instructions listed in the exception +@@ -1433,6 +1466,9 @@ void do_user_addr_fault(struct pt_regs *regs, + } + + mmap_read_unlock(mm); ++#ifdef CONFIG_PER_VMA_LOCK ++done: ++#endif + if (likely(!(fault & VM_FAULT_ERROR))) + return; + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 1f79667824eb..c4c9de7d1916 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -256,6 +256,8 @@ void setup_initial_init_mm(void *start_code, void *end_code, + struct vm_area_struct *vm_area_alloc(struct mm_struct *); + struct vm_area_struct *vm_area_dup(struct vm_area_struct *); + void vm_area_free(struct vm_area_struct *); ++/* Use only if VMA has no other users */ ++void __vm_area_free(struct vm_area_struct *vma); + + #ifndef CONFIG_MMU + extern struct rb_root nommu_region_tree; +@@ -478,7 +480,8 @@ static inline bool fault_flag_allow_retry_first(enum fault_flag flags) + { FAULT_FLAG_USER, "USER" }, \ + { FAULT_FLAG_REMOTE, "REMOTE" }, \ + { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ +- { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } ++ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" }, \ ++ { FAULT_FLAG_VMA_LOCK, "VMA_LOCK" } + + /* + * vm_fault is filled by the pagefault handler and passed to the vma's +@@ -623,6 +626,117 @@ struct vm_operations_struct { + unsigned long addr); + }; + ++#ifdef CONFIG_PER_VMA_LOCK ++/* ++ * Try to read-lock a vma. The function is allowed to occasionally yield false ++ * locked result to avoid performance overhead, in which case we fall back to ++ * using mmap_lock. The function should never yield false unlocked result. ++ */ ++static inline bool vma_start_read(struct vm_area_struct *vma) ++{ ++ /* Check before locking. A race might cause false locked result. */ ++ if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) ++ return false; ++ ++ if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) ++ return false; ++ ++ /* ++ * Overflow might produce false locked result. ++ * False unlocked result is impossible because we modify and check ++ * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq ++ * modification invalidates all existing locks. 
++ */ ++ if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { ++ up_read(&vma->vm_lock->lock); ++ return false; ++ } ++ return true; ++} ++ ++static inline void vma_end_read(struct vm_area_struct *vma) ++{ ++ rcu_read_lock(); /* keeps vma alive till the end of up_read */ ++ up_read(&vma->vm_lock->lock); ++ rcu_read_unlock(); ++} ++ ++static bool __is_vma_write_locked(struct vm_area_struct *vma, int *mm_lock_seq) ++{ ++ mmap_assert_write_locked(vma->vm_mm); ++ ++ /* ++ * current task is holding mmap_write_lock, both vma->vm_lock_seq and ++ * mm->mm_lock_seq can't be concurrently modified. ++ */ ++ *mm_lock_seq = READ_ONCE(vma->vm_mm->mm_lock_seq); ++ return (vma->vm_lock_seq == *mm_lock_seq); ++} ++ ++static inline void vma_start_write(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ if (__is_vma_write_locked(vma, &mm_lock_seq)) ++ return; ++ ++ down_write(&vma->vm_lock->lock); ++ vma->vm_lock_seq = mm_lock_seq; ++ up_write(&vma->vm_lock->lock); ++} ++ ++static inline bool vma_try_start_write(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ if (__is_vma_write_locked(vma, &mm_lock_seq)) ++ return true; ++ ++ if (!down_write_trylock(&vma->vm_lock->lock)) ++ return false; ++ ++ vma->vm_lock_seq = mm_lock_seq; ++ up_write(&vma->vm_lock->lock); ++ return true; ++} ++ ++static inline void vma_assert_write_locked(struct vm_area_struct *vma) ++{ ++ int mm_lock_seq; ++ ++ VM_BUG_ON_VMA(!__is_vma_write_locked(vma, &mm_lock_seq), vma); ++} ++ ++static inline void vma_mark_detached(struct vm_area_struct *vma, bool detached) ++{ ++ /* When detaching vma should be write-locked */ ++ if (detached) ++ vma_assert_write_locked(vma); ++ vma->detached = detached; ++} ++ ++struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, ++ unsigned long address); ++ ++#else /* CONFIG_PER_VMA_LOCK */ ++ ++static inline void vma_init_lock(struct vm_area_struct *vma) {} ++static inline bool vma_start_read(struct vm_area_struct *vma) ++ { return false; } ++static inline void vma_end_read(struct vm_area_struct *vma) {} ++static inline void vma_start_write(struct vm_area_struct *vma) {} ++static inline bool vma_try_start_write(struct vm_area_struct *vma) ++ { return true; } ++static inline void vma_assert_write_locked(struct vm_area_struct *vma) {} ++static inline void vma_mark_detached(struct vm_area_struct *vma, ++ bool detached) {} ++ ++#endif /* CONFIG_PER_VMA_LOCK */ ++ ++/* ++ * WARNING: vma_init does not initialize vma->vm_lock. ++ * Use vm_area_alloc()/vm_area_free() if vma needs locking. 
++ */ + static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) + { + static const struct vm_operations_struct dummy_vm_ops = {}; +@@ -631,6 +745,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) + vma->vm_mm = mm; + vma->vm_ops = &dummy_vm_ops; + INIT_LIST_HEAD(&vma->anon_vma_chain); ++ vma_mark_detached(vma, false); + } + + /* Use when VMA is not part of the VMA tree and needs no locking */ +@@ -644,28 +759,28 @@ static inline void vm_flags_init(struct vm_area_struct *vma, + static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + vm_flags_init(vma, flags); + } + + static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + WRITE_ONCE(ACCESS_PRIVATE(vma, __vm_flags), flags); + } + + static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + ACCESS_PRIVATE(vma, __vm_flags) |= flags; + } + + static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + ACCESS_PRIVATE(vma, __vm_flags) &= ~flags; + } + +@@ -686,7 +801,7 @@ static inline void __vm_flags_mod(struct vm_area_struct *vma, + static inline void vm_flags_mod(struct vm_area_struct *vma, + vm_flags_t set, vm_flags_t clear) + { +- mmap_assert_write_locked(vma->vm_mm); ++ vma_start_write(vma); + __vm_flags_mod(vma, set, clear); + } + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index a57e6ae78e65..ac4b5df9ba56 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -471,6 +471,10 @@ struct anon_vma_name { + char name[]; + }; + ++struct vma_lock { ++ struct rw_semaphore lock; ++}; ++ + /* + * This struct describes a virtual memory area. There is one of these + * per VM-area/task. A VM area is any part of the process virtual memory +@@ -480,9 +484,16 @@ struct anon_vma_name { + struct vm_area_struct { + /* The first cache line has the info for VMA tree walking. */ + +- unsigned long vm_start; /* Our start address within vm_mm. */ +- unsigned long vm_end; /* The first byte after our end address +- within vm_mm. */ ++ union { ++ struct { ++ /* VMA covers [vm_start; vm_end) addresses within mm */ ++ unsigned long vm_start; ++ unsigned long vm_end; ++ }; ++#ifdef CONFIG_PER_VMA_LOCK ++ struct rcu_head vm_rcu; /* Used for deferred freeing. */ ++#endif ++ }; + + struct mm_struct *vm_mm; /* The address space we belong to. */ + +@@ -501,6 +512,14 @@ struct vm_area_struct { + vm_flags_t __private __vm_flags; + }; + ++#ifdef CONFIG_PER_VMA_LOCK ++ int vm_lock_seq; ++ struct vma_lock *vm_lock; ++ ++ /* Flag to indicate areas detached from the mm->mm_mt tree */ ++ bool detached; ++#endif ++ + /* + * For areas with an address space and backing store, + * linkage into the address_space->i_mmap interval tree. +@@ -637,6 +656,9 @@ struct mm_struct { + * init_mm.mmlist, and are protected + * by mmlist_lock + */ ++#ifdef CONFIG_PER_VMA_LOCK ++ int mm_lock_seq; ++#endif + + + unsigned long hiwater_rss; /* High-watermark of RSS usage */ +@@ -1037,6 +1059,7 @@ typedef struct { + * mapped after the fault. + * @FAULT_FLAG_ORIG_PTE_VALID: whether the fault has vmf->orig_pte cached. + * We should only access orig_pte if this flag set. 
++ * @FAULT_FLAG_VMA_LOCK: The fault is handled under VMA lock. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two +@@ -1074,6 +1097,7 @@ enum fault_flag { + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, + FAULT_FLAG_UNSHARE = 1 << 10, + FAULT_FLAG_ORIG_PTE_VALID = 1 << 11, ++ FAULT_FLAG_VMA_LOCK = 1 << 12, + }; + + typedef unsigned int __bitwise zap_flags_t; +diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h +index 96e113e23d04..aab8f1b28d26 100644 +--- a/include/linux/mmap_lock.h ++++ b/include/linux/mmap_lock.h +@@ -60,6 +60,29 @@ static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) + + #endif /* CONFIG_TRACING */ + ++static inline void mmap_assert_locked(struct mm_struct *mm) ++{ ++ lockdep_assert_held(&mm->mmap_lock); ++ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); ++} ++ ++static inline void mmap_assert_write_locked(struct mm_struct *mm) ++{ ++ lockdep_assert_held_write(&mm->mmap_lock); ++ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); ++} ++ ++#ifdef CONFIG_PER_VMA_LOCK ++static inline void vma_end_write_all(struct mm_struct *mm) ++{ ++ mmap_assert_write_locked(mm); ++ /* No races during update due to exclusive mmap_lock being held */ ++ WRITE_ONCE(mm->mm_lock_seq, mm->mm_lock_seq + 1); ++} ++#else ++static inline void vma_end_write_all(struct mm_struct *mm) {} ++#endif ++ + static inline void mmap_init_lock(struct mm_struct *mm) + { + init_rwsem(&mm->mmap_lock); +@@ -102,12 +125,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm) + static inline void mmap_write_unlock(struct mm_struct *mm) + { + __mmap_lock_trace_released(mm, true); ++ vma_end_write_all(mm); + up_write(&mm->mmap_lock); + } + + static inline void mmap_write_downgrade(struct mm_struct *mm) + { + __mmap_lock_trace_acquire_returned(mm, false, true); ++ vma_end_write_all(mm); + downgrade_write(&mm->mmap_lock); + } + +@@ -150,18 +175,6 @@ static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) + up_read_non_owner(&mm->mmap_lock); + } + +-static inline void mmap_assert_locked(struct mm_struct *mm) +-{ +- lockdep_assert_held(&mm->mmap_lock); +- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); +-} +- +-static inline void mmap_assert_write_locked(struct mm_struct *mm) +-{ +- lockdep_assert_held_write(&mm->mmap_lock); +- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); +-} +- + static inline int mmap_lock_is_contended(struct mm_struct *mm) + { + return rwsem_is_contended(&mm->mmap_lock); +diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h +index 7f5d1caf5890..8abfa1240040 100644 +--- a/include/linux/vm_event_item.h ++++ b/include/linux/vm_event_item.h +@@ -149,6 +149,12 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, + #ifdef CONFIG_X86 + DIRECT_MAP_LEVEL2_SPLIT, + DIRECT_MAP_LEVEL3_SPLIT, ++#endif ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++ VMA_LOCK_SUCCESS, ++ VMA_LOCK_ABORT, ++ VMA_LOCK_RETRY, ++ VMA_LOCK_MISS, + #endif + NR_VM_EVENT_ITEMS + }; +diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h +index 19cf5b6892ce..fed855bae6d8 100644 +--- a/include/linux/vmstat.h ++++ b/include/linux/vmstat.h +@@ -125,6 +125,12 @@ static inline void vm_events_fold_cpu(int cpu) + #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) + #endif + ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++#define count_vm_vma_lock_event(x) count_vm_event(x) ++#else ++#define count_vm_vma_lock_event(x) do {} while (0) ++#endif ++ + #define 
__count_zid_vm_events(item, zid, delta) \ + __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 349945168239..ebd353730887 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -455,13 +455,49 @@ static struct kmem_cache *vm_area_cachep; + /* SLAB cache for mm_struct structures (tsk->mm) */ + static struct kmem_cache *mm_cachep; + ++#ifdef CONFIG_PER_VMA_LOCK ++ ++/* SLAB cache for vm_area_struct.lock */ ++static struct kmem_cache *vma_lock_cachep; ++ ++static bool vma_lock_alloc(struct vm_area_struct *vma) ++{ ++ vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); ++ if (!vma->vm_lock) ++ return false; ++ ++ init_rwsem(&vma->vm_lock->lock); ++ vma->vm_lock_seq = -1; ++ ++ return true; ++} ++ ++static inline void vma_lock_free(struct vm_area_struct *vma) ++{ ++ kmem_cache_free(vma_lock_cachep, vma->vm_lock); ++} ++ ++#else /* CONFIG_PER_VMA_LOCK */ ++ ++static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } ++static inline void vma_lock_free(struct vm_area_struct *vma) {} ++ ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) + { + struct vm_area_struct *vma; + + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); +- if (vma) +- vma_init(vma, mm); ++ if (!vma) ++ return NULL; ++ ++ vma_init(vma, mm); ++ if (!vma_lock_alloc(vma)) { ++ kmem_cache_free(vm_area_cachep, vma); ++ return NULL; ++ } ++ + return vma; + } + +@@ -469,26 +505,54 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) + { + struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + +- if (new) { +- ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); +- ASSERT_EXCLUSIVE_WRITER(orig->vm_file); +- /* +- * orig->shared.rb may be modified concurrently, but the clone +- * will be reinitialized. +- */ +- data_race(memcpy(new, orig, sizeof(*new))); +- INIT_LIST_HEAD(&new->anon_vma_chain); +- dup_anon_vma_name(orig, new); ++ if (!new) ++ return NULL; ++ ++ ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ++ ASSERT_EXCLUSIVE_WRITER(orig->vm_file); ++ /* ++ * orig->shared.rb may be modified concurrently, but the clone ++ * will be reinitialized. ++ */ ++ data_race(memcpy(new, orig, sizeof(*new))); ++ if (!vma_lock_alloc(new)) { ++ kmem_cache_free(vm_area_cachep, new); ++ return NULL; + } ++ INIT_LIST_HEAD(&new->anon_vma_chain); ++ dup_anon_vma_name(orig, new); ++ + return new; + } + +-void vm_area_free(struct vm_area_struct *vma) ++void __vm_area_free(struct vm_area_struct *vma) + { + free_anon_vma_name(vma); ++ vma_lock_free(vma); + kmem_cache_free(vm_area_cachep, vma); + } + ++#ifdef CONFIG_PER_VMA_LOCK ++static void vm_area_free_rcu_cb(struct rcu_head *head) ++{ ++ struct vm_area_struct *vma = container_of(head, struct vm_area_struct, ++ vm_rcu); ++ ++ /* The vma should not be locked while being destroyed. 
*/ ++ VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); ++ __vm_area_free(vma); ++} ++#endif ++ ++void vm_area_free(struct vm_area_struct *vma) ++{ ++#ifdef CONFIG_PER_VMA_LOCK ++ call_rcu(&vma->vm_rcu, vm_area_free_rcu_cb); ++#else ++ __vm_area_free(vma); ++#endif ++} ++ + static void account_kernel_stack(struct task_struct *tsk, int account) + { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { +@@ -1132,6 +1196,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + seqcount_init(&mm->write_protect_seq); + mmap_init_lock(mm); + INIT_LIST_HEAD(&mm->mmlist); ++#ifdef CONFIG_PER_VMA_LOCK ++ mm->mm_lock_seq = 0; ++#endif + mm_pgtables_bytes_init(mm); + mm->map_count = 0; + mm->locked_vm = 0; +@@ -3074,6 +3141,9 @@ void __init proc_caches_init(void) + NULL); + + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); ++#ifdef CONFIG_PER_VMA_LOCK ++ vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); ++#endif + mmap_init(); + nsproxy_cache_init(); + } +diff --git a/mm/Kconfig b/mm/Kconfig +index cf2e47030fe8..459af2123189 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -1202,6 +1202,18 @@ config LRU_GEN_STATS + This option has a per-memcg and per-node memory overhead. + # } + ++config ARCH_SUPPORTS_PER_VMA_LOCK ++ def_bool n ++ ++config PER_VMA_LOCK ++ def_bool y ++ depends on ARCH_SUPPORTS_PER_VMA_LOCK && MMU && SMP ++ help ++ Allow per-vma locking during page fault handling. ++ ++ This feature allows locking each virtual memory area separately when ++ handling page faults instead of taking mmap_lock. ++ + source "mm/damon/Kconfig" + + endmenu +diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug +index c3547a373c9c..4965a7333a3f 100644 +--- a/mm/Kconfig.debug ++++ b/mm/Kconfig.debug +@@ -279,3 +279,9 @@ config DEBUG_KMEMLEAK_AUTO_SCAN + + If unsure, say Y. + ++config PER_VMA_LOCK_STATS ++ bool "Statistics for per-vma locks" ++ depends on PER_VMA_LOCK ++ default y ++ help ++ Statistics for per-vma locks. 
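+
+As a quick sanity check, on an SMP build for one of the architectures that
+select ARCH_SUPPORTS_PER_VMA_LOCK above (x86_64, arm64, s390,
+powernv/pseries), the options added here are expected to resolve to the
+following illustrative .config fragment:
+
+	CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y
+	CONFIG_PER_VMA_LOCK=y
+	CONFIG_PER_VMA_LOCK_STATS=y
+
+CONFIG_PER_VMA_LOCK_STATS in turn enables the VMA_LOCK_* event counters
+added to vm_event_item.h above.
+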
+diff --git a/mm/init-mm.c b/mm/init-mm.c +index c9327abb771c..33269314e060 100644 +--- a/mm/init-mm.c ++++ b/mm/init-mm.c +@@ -37,6 +37,9 @@ struct mm_struct init_mm = { + .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), + .mmlist = LIST_HEAD_INIT(init_mm.mmlist), ++#ifdef CONFIG_PER_VMA_LOCK ++ .mm_lock_seq = 0, ++#endif + .user_ns = &init_user_ns, + .cpu_bitmap = CPU_BITS_NONE, + #ifdef CONFIG_IOMMU_SVA +diff --git a/mm/internal.h b/mm/internal.h +index 7920a8b7982e..0c455d6e4e3e 100644 +--- a/mm/internal.h ++++ b/mm/internal.h +@@ -105,7 +105,7 @@ void folio_activate(struct folio *folio); + + void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long floor, +- unsigned long ceiling); ++ unsigned long ceiling, bool mm_wr_locked); + void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); + + struct zap_details; +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 0ec69b96b497..37a52a0ec9da 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -1053,6 +1053,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + if (result != SCAN_SUCCEED) + goto out_up_write; + ++ vma_start_write(vma); + anon_vma_lock_write(vma->anon_vma); + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, address, +@@ -1516,6 +1517,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + goto drop_hpage; + } + ++ /* Lock the vma before taking i_mmap and page table locks */ ++ vma_start_write(vma); ++ + /* + * We need to lock the mapping so that from here on, only GUP-fast and + * hardware page walks can access the parts of the page tables that +@@ -1693,6 +1697,10 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + result = SCAN_PTE_MAPPED_HUGEPAGE; + if ((cc->is_khugepaged || is_target) && + mmap_write_trylock(mm)) { ++ /* trylock for the same lock inversion as above */ ++ if (!vma_try_start_write(vma)) ++ goto unlock_next; ++ + /* + * Re-check whether we have an ->anon_vma, because + * collapse_and_free_pmd() requires that either no +diff --git a/mm/memory.c b/mm/memory.c +index 01a23ad48a04..c76183ced67a 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -348,7 +348,7 @@ void free_pgd_range(struct mmu_gather *tlb, + + void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *vma, unsigned long floor, +- unsigned long ceiling) ++ unsigned long ceiling, bool mm_wr_locked) + { + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); + +@@ -366,6 +366,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables + */ ++ if (mm_wr_locked) ++ vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma(vma); + +@@ -380,6 +382,8 @@ void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + && !is_vm_hugetlb_page(next)) { + vma = next; + next = mas_find(&mas, ceiling - 1); ++ if (mm_wr_locked) ++ vma_start_write(vma); + unlink_anon_vmas(vma); + unlink_file_vma(vma); + } +@@ -3698,6 +3702,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) + if (!pte_unmap_same(vmf)) + goto out; + ++ if (vmf->flags & FAULT_FLAG_VMA_LOCK) { ++ ret = VM_FAULT_RETRY; ++ goto out; ++ } ++ + entry = pte_to_swp_entry(vmf->orig_pte); + if (unlikely(non_swap_entry(entry))) { + if (is_migration_entry(entry)) { +@@ -5230,6 +5239,67 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, 
+ } + EXPORT_SYMBOL_GPL(handle_mm_fault); + ++#ifdef CONFIG_PER_VMA_LOCK ++/* ++ * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be ++ * stable and not isolated. If the VMA is not found or is being modified the ++ * function returns NULL. ++ */ ++struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, ++ unsigned long address) ++{ ++ MA_STATE(mas, &mm->mm_mt, address, address); ++ struct vm_area_struct *vma; ++ ++ rcu_read_lock(); ++retry: ++ vma = mas_walk(&mas); ++ if (!vma) ++ goto inval; ++ ++ /* Only anonymous vmas are supported for now */ ++ if (!vma_is_anonymous(vma)) ++ goto inval; ++ ++ /* find_mergeable_anon_vma uses adjacent vmas which are not locked */ ++ if (!vma->anon_vma) ++ goto inval; ++ ++ if (!vma_start_read(vma)) ++ goto inval; ++ ++ /* ++ * Due to the possibility of userfault handler dropping mmap_lock, avoid ++ * it for now and fall back to page fault handling under mmap_lock. ++ */ ++ if (userfaultfd_armed(vma)) { ++ vma_end_read(vma); ++ goto inval; ++ } ++ ++ /* Check since vm_start/vm_end might change before we lock the VMA */ ++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { ++ vma_end_read(vma); ++ goto inval; ++ } ++ ++ /* Check if the VMA got isolated after we found it */ ++ if (vma->detached) { ++ vma_end_read(vma); ++ count_vm_vma_lock_event(VMA_LOCK_MISS); ++ /* The area was replaced with another one */ ++ goto retry; ++ } ++ ++ rcu_read_unlock(); ++ return vma; ++inval: ++ rcu_read_unlock(); ++ count_vm_vma_lock_event(VMA_LOCK_ABORT); ++ return NULL; ++} ++#endif /* CONFIG_PER_VMA_LOCK */ ++ + #ifndef __PAGETABLE_P4D_FOLDED + /* + * Allocate p4d page table. +diff --git a/mm/mmap.c b/mm/mmap.c +index d5475fbf5729..cbac45aa39ae 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -133,7 +133,7 @@ void unlink_file_vma(struct vm_area_struct *vma) + /* + * Close a vm structure and free it. 
+ */ +-static void remove_vma(struct vm_area_struct *vma) ++static void remove_vma(struct vm_area_struct *vma, bool unreachable) + { + might_sleep(); + if (vma->vm_ops && vma->vm_ops->close) +@@ -141,7 +141,10 @@ static void remove_vma(struct vm_area_struct *vma) + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); +- vm_area_free(vma); ++ if (unreachable) ++ __vm_area_free(vma); ++ else ++ vm_area_free(vma); + } + + static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, +@@ -502,6 +505,15 @@ static inline void init_vma_prep(struct vma_prepare *vp, + */ + static inline void vma_prepare(struct vma_prepare *vp) + { ++ vma_start_write(vp->vma); ++ if (vp->adj_next) ++ vma_start_write(vp->adj_next); ++ /* vp->insert is always a newly created VMA, no need for locking */ ++ if (vp->remove) ++ vma_start_write(vp->remove); ++ if (vp->remove2) ++ vma_start_write(vp->remove2); ++ + if (vp->file) { + uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end); + +@@ -590,6 +602,7 @@ static inline void vma_complete(struct vma_prepare *vp, + + if (vp->remove) { + again: ++ vma_mark_detached(vp->remove, true); + if (vp->file) { + uprobe_munmap(vp->remove, vp->remove->vm_start, + vp->remove->vm_end); +@@ -683,12 +696,12 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma, + if (vma_iter_prealloc(vmi)) + goto nomem; + ++ vma_prepare(&vp); + vma_adjust_trans_huge(vma, start, end, 0); + /* VMA iterator points to previous, so set to start if necessary */ + if (vma_iter_addr(vmi) != start) + vma_iter_set(vmi, start); + +- vma_prepare(&vp); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; +@@ -723,8 +736,8 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, + return -ENOMEM; + + init_vma_prep(&vp, vma); +- vma_adjust_trans_huge(vma, start, end, 0); + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, start, end, 0); + + if (vma->vm_start < start) + vma_iter_clear(vmi, vma->vm_start, start); +@@ -994,12 +1007,12 @@ struct vm_area_struct *vma_merge(struct vma_iterator *vmi, struct mm_struct *mm, + if (vma_iter_prealloc(vmi)) + return NULL; + +- vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next); + init_multi_vma_prep(&vp, vma, adjust, remove, remove2); + VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma && + vp.anon_vma != adjust->anon_vma); + + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, vma_start, vma_end, adj_next); + if (vma_start < vma->vm_start || vma_end > vma->vm_end) + vma_expanded = true; + +@@ -2157,7 +2170,7 @@ static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += nrpages; + vm_stat_account(mm, vma->vm_flags, -nrpages); +- remove_vma(vma); ++ remove_vma(vma, false); + } + vm_unacct_memory(nr_accounted); + validate_mm(mm); +@@ -2180,7 +2193,8 @@ static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, + update_hiwater_rss(mm); + unmap_vmas(&tlb, mt, vma, start, end, mm_wr_locked); + free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, +- next ? next->vm_start : USER_PGTABLES_CEILING); ++ next ? 
next->vm_start : USER_PGTABLES_CEILING, ++ mm_wr_locked); + tlb_finish_mmu(&tlb); + } + +@@ -2236,10 +2250,10 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + if (new->vm_ops && new->vm_ops->open) + new->vm_ops->open(new); + +- vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + init_vma_prep(&vp, vma); + vp.insert = new; + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, vma->vm_start, addr, 0); + + if (new_below) { + vma->vm_start = addr; +@@ -2283,10 +2297,12 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, + static inline int munmap_sidetree(struct vm_area_struct *vma, + struct ma_state *mas_detach) + { ++ vma_start_write(vma); + mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); + if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) + return -ENOMEM; + ++ vma_mark_detached(vma, true); + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm -= vma_pages(vma); + +@@ -2942,9 +2958,9 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma, + if (vma_iter_prealloc(vmi)) + goto unacct_fail; + +- vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); + init_vma_prep(&vp, vma); + vma_prepare(&vp); ++ vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); + vma->vm_end = addr + len; + vm_flags_set(vma, VM_SOFTDIRTY); + vma_iter_store(vmi, vma); +@@ -3077,7 +3093,7 @@ void exit_mmap(struct mm_struct *mm) + mmap_write_lock(mm); + mt_clear_in_rcu(&mm->mm_mt); + free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, +- USER_PGTABLES_CEILING); ++ USER_PGTABLES_CEILING, true); + tlb_finish_mmu(&tlb); + + /* +@@ -3088,7 +3104,7 @@ void exit_mmap(struct mm_struct *mm) + do { + if (vma->vm_flags & VM_ACCOUNT) + nr_accounted += vma_pages(vma); +- remove_vma(vma); ++ remove_vma(vma, true); + count++; + cond_resched(); + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); +@@ -3211,6 +3227,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, + get_file(new_vma->vm_file); + if (new_vma->vm_ops && new_vma->vm_ops->open) + new_vma->vm_ops->open(new_vma); ++ vma_start_write(new_vma); + if (vma_link(mm, new_vma)) + goto out_vma_link; + *need_rmap_locks = false; +@@ -3505,6 +3522,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) + * of mm/rmap.c: + * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for + * hugetlb mapping); ++ * - all vmas marked locked + * - all i_mmap_rwsem locks; + * - all anon_vma->rwseml + * +@@ -3527,6 +3545,13 @@ int mm_take_all_locks(struct mm_struct *mm) + + mutex_lock(&mm_all_locks_mutex); + ++ mas_for_each(&mas, vma, ULONG_MAX) { ++ if (signal_pending(current)) ++ goto out_unlock; ++ vma_start_write(vma); ++ } ++ ++ mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { + if (signal_pending(current)) + goto out_unlock; +@@ -3616,6 +3641,7 @@ void mm_drop_all_locks(struct mm_struct *mm) + if (vma->vm_file && vma->vm_file->f_mapping) + vm_unlock_mapping(vma->vm_file->f_mapping); + } ++ vma_end_write_all(mm); + + mutex_unlock(&mm_all_locks_mutex); + } +diff --git a/mm/mremap.c b/mm/mremap.c +index 411a85682b58..dd541e59edda 100644 +--- a/mm/mremap.c ++++ b/mm/mremap.c +@@ -623,6 +623,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, + return -ENOMEM; + } + ++ vma_start_write(vma); + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); + new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff, + &need_rmap_locks); +diff --git a/mm/rmap.c b/mm/rmap.c +index 8632e02661ac..cfdaa56cad3e 100644 +--- a/mm/rmap.c 
++++ b/mm/rmap.c +@@ -25,21 +25,22 @@ + * mapping->invalidate_lock (in filemap_fault) + * page->flags PG_locked (lock_page) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) +- * mapping->i_mmap_rwsem +- * anon_vma->rwsem +- * mm->page_table_lock or pte_lock +- * swap_lock (in swap_duplicate, swap_info_get) +- * mmlist_lock (in mmput, drain_mmlist and others) +- * mapping->private_lock (in block_dirty_folio) +- * folio_lock_memcg move_lock (in block_dirty_folio) +- * i_pages lock (widely used) +- * lruvec->lru_lock (in folio_lruvec_lock_irq) +- * inode->i_lock (in set_page_dirty's __mark_inode_dirty) +- * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) +- * sb_lock (within inode_lock in fs/fs-writeback.c) +- * i_pages lock (widely used, in set_page_dirty, +- * in arch-dependent flush_dcache_mmap_lock, +- * within bdi.wb->list_lock in __sync_single_inode) ++ * vma_start_write ++ * mapping->i_mmap_rwsem ++ * anon_vma->rwsem ++ * mm->page_table_lock or pte_lock ++ * swap_lock (in swap_duplicate, swap_info_get) ++ * mmlist_lock (in mmput, drain_mmlist and others) ++ * mapping->private_lock (in block_dirty_folio) ++ * folio_lock_memcg move_lock (in block_dirty_folio) ++ * i_pages lock (widely used) ++ * lruvec->lru_lock (in folio_lruvec_lock_irq) ++ * inode->i_lock (in set_page_dirty's __mark_inode_dirty) ++ * bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty) ++ * sb_lock (within inode_lock in fs/fs-writeback.c) ++ * i_pages lock (widely used, in set_page_dirty, ++ * in arch-dependent flush_dcache_mmap_lock, ++ * within bdi.wb->list_lock in __sync_single_inode) + * + * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) + * ->tasklist_lock +diff --git a/mm/vmstat.c b/mm/vmstat.c +index 1ea6a5ce1c41..4f1089a1860e 100644 +--- a/mm/vmstat.c ++++ b/mm/vmstat.c +@@ -1399,6 +1399,12 @@ const char * const vmstat_text[] = { + "direct_map_level2_splits", + "direct_map_level3_splits", + #endif ++#ifdef CONFIG_PER_VMA_LOCK_STATS ++ "vma_lock_success", ++ "vma_lock_abort", ++ "vma_lock_retry", ++ "vma_lock_miss", ++#endif + #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ + }; + #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ +-- +2.40.1 + +From 56fd0f1397471be0786d1f696598173b9ebb9a35 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 25 Apr 2023 17:19:06 +0200 -Subject: [PATCH 7/8] sched +Subject: [PATCH 09/10] sched Signed-off-by: Peter Jung --- @@ -11142,10 +27576,10 @@ index 3e8df6d31c1e..7331d436ebc4 100644 -- 2.40.1 -From 27d4dbfc6971caf5627a8248adef49f8d15340b4 Mon Sep 17 00:00:00 2001 +From fed8faa97161f725528a30330a22a3ba5b8e9965 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sat, 22 Apr 2023 11:46:46 +0200 -Subject: [PATCH 8/8] zstd: import 1.5.5 +Subject: [PATCH 10/10] zstd: import 1.5.5 Signed-off-by: Peter Jung ---