diff --git a/config b/config index 2f1aa34..918c0e3 100644 --- a/config +++ b/config @@ -1,6 +1,6 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 6.4.1 Kernel Configuration +# Linux/x86 6.4.3 Kernel Configuration # CONFIG_CC_VERSION_TEXT="gcc (GCC) 13.1.1 20230525" CONFIG_CC_IS_GCC=y @@ -4684,7 +4684,6 @@ CONFIG_XILLYBUS_CLASS=m CONFIG_XILLYBUS=m CONFIG_XILLYBUS_PCIE=m CONFIG_XILLYUSB=m -CONFIG_DDCCI=m # end of Character devices # @@ -6942,7 +6941,6 @@ CONFIG_BACKLIGHT_MAX8925=m CONFIG_BACKLIGHT_MT6370=m CONFIG_BACKLIGHT_APPLE=m CONFIG_BACKLIGHT_QCOM_WLED=m -CONFIG_BACKLIGHT_DDCCI=m CONFIG_BACKLIGHT_RT4831=m CONFIG_BACKLIGHT_SAHARA=m CONFIG_BACKLIGHT_WM831X=m diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index 3a7aff5..ee56662 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,7 +1,7 @@ -From 4ab0a89f6041a700bfc9b341d37dfdceafa3ddc9 Mon Sep 17 00:00:00 2001 +From a2168c50c2c846ad624b028bbca121f11b732a95 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Tue, 9 May 2023 18:38:36 +0200 -Subject: [PATCH 1/8] bbr2 +Date: Tue, 11 Jul 2023 19:24:11 +0200 +Subject: [PATCH 1/7] bbr2 Signed-off-by: Peter Jung --- @@ -3283,16 +3283,14 @@ index 39eb947fe392..61ab4ee55b22 100644 -- 2.41.0 -From b0be3056917c37ac3cf69603878e5d66d8c2bdad Mon Sep 17 00:00:00 2001 +From 583c46f67e8db3fb6478523ff297ab3f469186ba Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 1 Jul 2023 15:11:26 +0200 -Subject: [PATCH 2/8] cachy +Date: Tue, 11 Jul 2023 19:24:37 +0200 +Subject: [PATCH 2/7] cachy Signed-off-by: Peter Jung --- - .gitignore | 1 + .../admin-guide/kernel-parameters.txt | 12 + - Documentation/dontdiff | 1 + Makefile | 8 +- arch/arc/configs/axs101_defconfig | 1 + arch/arc/configs/axs103_defconfig | 1 + @@ -3309,9 +3307,8 @@ Signed-off-by: Peter Jung arch/arc/configs/vdk_hs38_smp_defconfig | 1 + arch/x86/Kconfig.cpu | 427 ++- arch/x86/Makefile | 46 +- - arch/x86/Makefile.postlink | 41 + - arch/x86/boot/compressed/.gitignore | 1 - - arch/x86/boot/compressed/Makefile | 10 +- + arch/x86/Makefile.postlink | 47 + + arch/x86/boot/compressed/Makefile | 8 +- arch/x86/include/asm/pci.h | 6 + arch/x86/include/asm/vermagic.h | 74 + arch/x86/pci/common.c | 7 +- @@ -3346,25 +3343,13 @@ Signed-off-by: Peter Jung mm/swap.c | 5 + mm/vmpressure.c | 4 + mm/vmscan.c | 8 + - 56 files changed, 5341 insertions(+), 61 deletions(-) + 53 files changed, 5344 insertions(+), 59 deletions(-) create mode 100644 arch/x86/Makefile.postlink create mode 100644 drivers/i2c/busses/i2c-nct6775.c create mode 100644 drivers/pci/controller/intel-nvme-remap.c create mode 100644 drivers/platform/x86/legion-laptop.c create mode 100644 drivers/platform/x86/steamdeck.c -diff --git a/.gitignore b/.gitignore -index 7f86e0837909..7cb55784696c 100644 ---- a/.gitignore -+++ b/.gitignore -@@ -65,6 +65,7 @@ modules.order - /vmlinux - /vmlinux.32 - /vmlinux.map -+/vmlinux.relocs - /vmlinux.symvers - /vmlinux-gdb.py - /vmlinuz diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9e5bab29685f..794e7a91219a 100644 --- a/Documentation/admin-guide/kernel-parameters.txt @@ -3395,20 +3380,8 @@ index 9e5bab29685f..794e7a91219a 100644 noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. 
-diff --git a/Documentation/dontdiff b/Documentation/dontdiff -index 3c399f132e2d..a62ad01e6d11 100644 ---- a/Documentation/dontdiff -+++ b/Documentation/dontdiff -@@ -254,6 +254,7 @@ vmlinux.aout - vmlinux.bin.all - vmlinux.lds - vmlinux.map -+vmlinux.relocs - vmlinux.symvers - vmlinuz - voffset.h diff --git a/Makefile b/Makefile -index 9f6376cbafeb..26f594d699d5 100644 +index 56abbcac061d..c7cd86bb99e4 100644 --- a/Makefile +++ b/Makefile @@ -818,6 +818,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) @@ -4188,10 +4161,10 @@ index fdc2e3abd615..c7463d290ce7 100644 diff --git a/arch/x86/Makefile.postlink b/arch/x86/Makefile.postlink new file mode 100644 -index 000000000000..195af937aa4d +index 000000000000..936093d29160 --- /dev/null +++ b/arch/x86/Makefile.postlink -@@ -0,0 +1,41 @@ +@@ -0,0 +1,47 @@ +# SPDX-License-Identifier: GPL-2.0 +# =========================================================================== +# Post-link x86 pass @@ -4207,11 +4180,17 @@ index 000000000000..195af937aa4d +include $(srctree)/scripts/Kbuild.include + +CMD_RELOCS = arch/x86/tools/relocs -+quiet_cmd_relocs = RELOCS $@.relocs -+ cmd_relocs = $(CMD_RELOCS) $@ > $@.relocs;$(CMD_RELOCS) --abs-relocs $@ ++OUT_RELOCS = arch/x86/boot/compressed ++quiet_cmd_relocs = RELOCS $(OUT_RELOCS)/$@.relocs ++ cmd_relocs = \ ++ mkdir -p $(OUT_RELOCS); \ ++ $(CMD_RELOCS) $@ > $(OUT_RELOCS)/$@.relocs; \ ++ $(CMD_RELOCS) --abs-relocs $@ + +quiet_cmd_strip_relocs = RSTRIP $@ -+ cmd_strip_relocs = $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' --remove-section='.rela.*' --remove-section='.rela__*' $@ ++ cmd_strip_relocs = \ ++ $(OBJCOPY) --remove-section='.rel.*' --remove-section='.rel__*' \ ++ --remove-section='.rela.*' --remove-section='.rela__*' $@ + +# `@true` prevents complaint when there is nothing to be done + @@ -4226,30 +4205,18 @@ index 000000000000..195af937aa4d + @true + +clean: -+ @rm -f vmlinux.relocs ++ @rm -f $(OUT_RELOCS)/vmlinux.relocs + +PHONY += FORCE clean + +FORCE: + +.PHONY: $(PHONY) -diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore -index 25805199a506..b2968175fc27 100644 ---- a/arch/x86/boot/compressed/.gitignore -+++ b/arch/x86/boot/compressed/.gitignore -@@ -1,7 +1,6 @@ - # SPDX-License-Identifier: GPL-2.0-only - relocs - vmlinux.bin.all --vmlinux.relocs - vmlinux.lds - mkpiggy - piggy.S diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile -index 6b6cfe607bdb..19d1fb601796 100644 +index 6b6cfe607bdb..0f78dbbbdcdd 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile -@@ -121,14 +121,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE +@@ -121,11 +121,9 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs @@ -4259,15 +4226,11 @@ index 6b6cfe607bdb..19d1fb601796 100644 -$(obj)/vmlinux.relocs: vmlinux FORCE - $(call if_changed,relocs) +# vmlinux.relocs is created by the vmlinux postlink step. 
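+# (arch/x86/Makefile.postlink writes it into this directory via OUT_RELOCS;
+# the stub rule below only keeps make from complaining.)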
-+vmlinux.relocs: vmlinux ++$(obj)/vmlinux.relocs: vmlinux + @true vmlinux.bin.all-y := $(obj)/vmlinux.bin --vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs -+vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += vmlinux.relocs - - $(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE - $(call if_changed,gzip) + vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index b40c462b4af3..c4e66e60d559 100644 --- a/arch/x86/include/asm/pci.h @@ -9255,7 +9218,7 @@ index 000000000000..77a6677ec19e +MODULE_DESCRIPTION("Steam Deck ACPI platform driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mm.h b/include/linux/mm.h -index 6cbcc55a80b0..f80819d5a23e 100644 +index 9e10485f37e7..3c6c4c836da7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -191,7 +191,7 @@ static inline void __mm_zero_struct_page(struct page *page) @@ -9396,7 +9359,7 @@ index 38ef6d06888e..0f78364efd4f 100644 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c -index 41c964104b58..915ad6dae416 100644 +index 8103ffd217e9..f405763e06ae 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -100,6 +100,10 @@ @@ -9410,7 +9373,7 @@ index 41c964104b58..915ad6dae416 100644 #include #include #include -@@ -2266,6 +2270,10 @@ __latent_entropy struct task_struct *copy_process( +@@ -2267,6 +2271,10 @@ __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -9421,7 +9384,7 @@ index 41c964104b58..915ad6dae416 100644 /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. -@@ -3419,6 +3427,12 @@ int ksys_unshare(unsigned long unshare_flags) +@@ -3420,6 +3428,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -9651,2810 +9614,10 @@ index 5bf98d0a22c9..28f6d5cd362e 100644 -- 2.41.0 -From b10c24b78e0fe1d76eca435913d54013ff35f9ce Mon Sep 17 00:00:00 2001 +From 3a68ae439de252da49e718998385b91b69809642 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Tue, 9 May 2023 18:39:03 +0200 -Subject: [PATCH 3/8] ddcci - -Signed-off-by: Peter Jung ---- - Documentation/ABI/testing/sysfs-driver-ddcci | 65 + - Documentation/driver-api/ddcci.rst | 130 ++ - drivers/char/Kconfig | 11 + - drivers/char/Makefile | 1 + - drivers/char/ddcci.c | 1909 ++++++++++++++++++ - drivers/video/backlight/Kconfig | 11 + - drivers/video/backlight/Makefile | 1 + - drivers/video/backlight/ddcci-backlight.c | 413 ++++ - include/linux/ddcci.h | 164 ++ - 9 files changed, 2705 insertions(+) - create mode 100644 Documentation/ABI/testing/sysfs-driver-ddcci - create mode 100644 Documentation/driver-api/ddcci.rst - create mode 100644 drivers/char/ddcci.c - create mode 100644 drivers/video/backlight/ddcci-backlight.c - create mode 100644 include/linux/ddcci.h - -diff --git a/Documentation/ABI/testing/sysfs-driver-ddcci b/Documentation/ABI/testing/sysfs-driver-ddcci -new file mode 100644 -index 000000000000..55c07ec13c32 ---- /dev/null -+++ b/Documentation/ABI/testing/sysfs-driver-ddcci -@@ -0,0 +1,65 @@ -+What: /sys/bus/ddcci/ddccii -+Date: March 2022 -+KernelVersion: 5.18 -+Contact: Christoph Grenz -+Description: This file is a user interface for an internal -+ dependent device on the I2C bus, it exports the same -+ information as the master device(/sys/bus/ddcci/ -+ ddcci) that is referenced in this -+ document. 
-+
-+What:		/sys/bus/ddcci/ddcci[I2C bus number]e[hex address]
-+Date:		March 2022
-+KernelVersion:	5.18
-+Contact:	Christoph Grenz
-+Description:	This file is a user interface for an external
-+		dependent device on the I2C bus; it exports the same
-+		information as the master device (/sys/bus/ddcci/
-+		ddcci[I2C bus number]) that is referenced in this
-+		document.
-+
-+What:		/sys/bus/ddcci/ddcci[I2C bus number]
-+Date:		March 2022
-+KernelVersion:	5.18
-+Contact:	Christoph Grenz
-+Description:	This file provides the user interface for the
-+		master device on the I2C bus. It exports the following
-+		pieces of information:
-+
-+		- idProt
-+
-+		ACCESS.bus protocol supported by the device. Usually
-+		"monitor".
-+
-+		- idType
-+
-+		ACCESS.bus device subtype. Usually "LCD" or "CRT".
-+
-+		- idModel
-+
-+		ACCESS.bus device model identifier. Usually a
-+		shortened form of the device model name.
-+
-+		- idVendor
-+
-+		ACCESS.bus device vendor identifier. Empty if the
-+		Identification command is not supported.
-+
-+		- idModule
-+
-+		ACCESS.bus device module identifier. Empty if the
-+		Identification command is not supported.
-+
-+		- idSerial
-+
-+		32-bit device number. A fixed serial number if it's
-+		positive, a temporary serial number if negative, and zero
-+		if the Identification command is not supported.
-+
-+		- modalias
-+
-+		A combined identifier for driver selection. It has the form:
-+		ddcci:[prot]-[type]-[model]-[vendor]-[module].
-+		All non-alphanumeric characters (including whitespace)
-+		in the model, vendor or module parts are replaced by
-+		underscores to prevent issues with software like systemd-udevd.
-diff --git a/Documentation/driver-api/ddcci.rst b/Documentation/driver-api/ddcci.rst
-new file mode 100644
-index 000000000000..04cd5ab911a7
---- /dev/null
-+++ b/Documentation/driver-api/ddcci.rst
-@@ -0,0 +1,130 @@
-+.. SPDX-License-Identifier: GPL-2.0-or-later
-+
-+==============
-+DDC/CI
-+==============
-+
-+1. Introduction
-+===============
-+DDC/CI is a control protocol for monitor settings supported by most
-+monitors since about 2005. It is based on ACCESS.bus (an early USB predecessor).
-+It can be used to write drivers that communicate with a display over DDC/CI;
-+see ddcci-backlight for an example.
-+
-+2. sysfs interface
-+==================
-+Each detected DDC/CI device gets a directory in /sys/bus/ddcci/devices.
-+The main device on a bus is named ddcci[I2C bus number].
-+Internal dependent devices are named ddcci[I2C bus number]i[hex address].
-+External dependent devices are named ddcci[I2C bus number]e[hex address].
-+There, the following files export information about the device:
-+
-+- capabilities
-+
-+The full ACCESS.bus capabilities string. It contains the protocol,
-+type and model of the device, a list of all supported command
-+codes, etc. See the ACCESS.bus spec for more information.
-+
-+- idProt
-+
-+ACCESS.bus protocol supported by the device. Usually "monitor".
-+
-+- idType
-+
-+ACCESS.bus device subtype. Usually "LCD" or "CRT".
-+
-+- idModel
-+
-+ACCESS.bus device model identifier. Usually a shortened form of the
-+device model name.
-+
-+- idVendor
-+
-+ACCESS.bus device vendor identifier. Empty if the Identification command
-+is not supported.
-+
-+- idModule
-+
-+ACCESS.bus device module identifier. Empty if the Identification command
-+is not supported.
-+
-+- idSerial
-+
-+32-bit device number. A fixed serial number if it's positive, a temporary
-+serial number if negative, and zero if the
-+Identification command is not supported.
-+
-+- modalias
-+
-+A combined identifier for driver selection. It has the form:
-+ddcci:[prot]-[type]-[model]-[vendor]-[module].
-+All non-alphanumeric characters (including whitespace) in the model,
-+vendor or module parts are replaced by underscores to prevent issues
-+with software like systemd-udevd.
-+
-+3. Character device interface
-+=============================
-+For each DDC/CI device a character device in
-+/dev/bus/ddcci/[I2C bus number]/ is created;
-+127 devices are assigned in total.
-+
-+The main device on the bus is named display.
-+
-+Internal dependent devices are named i[hex address].
-+
-+External dependent devices are named e[hex address].
-+
-+These character devices can be used to issue commands to a DDC/CI device
-+more easily than over i2c-dev devices. They should be opened unbuffered.
-+To send a command, just write the command byte and the arguments with a
-+single write() operation. The length byte and checksum are automatically
-+calculated.
-+
-+To read a response, use read() with a buffer big enough for the expected answer.
-+
-+NOTE: The maximum length of a DDC/CI message is 32 bytes.
-+
-+4. ddcci-backlight (monitor backlight driver)
-+=============================================
-+[This is not specific to the DDC/CI backlight driver; if you have already
-+dealt with backlight drivers, skip over this.]
-+
-+For each monitor that supports accessing the Backlight Level White
-+or the Luminance property, a backlight device of type "raw" named like the
-+corresponding ddcci device is created. You can find them in /sys/class/backlight/.
-+For convenience, a symlink "ddcci_backlight" pointing to the backlight device
-+is created on the device associated with the display connector in
-+/sys/class/drm/, as long as the graphics driver allows making this association.
-+
-+5. Limitations
-+==============
-+
-+- Dependent devices (sub-devices using DDC/CI that are directly wired to the
-+monitor, like calibration devices, IR remotes, etc.) aren't automatically
-+detected. You can force detection of external dependent devices by writing
-+"ddcci-dependent [address]" into /sys/bus/i2c/i2c-?/new_device.
-+
-+There is no direct synchronization if you manually change the luminance
-+with the buttons on your monitor, as this can only be realized through polling
-+and some monitors close their OSD every time a DDC/CI command is received.
-+
-+Monitor hotplugging is not detected. You need to detach/reattach the I2C driver
-+or reload the module.
-+
-+6. Debugging
-+============
-+Both drivers use the dynamic debugging feature of the Linux kernel.
-+To get detailed debugging messages, set the dyndbg module parameter.
-+If you want to enable debugging permanently across reboots, create a file
-+/etc/modprobe.d/ddcci.conf containing lines like the following before loading the modules:
-+
-+options ddcci dyndbg
-+options ddcci-backlight dyndbg
-+
-+7. Origin
-+============
-+This driver originally came from Christoph Grenz in DKMS form here:
-+https://gitlab.com/ddcci-driver-linux/ddcci-driver-linux
-+with multiple backups available on the Wayback Machine. It also
-+includes an example program for using this driver from
-+userland.
-diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
-index 801d6c83f896..c8b3d0b8fe29 100644
---- a/drivers/char/Kconfig
-+++ b/drivers/char/Kconfig
-@@ -421,4 +421,15 @@ config ADI
- 	  and SSM (Silicon Secured Memory). Intended consumers of this
- 	  driver include crash and makedumpfile.
- -+config DDCCI -+ tristate "DDCCI display protocol support" -+ depends on I2C -+ help -+ Display Data Channel Command Interface is an -+ interface that allows the kernel to "talk" -+ to most displays made after 2005. Check your -+ display's specification to see if it has -+ support for this. This depends on I2C to -+ compile. -+ - endmenu -diff --git a/drivers/char/Makefile b/drivers/char/Makefile -index c5f532e412f1..b12476014311 100644 ---- a/drivers/char/Makefile -+++ b/drivers/char/Makefile -@@ -3,6 +3,7 @@ - # Makefile for the kernel character device drivers. - # - -+obj-$(CONFIG_DDCCI) += ddcci.o - obj-y += mem.o random.o - obj-$(CONFIG_TTY_PRINTK) += ttyprintk.o - obj-y += misc.o -diff --git a/drivers/char/ddcci.c b/drivers/char/ddcci.c -new file mode 100644 -index 000000000000..129aede43651 ---- /dev/null -+++ b/drivers/char/ddcci.c -@@ -0,0 +1,1909 @@ -+/* -+ * DDC/CI sub-bus driver -+ * -+ * Copyright (c) 2015 Christoph Grenz -+ */ -+ -+/* -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the Free -+ * Software Foundation; either version 2 of the License, or (at your option) -+ * any later version. -+ */ -+ -+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#define DDCCI_RECV_BUFFER_SIZE 130 -+#define DEVICE_NAME "ddcci" -+#define DDCCI_MAX_CAP_CHUNKS 200 -+ -+static unsigned int delay = 60; -+static unsigned short autoprobe_addrs[127] = {0xF0, 0xF2, 0xF4, 0xF6, 0xF8}; -+static int autoprobe_addr_count = 5; -+ -+static dev_t ddcci_cdev_first; -+static dev_t ddcci_cdev_next; -+static dev_t ddcci_cdev_end; -+static DEFINE_SEMAPHORE(core_lock, 1); -+ -+struct bus_type ddcci_bus_type; -+EXPORT_SYMBOL_GPL(ddcci_bus_type); -+static bool ddcci_bus_registered; -+ -+/* Assert neccessary string array sizes */ -+#ifndef sizeof_field -+# define sizeof_field(t,m) FIELD_SIZEOF(t,m) -+#endif -+static_assert(sizeof_field(struct ddcci_device, prot) > 8); -+static_assert(sizeof_field(struct ddcci_device, type) > 8); -+static_assert(sizeof_field(struct ddcci_device, model) > 8); -+static_assert(sizeof_field(struct ddcci_device, vendor) > 8); -+static_assert(sizeof_field(struct ddcci_device, module) > 8); -+ -+/* Internal per-i2c-client driver data */ -+struct ddcci_bus_drv_data { -+ unsigned long quirks; -+ struct i2c_client *i2c_dev; -+ struct semaphore sem; -+ unsigned char recv_buffer[DDCCI_RECV_BUFFER_SIZE]; -+}; -+ -+/* Replace non-alphanumeric characters in a string (used for modalias) */ -+static void ddcci_modalias_clean(char *string, size_t n, char replacement) -+{ -+ int i; -+ for (i = 0; i < n; ++i) { -+ char c = string[i]; -+ if (c == 0) { -+ return; -+ } else if (c < '0' || (c > '9' && c < 'A') || (c > 'Z' && c < 'a') || c > 'z') { -+ string[i] = replacement; -+ } -+ } -+} -+ -+/* Write a message to the DDC/CI bus using i2c_smbus_write_byte() */ -+static int __ddcci_write_bytewise(struct i2c_client *client, unsigned char addr, -+ bool p_flag, const unsigned char * __restrict buf, -+ unsigned char len) -+{ -+ int ret = 0; -+ unsigned char outer_addr = (unsigned char)(client->addr << 1); -+ unsigned xor = outer_addr; /* initial xor value */ -+ -+ /* Consistency checks */ -+ if (len > 127) -+ return -EINVAL; -+ -+ /* Special case: sender to 0x6E is always 0x51 */ -+ if (addr == DDCCI_DEFAULT_DEVICE_ADDR) { -+ addr = DDCCI_HOST_ADDR_ODD; 
-+ } else { -+ /* When sending the odd address is used */ -+ addr = addr | 1; -+ } -+ -+ /* first byte: sender address */ -+ xor ^= addr; -+ ret = i2c_smbus_write_byte(client, addr); -+ if (ret < 0) -+ return ret; -+ -+ /* second byte: protocol flag and message size */ -+ xor ^= ((p_flag << 7) | len); -+ ret = i2c_smbus_write_byte(client, (p_flag << 7)|len); -+ if (ret < 0) -+ return ret; -+ -+ /* send payload */ -+ while (len--) { -+ xor ^= (*buf); -+ ret = i2c_smbus_write_byte(client, (*buf)); -+ if (ret < 0) -+ return ret; -+ buf++; -+ } -+ -+ /* send checksum */ -+ ret = i2c_smbus_write_byte(client, xor); -+ return ret; -+} -+ -+/* Write a message to the DDC/CI bus using i2c_master_send() */ -+static int __ddcci_write_block(struct i2c_client *client, unsigned char addr, -+ unsigned char *sendbuf, bool p_flag, -+ const unsigned char *data, unsigned char len) -+{ -+ unsigned char outer_addr = (unsigned char)(client->addr << 1); -+ unsigned xor = outer_addr; /* initial xor value */ -+ unsigned char *ptr = sendbuf; -+ -+ /* Consistency checks */ -+ if (len > 127) -+ return -EINVAL; -+ -+ /* Special case: sender to 0x6E is always 0x51 */ -+ if (addr == DDCCI_DEFAULT_DEVICE_ADDR) { -+ addr = DDCCI_HOST_ADDR_ODD; -+ } else { -+ /* When sending the odd address is used */ -+ addr = addr | 1; -+ } -+ -+ /* first byte: sender address */ -+ xor ^= addr; -+ *(ptr++) = addr; -+ /* second byte: protocol flag and message size */ -+ xor ^= ((p_flag << 7) | len); -+ *(ptr++) = (p_flag << 7)|len; -+ /* payload */ -+ while (len--) { -+ xor ^= (*data); -+ *(ptr++) = (*data); -+ data++; -+ } -+ /* checksum */ -+ (*ptr) = xor; -+ -+ /* Send it */ -+ return i2c_master_send(client, sendbuf, ptr - sendbuf + 1); -+} -+ -+/* -+ * Write a message to the DDC/CI bus. -+ * -+ * You must hold the bus semaphore when calling this function. -+ */ -+static int ddcci_write(struct i2c_client *client, unsigned char addr, -+ bool p_flag, const unsigned char *data, -+ unsigned char len) -+{ -+ struct ddcci_bus_drv_data *drv_data; -+ unsigned char *sendbuf; -+ int ret; -+ -+ drv_data = i2c_get_clientdata(client); -+ -+ -+ pr_debug("sending to %d:%02x:%02x: %*ph\n", client->adapter->nr, -+ client->addr << 1, addr, len, data); -+ if (drv_data->quirks & DDCCI_QUIRK_WRITE_BYTEWISE) { -+ ret = __ddcci_write_bytewise(client, addr, p_flag, data, len); -+ } else { -+ sendbuf = drv_data->recv_buffer; -+ ret = __ddcci_write_block(client, addr, sendbuf, p_flag, data, len); -+ } -+ -+ return ret; -+} -+ -+/* -+ * Read a response from the DDC/CI bus with headers directly into a buffer. -+ * Always check for DDCCI_QUIRK_SKIP_FIRST_BYTE when using this function. -+ * The returned length contains the whole unmodified response. -+ * If -EMSGSIZE is returned, the buffer contains the response up to `len`. -+ * If any other negative error code is returned, the buffer content is -+ * unspecified. 
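-+ *
-+ * (The raw frame layout handled here is: source address byte, length byte
-+ * with the protocol flag in bit 7, payload, and a final XOR checksum byte.)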
-+ */ -+static int __ddcci_read(struct i2c_client *client, unsigned char addr, -+ bool p_flag, unsigned long quirks, unsigned char *buf, -+ unsigned char len) -+{ -+ int i, payload_len, packet_length, ret; -+ unsigned char xor = DDCCI_HOST_ADDR_EVEN; -+ -+ /* Consistency checks */ -+ if (len < 3) -+ return -EINVAL; -+ -+ /* Read frame */ -+ ret = i2c_master_recv(client, buf, len); -+ if (ret < 0) -+ goto out_err; -+ packet_length = ret; -+ -+ /* Skip first byte if quirk active */ -+ if ((quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE) && ret > 0 && len > 0) { -+ ret--; -+ len--; -+ buf++; -+ } -+ -+ /* If answer too short (= incomplete) break out */ -+ if (ret < 3) { -+ ret = -EIO; -+ goto out_err; -+ } -+ -+ /* validate first byte */ -+ if (unlikely(buf[0] != addr)) { -+ ret = (buf[0] == '\0') ? -EAGAIN : -EIO; -+ goto out_err; -+ } -+ -+ /* validate second byte (protocol flag) */ -+ if (unlikely((buf[1] & 0x80) != (p_flag << 7))) { -+ if (!p_flag || !(quirks & DDCCI_QUIRK_NO_PFLAG)) { -+ ret = -EIO; -+ goto out_err; -+ } -+ } -+ -+ /* get and check payload length */ -+ payload_len = buf[1] & 0x7F; -+ if (3+payload_len > packet_length) -+ return -EBADMSG; -+ if (3+payload_len > len) -+ return -EMSGSIZE; -+ -+ /* calculate checksum */ -+ for (i = 0; i < 3+payload_len; i++) -+ xor ^= buf[i]; -+ -+ /* verify checksum */ -+ if (xor != 0) { -+ dev_err(&client->dev, "invalid DDC/CI response, corrupted data - xor is 0x%02x, length 0x%02x\n", -+ xor, payload_len); -+ ret = -EBADMSG; -+ goto out_err; -+ } -+ -+ /* return result */ -+ ret = payload_len+3+((quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE)?1:0); -+ -+out_err: -+ return ret; -+} -+ -+/* -+ * Read a response from the DDC/CI bus -+ * -+ * You must hold the bus semaphore when calling this function. -+ */ -+static int ddcci_read(struct i2c_client *client, unsigned char addr, -+ bool p_flag, unsigned char *buf, unsigned char len) -+{ -+ struct ddcci_bus_drv_data *drv_data; -+ unsigned char *recvbuf; -+ int ret; -+ -+ drv_data = i2c_get_clientdata(client); -+ recvbuf = drv_data->recv_buffer; -+ -+ /* Read frame */ -+ ret = __ddcci_read(client, addr, p_flag, -+ drv_data->quirks, recvbuf, DDCCI_RECV_BUFFER_SIZE); -+ if (ret < 0) -+ return ret; -+ -+ if (drv_data->quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE) -+ recvbuf++; -+ -+ /* return result */ -+ if (buf) { -+ if (ret > 3) { -+ ret = ret-3; -+ /* copy to caller buffer */ -+ memcpy(buf, &recvbuf[2], (ret < len) ? 
ret : len); -+ -+ if (ret > len) { -+ /* if message was truncated, return -EMSGSIZE */ -+ pr_debug("received from %d:%02x:%02x: [%u/%u] %*ph ...\n", -+ client->adapter->nr, client->addr << 1, -+ addr, ret, len, len, buf); -+ ret = -EMSGSIZE; -+ } else { -+ pr_debug("received from %d:%02x:%02x: [%u/%u] %*ph\n", -+ client->adapter->nr, client->addr << 1, -+ addr, ret, len, ret, buf); -+ } -+ } -+ } -+ if (!(drv_data->quirks & DDCCI_QUIRK_WRITE_BYTEWISE)) { -+ /* second read to clear buffers, needed on some devices */ -+ __ddcci_read(client, addr, true, drv_data->quirks, recvbuf, 1); -+ } -+ return ret; -+} -+ -+/* Request the capability string for a device and put it into buf */ -+static int ddcci_get_caps(struct i2c_client *client, unsigned char addr, -+ unsigned char *buf, unsigned int len) -+{ -+ int result = 0, counter = 0, offset = 0; -+ unsigned char cmd[3] = { DDCCI_COMMAND_CAPS, 0x00, 0x00 }; -+ unsigned char *chunkbuf = kzalloc(35, GFP_KERNEL); -+ -+ if (!chunkbuf) -+ return -ENOMEM; -+ -+ do { -+ /* Send command */ -+ result = ddcci_write(client, addr, true, cmd, sizeof(cmd)); -+ if (result < 0) -+ goto err_free; -+ msleep(delay); -+ /* read result chunk */ -+ result = ddcci_read(client, addr, true, chunkbuf, -+ (len > 32) ? 35 : len+3); -+ if (result < 0) -+ goto err_free; -+ -+ if (result > 0) { -+ /* check chunk header */ -+ if (chunkbuf[0] != DDCCI_REPLY_CAPS) { -+ result = -EIO; -+ goto err_free; -+ } -+ if (chunkbuf[1] != cmd[1] || chunkbuf[2] != cmd[2]) { -+ result = -EIO; -+ goto err_free; -+ } -+ if (result < 3) { -+ result = -EIO; -+ goto err_free; -+ } -+ memcpy(buf, chunkbuf+3, min((unsigned int)result-3, len)); -+ -+ counter++; -+ /* adjust offset, etc. */ -+ offset += result-3; -+ len -= result-3; -+ buf += result-3; -+ cmd[1] = offset >> 8; -+ cmd[2] = offset & 0xFF; -+ /* Another superfluous read to make some devices happy... */ -+ ddcci_read(client, addr, true, NULL, 2); -+ } -+ } while (result > 3 && counter < DDCCI_MAX_CAP_CHUNKS); -+ -+ kfree(chunkbuf); -+ return offset+result-3; -+err_free: -+ kfree(chunkbuf); -+ return result; -+} -+ -+/* -+ * Request the device identification and put it into buf. -+ * -+ * Also detects all communication quirks and sets the corresponding flags -+ * in the ddcci_bus_drv_data structure associated with client. -+ * -+ * The identification command will fail on most DDC devices, as it is optional -+ * to support, but even the "failed" response suffices to detect quirks. 
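-+ *
-+ * (A valid identification reply carries the vendor and module strings plus
-+ * a 32 bit device number; see the parsing in ddcci_detect_device().)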
-+ */ -+static int ddcci_identify_device(struct i2c_client *client, unsigned char addr, -+ unsigned char *buf, unsigned char len) -+{ -+ int i, payload_len, ret = -ENODEV; -+ unsigned long quirks; -+ unsigned char cmd[1] = { DDCCI_COMMAND_ID }; -+ unsigned char *buffer; -+ unsigned char xor = DDCCI_HOST_ADDR_EVEN; -+ struct ddcci_bus_drv_data *bus_drv_data; -+ -+ bus_drv_data = i2c_get_clientdata(client); -+ quirks = bus_drv_data->quirks; -+ buffer = bus_drv_data->recv_buffer; -+ -+ /* Send Identification command */ -+ if (!(quirks & DDCCI_QUIRK_WRITE_BYTEWISE)) { -+ ret = __ddcci_write_block(client, addr, buffer, true, cmd, sizeof(cmd)); -+ dev_dbg(&client->dev, -+ "[%02x:%02x] writing identification command in block mode: %d\n", -+ client->addr << 1, addr, ret); -+ if ((ret == -ENXIO) -+ && i2c_check_functionality(client->adapter, -+ I2C_FUNC_SMBUS_WRITE_BYTE)) { -+ quirks |= DDCCI_QUIRK_WRITE_BYTEWISE; -+ dev_info(&client->dev, -+ "DDC/CI bus quirk detected: writes must be done bytewise\n"); -+ /* Some devices need writing twice after a failed blockwise write */ -+ __ddcci_write_bytewise(client, addr, true, cmd, sizeof(cmd)); -+ msleep(delay); -+ } -+ } -+ if (ret < 0 && (quirks & DDCCI_QUIRK_WRITE_BYTEWISE)) { -+ ret = __ddcci_write_bytewise(client, addr, true, cmd, sizeof(cmd)); -+ dev_dbg(&client->dev, -+ "[%02x:%02x] writing identification command in bytewise mode: %d\n", -+ client->addr << 1, addr, ret); -+ } -+ if (ret < 0) -+ return -ENODEV; -+ -+ /* Wait */ -+ msleep(delay); -+ -+ /* Receive response */ -+ ret = i2c_master_recv(client, buffer, DDCCI_RECV_BUFFER_SIZE); -+ if (ret < 0) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] receiving identification response resulted in errno %d\n", -+ client->addr << 1, addr, ret); -+ return ret; -+ } -+ -+ if (ret == 0) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] no identification response received\n", -+ client->addr << 1, addr); -+ return ret; -+ } -+ -+ /* Skip first byte if quirk already active */ -+ if (quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE && ret > 1) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] doubled first byte quirk in effect\n", -+ client->addr << 1, addr); -+ ret--; -+ buffer++; -+ } -+ -+ /* If answer too short (= incomplete) break out */ -+ if (ret < 3) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response is too short (%d bytes)\n", -+ client->addr << 1, addr, ret); -+ return -EIO; -+ } -+ -+ /* validate first byte */ -+ if (buffer[0] != addr) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response: %*ph\n", -+ client->addr << 1, addr, (ret > 32 ? 
32 : ret), buffer); -+ -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response invalid (expected first byte %02x, got %02x)\n", -+ client->addr << 1, addr, addr, buffer[0]); -+ return -ENODEV; -+ } -+ -+ /* Check if first byte is doubled (QUIRK_SKIP_FIRST_BYTE) */ -+ if (!(quirks & DDCCI_QUIRK_SKIP_FIRST_BYTE)) { -+ if (buffer[0] == buffer[1]) { -+ quirks |= DDCCI_QUIRK_SKIP_FIRST_BYTE; -+ dev_info(&client->dev, -+ "DDC/CI bus quirk detected: doubled first byte on read\n"); -+ ret--; -+ buffer++; -+ if (ret < 3) -+ return -EIO; -+ } -+ } -+ -+ /* validate second byte (protocol flag) */ -+ if ((buffer[1] & 0x80) != 0x80 && !(quirks & DDCCI_QUIRK_NO_PFLAG)) { -+ dev_info(&client->dev, -+ "DDC/CI bus quirk detected: device omits protocol flag on responses\n"); -+ quirks |= DDCCI_QUIRK_NO_PFLAG; -+ } -+ -+ /* get and check payload length */ -+ payload_len = buffer[1] & 0x7F; -+ if (3+payload_len > ret) { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response: %*ph ...\n", -+ client->addr << 1, addr, ret, buffer); -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response was truncated (expected %d bytes, got %d)\n", -+ client->addr << 1, addr, 3+payload_len, ret); -+ return -EBADMSG; -+ } -+ -+ dev_dbg(&client->dev, -+ "[%02x:%02x] identification response: %*ph\n", -+ client->addr << 1, addr, 3+payload_len, buffer); -+ -+ /* calculate checksum */ -+ for (i = 0; i < 3+payload_len; i++) -+ xor ^= buffer[i]; -+ -+ /* verify checksum */ -+ if (xor != 0) { -+ dev_err(&client->dev, -+ "[%02x:%02x] invalid DDC/CI response, corrupted data - xor is 0x%02x, length 0x%02x\n", -+ client->addr << 1, addr, xor, payload_len); -+ return -EBADMSG; -+ } -+ -+ /* save quirks */ -+ bus_drv_data->quirks = quirks; -+ -+ /* return result */ -+ if (payload_len <= len) { -+ ret = payload_len; -+ memcpy(buf, &buffer[2], payload_len); -+ } else { -+ ret = -EMSGSIZE; -+ memcpy(buf, &buffer[2], len); -+ } -+ return ret; -+} -+ -+/* Character device */ -+ -+/* Data structure for an open file handle */ -+struct ddcci_fp_data { -+ struct ddcci_device *dev; -+ bool exclusive; -+ unsigned char buffer[129]; -+}; -+ -+/* Called when the character device is opened */ -+static int ddcci_cdev_open(struct inode *inode, struct file *filp) -+{ -+ struct ddcci_device *dev = container_of(inode->i_cdev, -+ struct ddcci_device, cdev); -+ struct ddcci_fp_data *fp_data = NULL; -+ -+ fp_data = kzalloc(sizeof(struct ddcci_fp_data), GFP_KERNEL); -+ -+ if (!fp_data) -+ return -ENOMEM; -+ -+ fp_data->exclusive = filp->f_flags & O_EXCL; -+ -+ if (fp_data->exclusive) { -+ if (down_write_trylock(&dev->cdev_sem) == 0) { -+ kfree(fp_data); -+ return -EBUSY; -+ } -+ } else { -+ if (down_read_trylock(&dev->cdev_sem) == 0) { -+ kfree(fp_data); -+ return -EBUSY; -+ } -+ } -+ -+ fp_data->dev = dev; -+ filp->private_data = fp_data; -+ -+ return 0; -+} -+ -+/* Called when the character device is closed */ -+static int ddcci_cdev_close(struct inode *inode, struct file *filp) -+{ -+ struct ddcci_fp_data *fp_data = filp->private_data; -+ struct ddcci_device *dev = fp_data->dev; -+ -+ if (fp_data->exclusive) -+ up_write(&dev->cdev_sem); -+ else -+ up_read(&dev->cdev_sem); -+ -+ filp->private_data = NULL; -+ kfree(fp_data); -+ return 0; -+} -+ -+/* Called when reading from the character device */ -+static ssize_t ddcci_cdev_read(struct file *filp, char __user *buffer, -+ size_t length, loff_t *offset) -+{ -+ struct ddcci_fp_data *fp_data = filp->private_data; -+ struct ddcci_device *dev = fp_data->dev; -+ unsigned char *buf = 
fp_data->buffer; -+ const bool nonblocking = (filp->f_flags & O_NONBLOCK) != 0; -+ int ret = 0; -+ -+ if ((filp->f_mode & FMODE_READ) == 0) -+ return -EBADF; -+ -+ /* Lock mutex */ -+ if (nonblocking) { -+ if (down_trylock(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ } else { -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -ERESTARTSYS; -+ } -+ -+ /* Execute read */ -+ ret = ddcci_read(dev->bus_drv_data->i2c_dev, dev->inner_addr, true, buf, -+ length); -+ -+ if (ret > 0) { -+ /* Copy data from user space */ -+ if (copy_to_user(buffer, buf, ret)) { -+ ret = -EFAULT; -+ goto out; -+ } -+ } -+ -+out: -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+ -+/* Called when writing to the character device */ -+static ssize_t ddcci_cdev_write(struct file *filp, const char __user *buffer, -+ size_t count, loff_t *offset) -+{ -+ struct ddcci_fp_data *fp_data = filp->private_data; -+ struct ddcci_device *dev = fp_data->dev; -+ unsigned char *buf = fp_data->buffer; -+ const bool nonblocking = (filp->f_flags & O_NONBLOCK) != 0; -+ int ret = 0; -+ -+ if ((filp->f_mode & FMODE_WRITE) == 0) -+ return -EBADF; -+ -+ if (count > 127) -+ return -EINVAL; -+ -+ /* Lock mutex */ -+ if (nonblocking) { -+ if (down_trylock(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ } else { -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -ERESTARTSYS; -+ } -+ -+ if (count > 0) { -+ /* Copy data from user space */ -+ if (copy_from_user(buf, buffer, count)) { -+ ret = -EFAULT; -+ goto err_out; -+ } -+ -+ /* Execute write */ -+ ret = ddcci_write(dev->bus_drv_data->i2c_dev, dev->inner_addr, -+ true, buf, count); -+ } -+ -+ if (ret >= 0) { -+ msleep(delay); -+ up(&dev->bus_drv_data->sem); -+ return count; -+ } -+ -+err_out: -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+ -+/* Called when seeking the character device */ -+static loff_t ddcci_cdev_seek(struct file *filp, loff_t offset, int anchor) -+{ -+ return -EINVAL; -+} -+ -+static const struct file_operations ddcci_fops = { -+ .owner = THIS_MODULE, -+ .read = ddcci_cdev_read, -+ .write = ddcci_cdev_write, -+ .open = ddcci_cdev_open, -+ .release = ddcci_cdev_close, -+ .llseek = ddcci_cdev_seek -+}; -+ -+/* Set up the character device for a DDC/CI device */ -+static int ddcci_setup_char_device(struct ddcci_device *device) -+{ -+ int ret = -EINVAL; -+ -+ /* Check if free minor exists */ -+ if (ddcci_cdev_next == ddcci_cdev_end) { -+ dev_err(&device->dev, "no free major/minor\n"); -+ ret = -ENFILE; -+ goto out; -+ } -+ -+ /* Initialize rwsem */ -+ init_rwsem(&device->cdev_sem); -+ -+ /* Initialize character device node */ -+ cdev_init(&device->cdev, &ddcci_fops); -+ device->cdev.owner = THIS_MODULE; -+ -+ /* Publish char device */ -+ device->dev.devt = ddcci_cdev_next; -+ ret = cdev_add(&device->cdev, ddcci_cdev_next, 1); -+ if (ret) { -+ device->dev.devt = 0; -+ goto out; -+ } -+ -+ ddcci_cdev_next++; -+out: -+ return ret; -+} -+ -+/* sysfs attributes */ -+ -+static ssize_t ddcci_attr_capabilities_show(struct device *dev, -+ struct device_attribute *attr, -+ char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = device->capabilities_len; -+ if (unlikely(len > PAGE_SIZE)) -+ len = PAGE_SIZE; -+ if (len == 0) { -+ ret = len; -+ } else { -+ memcpy(buf, device->capabilities, len); -+ if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } -+ } -+ } -+ -+ return ret; -+} -+ -+static ssize_t ddcci_attr_prot_show(struct device *dev, 
-+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->prot, sizeof(device->prot)); -+ strncpy(buf, device->prot, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_type_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->type, sizeof(device->type)); -+ strncpy(buf, device->type, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_model_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->model, sizeof(device->model)); -+ strncpy(buf, device->model, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_vendor_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->vendor, sizeof(device->vendor)); -+ strncpy(buf, device->vendor, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_module_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ size_t len; -+ -+ if (likely(device != NULL)) { -+ len = strnlen(device->module, sizeof(device->module)); -+ strncpy(buf, device->module, PAGE_SIZE); -+ if (len == 0) { -+ ret = len; -+ } else if (likely(len < PAGE_SIZE)) { -+ buf[len] = '\n'; -+ ret = len+1; -+ } else { -+ ret = PAGE_SIZE; -+ } -+ } -+ return ret; -+} -+ -+static ssize_t ddcci_attr_serial_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ -+ if (likely(device != NULL)) -+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", device->device_number); -+ -+ return ret; -+} -+ -+static ssize_t ddcci_attr_modalias_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ ssize_t ret = -ENOENT; -+ char model[ARRAY_SIZE(device->model)]; -+ char vendor[ARRAY_SIZE(device->model)]; -+ char module[ARRAY_SIZE(device->model)]; -+ -+ if (likely(device != NULL)) { -+ memcpy(model, device->model, sizeof(model)); -+ memcpy(vendor, device->vendor, sizeof(vendor)); -+ memcpy(module, device->module, sizeof(module)); -+ ddcci_modalias_clean(model, sizeof(model), '_'); -+ ddcci_modalias_clean(vendor, sizeof(vendor), '_'); -+ ddcci_modalias_clean(module, sizeof(module), '_'); -+ -+ ret = scnprintf(buf, PAGE_SIZE, 
"%s%s-%s-%s-%s-%s\n", -+ DDCCI_MODULE_PREFIX, -+ device->prot, -+ device->type, -+ model, -+ vendor, -+ module -+ ); -+ } -+ return ret; -+} -+ -+static DEVICE_ATTR(capabilities, S_IRUGO, ddcci_attr_capabilities_show, NULL); -+static DEVICE_ATTR(idProt, S_IRUGO, ddcci_attr_prot_show, NULL); -+static DEVICE_ATTR(idType, S_IRUGO, ddcci_attr_type_show, NULL); -+static DEVICE_ATTR(idModel, S_IRUGO, ddcci_attr_model_show, NULL); -+static DEVICE_ATTR(idVendor, S_IRUGO, ddcci_attr_vendor_show, NULL); -+static DEVICE_ATTR(idModule, S_IRUGO, ddcci_attr_module_show, NULL); -+static DEVICE_ATTR(idSerial, S_IRUGO, ddcci_attr_serial_show, NULL); -+static DEVICE_ATTR(modalias, S_IRUGO, ddcci_attr_modalias_show, NULL); -+ -+static struct attribute *ddcci_char_device_attrs[] = { -+ &dev_attr_capabilities.attr, -+ &dev_attr_idProt.attr, -+ &dev_attr_idType.attr, -+ &dev_attr_idModel.attr, -+ &dev_attr_idVendor.attr, -+ &dev_attr_idModule.attr, -+ &dev_attr_idSerial.attr, -+ &dev_attr_modalias.attr, -+ NULL, -+}; -+ATTRIBUTE_GROUPS(ddcci_char_device); -+ -+/* DDC/CI bus */ -+ -+static int ddcci_device_uevent(const struct device *dev, struct kobj_uevent_env *env) -+{ -+ struct ddcci_device *device = to_ddcci_device(dev); -+ char model[ARRAY_SIZE(device->model)]; -+ char vendor[ARRAY_SIZE(device->vendor)]; -+ char module[ARRAY_SIZE(device->module)]; -+ -+ memcpy(model, device->model, sizeof(model)); -+ memcpy(vendor, device->vendor, sizeof(vendor)); -+ memcpy(module, device->module, sizeof(module)); -+ ddcci_modalias_clean(model, sizeof(model), '_'); -+ ddcci_modalias_clean(vendor, sizeof(vendor), '_'); -+ ddcci_modalias_clean(module, sizeof(module), '_'); -+ -+ if (add_uevent_var(env, "MODALIAS=%s%s-%s-%s-%s-%s", -+ DDCCI_MODULE_PREFIX, -+ device->prot, -+ device->type, -+ model, -+ vendor, -+ module -+ )) -+ return -ENOMEM; -+ -+ if (device->prot[0]) -+ if (add_uevent_var(env, "DDCCI_PROT=%s", device->prot)) -+ return -ENOMEM; -+ -+ if (device->type[0]) -+ if (add_uevent_var(env, "DDCCI_TYPE=%s", device->type)) -+ return -ENOMEM; -+ -+ if (device->model[0]) -+ if (add_uevent_var(env, "DDCCI_MODEL=%s", device->model)) -+ return -ENOMEM; -+ -+ if (device->vendor[0]) { -+ if (add_uevent_var(env, "DDCCI_VENDOR=%s", device->vendor)) -+ return -ENOMEM; -+ -+ if (add_uevent_var(env, "DDCCI_MODULE=%s", device->module)) -+ return -ENOMEM; -+ -+ if (add_uevent_var(env, "DDCCI_UNIQ=%d", device->device_number)) -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static void ddcci_device_release(struct device *dev) -+{ -+ struct ddcci_device *device = to_ddcci_device(dev); -+ struct ddcci_driver *driver; -+ -+ /* Notify driver */ -+ if (dev->driver) { -+ driver = to_ddcci_driver(dev->driver); -+ if (driver->remove) -+ driver->remove(device); -+ } -+ -+ /* Teardown chardev */ -+ if (dev->devt) { -+ down(&core_lock); -+ if (device->cdev.dev == ddcci_cdev_next-1) -+ ddcci_cdev_next--; -+ cdev_del(&device->cdev); -+ up(&core_lock); -+ } -+ -+ /* Free capability string */ -+ if (device->capabilities) { -+ device->capabilities_len = 0; -+ kfree(device->capabilities); -+ } -+ /* Free device */ -+ kfree(device); -+} -+ -+static char *ddcci_devnode(const struct device *dev, -+ umode_t *mode, kuid_t *uid, kgid_t *gid) -+{ -+ struct ddcci_device *device; -+ -+ device = to_ddcci_device(dev); -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/display", -+ device->i2c_client->adapter->nr); -+} -+ -+static char *ddcci_dependent_devnode(const struct device *dev, -+ umode_t *mode, kuid_t *uid, kgid_t *gid) -+{ -+ struct ddcci_device *device; -+ 
-+ device = to_ddcci_device(dev); -+ if (device->flags & DDCCI_FLAG_EXTERNAL) { -+ if (device->outer_addr == device->inner_addr) -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/e%02x", -+ device->i2c_client->adapter->nr, -+ device->outer_addr); -+ else -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/e%02x%02x", -+ device->i2c_client->adapter->nr, -+ device->outer_addr, device->inner_addr); -+ } else { -+ return kasprintf(GFP_KERNEL, "bus/ddcci/%d/i%02x", -+ device->i2c_client->adapter->nr, -+ device->inner_addr); -+ } -+} -+ -+/* Device type for main DDC/CI devices*/ -+static struct device_type ddcci_device_type = { -+ .name = "ddcci-device", -+ .uevent = ddcci_device_uevent, -+ .groups = ddcci_char_device_groups, -+ .release = ddcci_device_release, -+ .devnode = ddcci_devnode -+}; -+ -+/* Device type for dependent DDC/CI devices*/ -+static struct device_type ddcci_dependent_type = { -+ .name = "ddcci-dependent-device", -+ .uevent = ddcci_device_uevent, -+ .groups = ddcci_char_device_groups, -+ .release = ddcci_device_release, -+ .devnode = ddcci_dependent_devnode -+}; -+ -+/** -+ * ddcci_verify_device - return parameter as ddcci_device, or NULL -+ * @dev: device, probably from some driver model iterator -+ */ -+struct ddcci_device *ddcci_verify_device(struct device *dev) -+{ -+ if (unlikely(!dev)) -+ return NULL; -+ return (dev->type == &ddcci_device_type -+ || dev->type == &ddcci_dependent_type) -+ ? to_ddcci_device(dev) -+ : NULL; -+} -+EXPORT_SYMBOL(ddcci_verify_device); -+ -+/** -+ * ddcci_quirks - Get quirks for DDC/CI device -+ * @dev: Target DDC/CI device -+ */ -+unsigned long ddcci_quirks(struct ddcci_device *dev) -+{ -+ if (unlikely(WARN_ON(!dev))) -+ return ~0L; -+ if (unlikely(WARN_ON(!dev->bus_drv_data))) -+ return ~0L; -+ return dev->bus_drv_data->quirks; -+} -+EXPORT_SYMBOL(ddcci_quirks); -+ -+/** -+ * ddcci_register_driver - register DDC/CI driver -+ * @owner: the owning module -+ * @driver: the driver to register -+ */ -+int ddcci_register_driver(struct module *owner, struct ddcci_driver *driver) -+{ -+ int ret; -+ -+ /* Can't register until after driver model init */ -+ if (unlikely(WARN_ON(!ddcci_bus_registered))) -+ return -EAGAIN; -+ -+ pr_debug("registering driver [%s]\n", driver->driver.name); -+ -+ /* add the driver to the list of ddcci drivers in the driver core */ -+ driver->driver.owner = owner; -+ driver->driver.bus = &ddcci_bus_type; -+ -+ /* When registration returns, the driver core -+ * will have called probe() for all matching-but-unbound devices. 
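-+ * Matching is done against the ids in driver->id_table, see
-+ * ddcci_device_match().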
-+ */ -+ ret = driver_register(&driver->driver); -+ if (ret) -+ return ret; -+ -+ pr_debug("driver [%s] registered\n", driver->driver.name); -+ -+ return 0; -+} -+EXPORT_SYMBOL(ddcci_register_driver); -+ -+/** -+ * ddcci_del_driver - unregister DDC/CI driver -+ * @driver: the driver being unregistered -+ */ -+void ddcci_del_driver(struct ddcci_driver *driver) -+{ -+ driver_unregister(&driver->driver); -+ pr_debug("driver [%s] unregistered\n", driver->driver.name); -+} -+EXPORT_SYMBOL(ddcci_del_driver); -+ -+/** -+ * ddcci_device_write - Write a message to a DDC/CI device -+ * @dev: Target DDC/CI device -+ * @p_flag: Protocol flag, true for standard control messages -+ * @data: Data that will be written to the device -+ * @length: How many bytes to write -+ * -+ * Writes the message to the device and sleeps (see module parameter 'delay') -+ */ -+int ddcci_device_write(struct ddcci_device *dev, bool p_flag, -+ unsigned char *data, unsigned char length) -+{ -+ int ret; -+ -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ -+ ret = ddcci_write(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, data, length); -+ msleep(delay); -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+EXPORT_SYMBOL(ddcci_device_write); -+ -+/** -+ * ddcci_device_read - Read a response from a DDC/CI device -+ * @dev: Target DDC/CI device -+ * @p_flag: Protocol flag, must match the corresponding write -+ * @buffer: Where to store data read from the device -+ * @length: Buffer size -+ */ -+int ddcci_device_read(struct ddcci_device *dev, bool p_flag, -+ unsigned char *buffer, unsigned char length) -+{ -+ int ret; -+ -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ -+ ret = ddcci_read(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, buffer, length); -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+EXPORT_SYMBOL(ddcci_device_read); -+ -+/** -+ * ddcci_device_writeread - Write a message to a device and read the response -+ * @dev: Target DDC/CI device -+ * @p_flag: Protocol flag, true for standard control messages -+ * @buffer: Buffer used for write and read -+ * @length: How many bytes to write -+ * @maxlength: Buffer size on read -+ * -+ * Writing, sleeping and reading are done without releasing the DDC/CI bus. -+ * This provides atomicity in respect to other DDC/CI accesses on the same bus. 
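-+ *
-+ * For example, a "Get VCP Feature" request places the opcode 0x01 and the
-+ * VCP code in the buffer; the 8 byte reply is then read back into the same
-+ * buffer.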
-+ */ -+int ddcci_device_writeread(struct ddcci_device *dev, bool p_flag, -+ unsigned char *buffer, unsigned char length, -+ unsigned char maxlength) -+{ -+ int ret; -+ -+ if (down_interruptible(&dev->bus_drv_data->sem)) -+ return -EAGAIN; -+ -+ ret = ddcci_write(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, buffer, length); -+ if (ret < 0) -+ goto err; -+ msleep(delay); -+ ret = ddcci_read(dev->bus_drv_data->i2c_dev, dev->inner_addr, p_flag, buffer, maxlength); -+err: -+ up(&dev->bus_drv_data->sem); -+ return ret; -+} -+EXPORT_SYMBOL(ddcci_device_writeread); -+ -+#define IS_ANY_ID(x) (((x)[0] == '\xFF') && ((x)[7] == '\xFF')) -+ -+/* Check if any device id in the array matches the device and return the matching id */ -+static const struct ddcci_device_id *ddcci_match_id(const struct ddcci_device_id *id, -+ const struct ddcci_device *device) -+{ -+ while (id->prot[0] || id->type[0] || id->model[0] || id->vendor[0] || id->module[0]) { -+ if ((IS_ANY_ID(id->prot) || (strcmp(device->prot, id->prot) == 0)) -+ && (IS_ANY_ID(id->type) || (strcmp(device->type, id->type) == 0)) -+ && (IS_ANY_ID(id->model) || (strcmp(device->model, id->model) == 0)) -+ && (IS_ANY_ID(id->vendor) || (strcmp(device->vendor, id->vendor) == 0)) -+ && (IS_ANY_ID(id->module) || (strcmp(device->module, id->module) == 0))) { -+ return id; -+ } -+ id++; -+ } -+ return NULL; -+} -+ -+static int ddcci_device_match(struct device *dev, struct device_driver *drv) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ struct ddcci_driver *driver; -+ -+ if (!device) -+ return 0; -+ -+ driver = to_ddcci_driver(drv); -+ /* match on an id table if there is one */ -+ if (driver->id_table) -+ return ddcci_match_id(driver->id_table, device) != NULL; -+ -+ return 0; -+} -+ -+static int ddcci_device_probe(struct device *dev) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ struct ddcci_driver *driver; -+ const struct ddcci_device_id *id; -+ int ret = 0; -+ -+ if (!device) -+ return -EINVAL; -+ driver = to_ddcci_driver(dev->driver); -+ -+ id = ddcci_match_id(driver->id_table, device); -+ if (!id) -+ return -ENODEV; -+ -+ if (driver->probe) -+ ret = driver->probe(device, id); -+ -+ return ret; -+} -+ -+static int ddcci_device_remove(struct device *dev) -+{ -+ struct ddcci_device *device = ddcci_verify_device(dev); -+ struct ddcci_driver *driver; -+ int ret = 0; -+ -+ if (!device) -+ return -EINVAL; -+ driver = to_ddcci_driver(dev->driver); -+ -+ if (driver->remove) -+ ret = driver->remove(device); -+ -+ return ret; -+} -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) -+static void ddcci_device_remove_void(struct device *dev) -+{ -+ ddcci_device_remove(dev); -+} -+#endif -+ -+/** -+ * DDCCI bus type structure -+ */ -+struct bus_type ddcci_bus_type = { -+ .name = "ddcci", -+ .match = ddcci_device_match, -+ .probe = ddcci_device_probe, -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) -+ .remove = ddcci_device_remove_void -+#else -+ .remove = ddcci_device_remove -+#endif -+}; -+ -+/* Main I2C driver */ -+ -+/* Get a pointer to the closing parenthesis */ -+static char *ddcci_capstr_tok(const char *s, int depth) -+{ -+ const char *ptr = s; -+ char *end; -+ -+ if (s == NULL || s[0] == '\0') -+ return NULL; -+ -+ while ((end = strpbrk(ptr, "()"))) { -+ if (!end || depth == INT_MAX) -+ return NULL; -+ if (*end == '(') -+ depth++; -+ else if (depth > 0) -+ depth--; -+ else -+ break; -+ ptr = end+1; -+ } -+ return end; -+} -+ -+/** -+ * ddcci_find_capstr_item - Search capability string for a tag -+ * 
@capabilities: Capability string to search -+ * @tag: Tag to find -+ * @length: Buffer for the length of the found tag value (optional) -+ * -+ * Return a pointer to the start of the tag value (directly after the '(') on -+ * success and write the length of the value (excluding the ')') into `length`. -+ * -+ * If the tag is not found or another error occurs, an ERR_PTR is returned -+ * and `length` stays untouched. -+ */ -+const char *ddcci_find_capstr_item(const char * capabilities, -+ const char * __restrict tag, -+ size_t *length) -+{ -+ const char *src = capabilities, *ptr; -+ ptrdiff_t len; -+ int taglen = strnlen(tag, 1024); -+ -+ /* Check length of requested tag */ -+ if (unlikely(taglen <= 0 || taglen > 1023)) -+ return ERR_PTR(-EINVAL); -+ -+ /* Find tag */ -+ while (src && (strncmp(src+1, tag, taglen) != 0 || src[1+taglen] != '(')) -+ src = ddcci_capstr_tok(src+1, -1); -+ if (!src || src[0] == '\0') -+ return ERR_PTR(-ENOENT); -+ -+ /* Locate end of value */ -+ src += taglen+2; -+ ptr = ddcci_capstr_tok(src, 0); -+ if (unlikely(!ptr)) -+ return ERR_PTR(-EOVERFLOW); -+ -+ /* Check length of tag data */ -+ len = ptr-src; -+ if (unlikely(len < 0 || len > 65535)) -+ return ERR_PTR(-EMSGSIZE); -+ -+ /* Return pointer and length */ -+ if (likely(length != NULL)) -+ *length = (size_t)len; -+ return src; -+} -+EXPORT_SYMBOL(ddcci_find_capstr_item); -+ -+/* Search the capability string for a tag and copy the value to dest */ -+static int ddcci_cpy_capstr_item(char *dest, const char *src, -+ const char * __restrict tag, size_t maxlen) -+{ -+ const char *ptr; -+ size_t len; -+ -+ /* Find tag */ -+ ptr = ddcci_find_capstr_item(src, tag, &len); -+ if (IS_ERR(ptr)) { -+ return PTR_ERR(ptr); -+ } -+ -+ /* Copy value */ -+ memcpy(dest, ptr, min(len, maxlen)); -+ return 0; -+} -+ -+/* Fill fields in device by parsing the capability string */ -+static int ddcci_parse_capstring(struct ddcci_device *device) -+{ -+ const char *capstr = device->capabilities; -+ int ret = 0; -+ -+ if (!capstr) -+ return -EINVAL; -+ -+ /* capability string start with a paren */ -+ if (capstr[0] != '(') -+ return -EINVAL; -+ -+ /* get prot(...) */ -+ ret = ddcci_cpy_capstr_item(device->prot, capstr, "prot", sizeof(device->prot)-1); -+ if (ret) { -+ if (ret == -ENOENT) { -+ dev_warn(&device->dev, "malformed capability string: no protocol tag"); -+ memset(device->prot, 0, sizeof(device->prot)-1); -+ } else { -+ return ret; -+ } -+ } -+ -+ /* get type(...) */ -+ ret = ddcci_cpy_capstr_item(device->type, capstr, "type", sizeof(device->type)-1); -+ if (ret) { -+ if (ret == -ENOENT) { -+ dev_warn(&device->dev, "malformed capability string: no type tag"); -+ memset(device->type, 0, sizeof(device->type)-1); -+ } else { -+ return ret; -+ } -+ } -+ -+ /* and then model(...) */ -+ ret = ddcci_cpy_capstr_item(device->model, capstr, "model", sizeof(device->model)-1); -+ if (ret) { -+ if (ret == -ENOENT) { -+ dev_warn(&device->dev, "malformed capability string: no model tag"); -+ memset(device->model, 0, sizeof(device->model)-1); -+ } else { -+ return ret; -+ } -+ } -+ -+ /* if there is no protocol tag */ -+ if (!device->prot[0]) { -+ /* and no type tag: give up. 
*/ -+ if (!device->type[0]) -+ return -ENOENT; -+ -+ /* Assume protocol "monitor" if type is "LCD" or "CRT" */ -+ if (strncasecmp(device->type, "LCD", sizeof(device->type)-1) == 0 -+ || strncasecmp(device->type, "CRT", sizeof(device->type)-1) == 0) { -+ memcpy(device->prot, "monitor", 7); -+ } -+ } -+ -+ /* skip the rest for now */ -+ -+ return 0; -+} -+ -+/* Probe for a device on an inner address and create a ddcci_device for it */ -+static int ddcci_detect_device(struct i2c_client *client, unsigned char addr, -+ int dependent) -+{ -+ int ret; -+ unsigned char outer_addr = client->addr << 1; -+ unsigned char *buffer = NULL; -+ struct ddcci_bus_drv_data *drv_data = i2c_get_clientdata(client); -+ struct ddcci_device *device = NULL; -+ -+ down(&drv_data->sem); -+ -+ /* Allocate buffer big enough for any capability string */ -+ buffer = kmalloc(16384, GFP_KERNEL); -+ if (!buffer) { -+ ret = -ENOMEM; -+ goto err_end; -+ } -+ -+ /* Allocate device struct */ -+ device = kzalloc(sizeof(struct ddcci_device), GFP_KERNEL); -+ if (!device) { -+ ret = -ENOMEM; -+ goto err_end; -+ } -+ -+ /* Initialize device */ -+ device_initialize(&device->dev); -+ device->dev.parent = &client->dev; -+ device->dev.bus = &ddcci_bus_type; -+ device->outer_addr = outer_addr; -+ device->inner_addr = addr; -+ device->bus_drv_data = drv_data; -+ device->i2c_client = client; -+ -+ if (!dependent) { -+ device->dev.type = &ddcci_device_type; -+ ret = dev_set_name(&device->dev, "ddcci%d", client->adapter->nr); -+ } else if (outer_addr == dependent) { -+ /* Internal dependent device */ -+ device->dev.type = &ddcci_dependent_type; -+ device->flags = DDCCI_FLAG_DEPENDENT; -+ ret = dev_set_name(&device->dev, "ddcci%di%02x", client->adapter->nr, addr); -+ } else if (outer_addr == addr) { -+ /* External dependent device */ -+ device->dev.type = &ddcci_dependent_type; -+ device->flags = DDCCI_FLAG_DEPENDENT | DDCCI_FLAG_EXTERNAL; -+ ret = dev_set_name(&device->dev, "ddcci%de%02x", client->adapter->nr, addr); -+ } else { -+ /* Dependent device of external dependent device -+ Just in case something like this exists */ -+ device->dev.type = &ddcci_dependent_type; -+ device->flags = DDCCI_FLAG_DEPENDENT | DDCCI_FLAG_EXTERNAL; -+ ret = dev_set_name(&device->dev, "ddcci%de%02x%02x", client->adapter->nr, outer_addr, addr); -+ } -+ -+ if (ret) -+ goto err_free; -+ -+ /* Read identification and check for quirks */ -+ ret = ddcci_identify_device(client, addr, buffer, 29); -+ if (ret < 0) { -+ if (!dependent && (ret == -EBADMSG || ret == -EMSGSIZE)) { -+ dev_warn(&device->dev, "DDC/CI main device sent broken response on identification. Trying to detect solely based on capability information.\n"); -+ } else { -+ goto err_free; -+ } -+ } -+ -+ if (ret == 29 && buffer[0] == DDCCI_REPLY_ID) { -+ memcpy(device->vendor, &buffer[7], 8); -+ memcpy(device->module, &buffer[17], 8); -+ device->device_number = be32_to_cpu(*(__force __be32 *)&buffer[18]); -+ } -+ -+ /* Read capabilities */ -+ ret = ddcci_get_caps(client, addr, buffer, 16384); -+ if (ret > 0) { -+ /* Fixup unparenthesized capability strings, but only if the first -+ character is an ascii lower case letter. -+ This should still allow an early exit for completely garbled -+ data but helps detecting devices where only the parentheses are -+ missing, as the second char must be the first character of a -+ keyword. 
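-+	   (E.g. a quirky device may answer "prot(monitor)type(lcd)..."
-+	   instead of "(prot(monitor)type(lcd)...)".)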
*/ -+ if (ret > 2 && buffer[0] >= 'a' && buffer[0] <= 'z') { -+ dev_err(&device->dev, "DDC/CI device quirk detected: unparenthesized capability string\n"); -+ device->capabilities = kzalloc(ret+3, GFP_KERNEL); -+ if (!device->capabilities) { -+ ret = -ENOMEM; -+ goto err_free; -+ } -+ device->capabilities_len = ret+2; -+ memcpy(&(device->capabilities[1]), buffer, ret); -+ device->capabilities[0] = '('; -+ device->capabilities[ret+1] = ')'; -+ } else { -+ /* Standard case: simply copy the received string */ -+ device->capabilities = kzalloc(ret+1, GFP_KERNEL); -+ if (!device->capabilities) { -+ ret = -ENOMEM; -+ goto err_free; -+ } -+ device->capabilities_len = ret; -+ memcpy(device->capabilities, buffer, ret); -+ } -+ -+ ret = ddcci_parse_capstring(device); -+ if (ret) { -+ dev_err(&device->dev, "malformed capability string: \"%s\" errno %d\n", device->capabilities, ret); -+ ret = -EINVAL; -+ goto err_free; -+ } -+ } -+ -+ /* Found a device if either identification or capabilities succeeded */ -+ if (!device->capabilities && device->vendor[0] == '\0') { -+ dev_dbg(&client->dev, -+ "[%02x:%02x] got neither valid identification nor capability data\n", -+ client->addr << 1, addr); -+ ret = -ENODEV; -+ goto err_free; -+ } -+ -+ /* Setup chardev */ -+ down(&core_lock); -+ ret = ddcci_setup_char_device(device); -+ up(&core_lock); -+ if (ret) -+ goto err_free; -+ -+ /* Release semaphore and add device to the tree */ -+ up(&drv_data->sem); -+ pr_debug("found device at %d:%02x:%02x\n", client->adapter->nr, outer_addr, addr); -+ ret = device_add(&device->dev); -+ if (ret) -+ goto err_free; -+ -+ goto end; -+err_free: -+ put_device(&device->dev); -+err_end: -+ up(&drv_data->sem); -+end: -+ kfree(buffer); -+ return ret; -+} -+ -+/* I2C detect function: check if a main or external dependent device exists */ -+static int ddcci_detect(struct i2c_client *client, struct i2c_board_info *info) -+{ -+ int ret; -+ unsigned char outer_addr; -+ unsigned char inner_addr; -+ unsigned char buf[32]; -+ unsigned char cmd_id[1] = { DDCCI_COMMAND_ID }; -+ unsigned char cmd_caps[3] = { DDCCI_COMMAND_CAPS, 0x00, 0x00}; -+ unsigned char *cmd; -+ unsigned int cmd_len; -+ -+ /* Check for i2c_master_* functionality */ -+ if (!i2c_check_functionality(client->adapter, I2C_FUNC_I2C)) { -+ pr_debug("i2c adapter %d unsuitable: no i2c_master functionality\n", client->adapter->nr); -+ return -ENODEV; -+ } -+ -+ /* send Capabilities Request (for main) or Identification Request command (for dependent devices) */ -+ outer_addr = client->addr << 1; -+ inner_addr = (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? DDCCI_HOST_ADDR_ODD : outer_addr | 1; -+ cmd = (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? cmd_caps : cmd_id; -+ cmd_len = (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? 
sizeof(cmd_caps) : sizeof(cmd_id); -+ pr_debug("detecting %d:%02x\n", client->adapter->nr, outer_addr); -+ -+ ret = __ddcci_write_block(client, inner_addr, buf, true, cmd, cmd_len); -+ -+ if (ret == -ENXIO || ret == -EIO) { -+ if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_WRITE_BYTE)) { -+ pr_debug("i2c write failed with ENXIO or EIO but bytewise writing is not supported\n"); -+ return -ENODEV; -+ } -+ pr_debug("i2c write failed with ENXIO or EIO, trying bytewise writing\n"); -+ ret = __ddcci_write_bytewise(client, inner_addr, true, cmd, cmd_len); -+ if (ret == 0) { -+ msleep(delay); -+ ret = __ddcci_write_bytewise(client, inner_addr, true, cmd, cmd_len); -+ } -+ } -+ -+ if (ret < 0) -+ return -ENODEV; -+ -+ /* wait for device */ -+ msleep(delay); -+ /* receive answer */ -+ ret = i2c_master_recv(client, buf, 32); -+ if (ret < 3) { -+ pr_debug("detection failed: no answer\n"); -+ return -ENODEV; -+ } -+ -+ /* check response starts with outer addr */ -+ if (buf[0] != outer_addr) { -+ pr_debug("detection failed: invalid %s response (%02x != %02x)\n", (cmd == cmd_id) ? "identification" : "capabilities", buf[0], outer_addr); -+ pr_debug("received message was %*ph \n", ret, buf); -+ return -ENODEV; -+ } -+ -+ pr_debug("detected %d:%02x\n", client->adapter->nr, outer_addr); -+ -+ /* set device type */ -+ strlcpy(info->type, (outer_addr == DDCCI_DEFAULT_DEVICE_ADDR) ? "ddcci" : "ddcci-dependent", I2C_NAME_SIZE); -+ -+ return 0; -+} -+ -+/* I2C probe function */ -+static int ddcci_probe(struct i2c_client *client) -+{ -+ const struct i2c_device_id *id = i2c_client_get_device_id(client); -+ int i, ret = -ENODEV, tmp; -+ unsigned char main_addr, addr; -+ struct ddcci_bus_drv_data *drv_data; -+ -+ /* Initialize driver data structure */ -+ drv_data = devm_kzalloc(&client->dev, sizeof(struct ddcci_bus_drv_data), GFP_KERNEL); -+ if (!drv_data) -+ return -ENOMEM; -+ drv_data->i2c_dev = client; -+ sema_init(&drv_data->sem, 1); -+ -+ /* Set i2c client data */ -+ i2c_set_clientdata(client, drv_data); -+ -+ if (id->driver_data == 0) { -+ /* Core device, probe at 0x6E */ -+ main_addr = DDCCI_DEFAULT_DEVICE_ADDR; -+ dev_dbg(&client->dev, "probing core device [%02x]\n", -+ client->addr << 1); -+ ret = ddcci_detect_device(client, main_addr, 0); -+ if (ret) { -+ dev_info(&client->dev, "core device [%02x] probe failed: %d\n", -+ client->addr << 1, ret); -+ if (ret == -EIO) -+ ret = -ENODEV; -+ goto err_free; -+ } -+ -+ /* Detect internal dependent devices */ -+ dev_dbg(&client->dev, "probing internal dependent devices\n"); -+ for (i = 0; i < autoprobe_addr_count; ++i) { -+ addr = (unsigned short)autoprobe_addrs[i]; -+ if ((addr & 1) == 0 && addr != main_addr) { -+ tmp = ddcci_detect_device(client, addr, main_addr); -+ if (tmp < 0 && tmp != -ENODEV) { -+ dev_info(&client->dev, "internal dependent device [%02x:%02x] probe failed: %d\n", -+ client->addr << 1, addr, ret); -+ } -+ } -+ } -+ } else if (id->driver_data == 1) { -+ /* External dependent device */ -+ main_addr = client->addr << 1; -+ dev_dbg(&client->dev, "probing external dependent device [%02x]\n", main_addr); -+ ret = ddcci_detect_device(client, main_addr, -1); -+ if (ret) { -+ dev_info(&client->dev, "external dependent device [%02x] probe failed: %d\n", -+ main_addr, ret); -+ if (ret == -EIO) -+ ret = -ENODEV; -+ goto err_free; -+ } -+ } else { -+ dev_warn(&client->dev, -+ "probe() called with invalid i2c device id\n"); -+ ret = -EINVAL; -+ } -+ -+ goto end; -+err_free: -+ devm_kfree(&client->dev, drv_data); -+end: -+ return ret; -+} -+ 
-+/* -+ * Callback for bus_find_device() used in ddcci_remove() -+ * -+ * Find next device on i2c_client not flagged with -+ * DDCCI_FLAG_REMOVED and flag it. -+ */ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,3,0) -+static int ddcci_remove_helper(struct device *dev, const void *p) -+#else -+static int ddcci_remove_helper(struct device *dev, void *p) -+#endif -+{ -+ struct ddcci_device *device; -+ -+ device = ddcci_verify_device(dev); -+ if (!device || device->flags & DDCCI_FLAG_REMOVED) -+ return 0; -+ -+ if (!p || (dev->parent == p)) { -+ device->flags |= DDCCI_FLAG_REMOVED; -+ wmb(); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+/* I2C driver remove callback: unregister all subdevices */ -+static int ddcci_remove(struct i2c_client *client) -+{ -+ struct ddcci_bus_drv_data *drv_data = i2c_get_clientdata(client); -+ struct device *dev; -+ -+ down(&drv_data->sem); -+ while (1) { -+ dev = bus_find_device(&ddcci_bus_type, NULL, client, -+ ddcci_remove_helper); -+ if (!dev) -+ break; -+ device_unregister(dev); -+ put_device(dev); -+ } -+ up(&drv_data->sem); -+ return 0; -+} -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) -+static void ddcci_remove_void(struct i2c_client *client) -+{ -+ ddcci_remove(client); -+} -+#endif -+ -+/* -+ * I2C driver device identification table. -+ */ -+static const struct i2c_device_id ddcci_idtable[] = { -+ { "ddcci", 0 }, -+ { "ddcci-dependent", 1 }, -+ {} -+}; -+MODULE_DEVICE_TABLE(i2c, ddcci_idtable); -+ -+/* -+ * I2C driver description structure -+ */ -+static struct i2c_driver ddcci_driver = { -+ .driver = { -+ .name = "ddcci", -+ .owner = THIS_MODULE, -+ }, -+ -+ .id_table = ddcci_idtable, -+ .probe = ddcci_probe, -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(6, 1, 0) -+ .remove = ddcci_remove_void, -+#else -+ .remove = ddcci_remove, -+#endif -+ .class = I2C_CLASS_DDC, -+ .detect = ddcci_detect, -+ .address_list = I2C_ADDRS( -+ DDCCI_DEFAULT_DEVICE_ADDR>>1 -+ ), -+}; -+ -+/* -+ * Module initialization function. Called when the module is inserted or -+ * (if builtin) at boot time. -+ */ -+static int __init ddcci_module_init(void) -+{ -+ int ret; -+ -+ pr_debug("initializing ddcci driver\n"); -+ /* Allocate a device number region for the character devices */ -+ ret = alloc_chrdev_region(&ddcci_cdev_first, 0, 128, DEVICE_NAME); -+ if (ret < 0) { -+ pr_err("failed to register device region: error %d\n", ret); -+ goto err_chrdevreg; -+ } -+ ddcci_cdev_next = ddcci_cdev_first; -+ ddcci_cdev_end = MKDEV(MAJOR(ddcci_cdev_first), MINOR(ddcci_cdev_first)+128); -+ -+ /* Register bus */ -+ ret = bus_register(&ddcci_bus_type); -+ if (ret) { -+ pr_err("failed to register bus 'ddcci'\n"); -+ goto err_busreg; -+ } -+ ddcci_bus_registered = true; -+ -+ /* Register I2C driver */ -+ ret = i2c_add_driver(&ddcci_driver); -+ if (ret) { -+ pr_err("failed to register i2c driver\n"); -+ goto err_drvreg; -+ } -+ -+ pr_debug("ddcci driver initialized\n"); -+ -+ return 0; -+ -+err_drvreg: -+ bus_unregister(&ddcci_bus_type); -+err_busreg: -+ unregister_chrdev_region(ddcci_cdev_first, 128); -+err_chrdevreg: -+ return ret; -+} -+ -+/* -+ * Module clean-up function. Called when the module is removed. 
-+ */
-+static void __exit ddcci_module_exit(void)
-+{
-+ struct device *dev;
-+
-+ while (1) {
-+ dev = bus_find_device(&ddcci_bus_type, NULL, NULL, ddcci_remove_helper);
-+ if (!dev)
-+ break;
-+ device_unregister(dev);
-+ put_device(dev);
-+ }
-+
-+ i2c_del_driver(&ddcci_driver);
-+ bus_unregister(&ddcci_bus_type);
-+ unregister_chrdev_region(ddcci_cdev_first, 128);
-+}
-+
-+/* Let the kernel know the calls for module init and exit */
-+module_init(ddcci_module_init);
-+module_exit(ddcci_module_exit);
-+
-+/* Module parameter description */
-+module_param(delay, uint, S_IRUGO|S_IWUSR);
-+MODULE_PARM_DESC(delay, "default delay after bus writes (in ms, default 60)");
-+module_param_array(autoprobe_addrs, ushort, &autoprobe_addr_count, S_IRUGO|S_IWUSR);
-+MODULE_PARM_DESC(autoprobe_addrs, "internal dependent device addresses to autoprobe");
-+
-+/* Module description */
-+MODULE_AUTHOR("Christoph Grenz");
-+MODULE_DESCRIPTION("DDC/CI bus driver");
-+MODULE_VERSION("0.4.2");
-+MODULE_LICENSE("GPL");
-diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig
-index 51387b1ef012..4b8bfd7c02c6 100644
---- a/drivers/video/backlight/Kconfig
-+++ b/drivers/video/backlight/Kconfig
-@@ -297,6 +297,17 @@ config BACKLIGHT_QCOM_WLED
- If you have the Qualcomm PMIC, say Y to enable a driver for the
- WLED block. Currently it supports PM8941 and PMI8998.
-
-+config BACKLIGHT_DDCCI
-+ tristate "DDCCI Backlight Driver"
-+ depends on DDCCI
-+ help
-+ If you have a DDC/CI supporting monitor, say Y to enable a driver
-+ to control its backlight using DDC/CI. This could be useful if
-+ your monitor does not include a backlight driver. For this to be
-+ useful you need to enable DDCCI support, which can be found in
-+ Device Drivers -> Character devices and which further depends on
-+ I2C.
-+
- config BACKLIGHT_RT4831
- tristate "Richtek RT4831 Backlight Driver"
- depends on MFD_RT4831
-diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile
-index f72e1c3c59e9..656dea21c0ee 100644
---- a/drivers/video/backlight/Makefile
-+++ b/drivers/video/backlight/Makefile
-@@ -58,3 +58,4 @@ obj-$(CONFIG_BACKLIGHT_WM831X) += wm831x_bl.o
- obj-$(CONFIG_BACKLIGHT_ARCXCNN) += arcxcnn_bl.o
- obj-$(CONFIG_BACKLIGHT_RAVE_SP) += rave-sp-backlight.o
- obj-$(CONFIG_BACKLIGHT_LED) += led_bl.o
-+obj-$(CONFIG_BACKLIGHT_DDCCI) += ddcci-backlight.o
-diff --git a/drivers/video/backlight/ddcci-backlight.c b/drivers/video/backlight/ddcci-backlight.c
-new file mode 100644
-index 000000000000..7a9852207f0b
---- /dev/null
-+++ b/drivers/video/backlight/ddcci-backlight.c
-@@ -0,0 +1,413 @@
-+/*
-+ * DDC/CI monitor backlight driver
-+ *
-+ * Copyright (c) 2015 Christoph Grenz
-+ */
-+
-+/*
-+ * This program is free software; you can redistribute it and/or modify it
-+ * under the terms of the GNU General Public License as published by the Free
-+ * Software Foundation; either version 2 of the License, or (at your option)
-+ * any later version. 
-+ */ -+ -+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -+#include -+#include -+#include -+#include -+ -+#include -+ -+ -+#define DDCCI_COMMAND_READ 0x01 /* read ctrl value */ -+#define DDCCI_REPLY_READ 0x02 /* read ctrl value reply */ -+#define DDCCI_COMMAND_WRITE 0x03 /* write ctrl value */ -+#define DDCCI_COMMAND_SAVE 0x0c /* save current settings */ -+ -+#define DDCCI_MONITOR_LUMINANCE 0x10 -+#define DDCCI_MONITOR_BACKLIGHT 0x13 -+#define DDCCI_MONITOR_BL_WHITE 0x6B -+ -+static bool convenience_symlink = true; -+ -+struct ddcci_monitor_drv_data { -+ struct ddcci_device *device; -+ struct backlight_device *bl_dev; -+ struct device *fb_dev; -+ unsigned char used_vcp; -+}; -+ -+static int ddcci_monitor_writectrl(struct ddcci_device *device, -+ unsigned char ctrl, unsigned short value) -+{ -+ unsigned char buf[4]; -+ int ret; -+ -+ buf[0] = DDCCI_COMMAND_WRITE; -+ buf[1] = ctrl; -+ buf[2] = (value >> 8); -+ buf[3] = (value & 255); -+ -+ ret = ddcci_device_write(device, true, buf, sizeof(buf)); -+ -+ return ret; -+} -+ -+static int ddcci_monitor_readctrl(struct ddcci_device *device, -+ unsigned char ctrl, unsigned short *value, -+ unsigned short *maximum) -+{ -+ int ret; -+ unsigned char buf[10]; -+ -+ buf[0] = DDCCI_COMMAND_READ; -+ buf[1] = ctrl; -+ -+ ret = ddcci_device_writeread(device, true, buf, 2, sizeof(buf)); -+ if (ret < 0) -+ return ret; -+ -+ if (ret == 0) -+ return -ENOTSUPP; -+ -+ if (ret == 8 && buf[0] == DDCCI_REPLY_READ && buf[2] == ctrl) { -+ if (value) -+ *value = buf[6] * 256 + buf[7]; -+ -+ if (maximum) -+ *maximum = buf[4] * 256 + buf[5]; -+ -+ if (buf[1] == 1) -+ return -ENOTSUPP; -+ if (buf[1] != 0) -+ return -EIO; -+ return 0; -+ } -+ -+ return -EIO; -+} -+ -+static int ddcci_backlight_check_fb(struct backlight_device *bl, -+ struct fb_info *info) -+{ -+ struct ddcci_monitor_drv_data *drv_data = bl_get_data(bl); -+ -+ return drv_data->fb_dev == NULL || drv_data->fb_dev == info->dev; -+} -+ -+static int ddcci_backlight_update_status(struct backlight_device *bl) -+{ -+ struct ddcci_monitor_drv_data *drv_data = bl_get_data(bl); -+ int brightness = bl->props.brightness; -+ int ret; -+ -+ if (bl->props.power != FB_BLANK_UNBLANK || -+ bl->props.state & BL_CORE_FBBLANK) -+ brightness = 0; -+ -+ ret = ddcci_monitor_writectrl(drv_data->device, drv_data->used_vcp, -+ brightness); -+ if (ret > 0) -+ ret = 0; -+ return ret; -+} -+ -+static int ddcci_backlight_get_brightness(struct backlight_device *bl) -+{ -+ unsigned short value = 0, maxval = 0; -+ int ret; -+ struct ddcci_monitor_drv_data *drv_data = bl_get_data(bl); -+ -+ ret = ddcci_monitor_readctrl(drv_data->device, drv_data->used_vcp, -+ &value, &maxval); -+ if (ret < 0) -+ return ret; -+ -+ bl->props.brightness = value; -+ bl->props.max_brightness = maxval; -+ ret = value; -+ -+ return ret; -+} -+ -+static const struct backlight_ops ddcci_backlight_ops = { -+ .options = 0, -+ .update_status = ddcci_backlight_update_status, -+ .get_brightness = ddcci_backlight_get_brightness, -+ .check_fb = ddcci_backlight_check_fb, -+}; -+ -+static const char *ddcci_monitor_vcp_name(unsigned char vcp) -+{ -+ switch (vcp) { -+ case DDCCI_MONITOR_BL_WHITE: -+ return "backlight"; -+ case DDCCI_MONITOR_LUMINANCE: -+ return "luminance"; -+ default: -+ return "???"; -+ } -+} -+ -+static const char *ddcci_monitor_next_vcp_item(const char *ptr) -+{ -+ int depth = 0; -+ -+ /* Sanity check */ -+ if (unlikely(ptr == NULL || ptr[0] == '\0')) -+ return NULL; -+ -+ /* Find next white space outside of parentheses */ -+ while ((ptr = strpbrk(ptr, " 
()"))) { -+ if (!ptr || depth == INT_MAX) { -+ return NULL; -+ } else if (*ptr == '(') { -+ depth++; -+ } else if (depth > 0) { -+ if (*ptr == ')') -+ depth--; -+ } else { -+ break; -+ } -+ ++ptr; -+ } -+ -+ /* Skip over whitespace */ -+ ptr = skip_spaces(ptr); -+ -+ /* Check if we're now at the end of the list */ -+ if (unlikely(*ptr == '\0' || *ptr == ')')) -+ return NULL; -+ -+ return ptr; -+} -+ -+static bool ddcci_monitor_find_vcp(unsigned char vcp, const char *s) -+{ -+ const char *ptr = s; -+ char vcp_hex[3]; -+ -+ /* Sanity check */ -+ if (unlikely(s == NULL || s[0] == '\0')) -+ return false; -+ -+ /* Create hex representation of VCP */ -+ if (unlikely(snprintf(vcp_hex, 3, "%02hhX", vcp) != 2)) { -+ pr_err("snprintf failed to convert to hex. This should not happen.\n"); -+ return false; -+ } -+ -+ /* Search for it */ -+ do { -+ if (strncasecmp(vcp_hex, ptr, 2) == 0) { -+ if (ptr[2] == ' ' || ptr[2] == '(' || ptr[2] == ')') { -+ return true; -+ } -+ } -+ } while ((ptr = ddcci_monitor_next_vcp_item(ptr))); -+ -+ return false; -+} -+ -+static int ddcci_backlight_create_symlink(struct ddcci_device *ddcci_dev) -+{ -+ int i, result; -+ struct device *dev = &ddcci_dev->dev; -+ struct kernfs_node *dirent; -+ for (i = 0; i < 3; ++i) { -+ dev = dev->parent; -+ if (!dev) { -+ dev_dbg(&ddcci_dev->dev, "failed to create convenience symlink: ancestor device not found\n"); -+ return -ENOENT; -+ } -+ } -+ dirent = sysfs_get_dirent(dev->kobj.sd, "ddcci_backlight"); -+ if (dirent) { -+ sysfs_put(dirent); -+ dev_dbg(&ddcci_dev->dev, "failed to create convenience symlink: %s/ddcci_backlight already exists\n", dev_name(dev)); -+ return -EEXIST; -+ } -+ -+ result = sysfs_create_link(&dev->kobj, &ddcci_dev->dev.kobj, "ddcci_backlight"); -+ if (result == 0) { -+ dev_dbg(&ddcci_dev->dev, "created symlink %s/ddcci_backlight\n", dev_name(dev)); -+ } else { -+ dev_info(&ddcci_dev->dev, "failed to create convenience symlink: %d\n", result); -+ } -+ return result; -+} -+ -+static int ddcci_backlight_remove_symlink(struct ddcci_device *ddcci_dev) -+{ -+ int i; -+ struct device *dev = &ddcci_dev->dev; -+ struct kernfs_node *dirent; -+ for (i = 0; i < 3; ++i) { -+ dev = dev->parent; -+ if (!dev) -+ return -ENOENT; -+ } -+ dirent = sysfs_get_dirent(dev->kobj.sd, "ddcci_backlight"); -+ if (!dirent) { -+ return -ENOENT; -+ } -+ -+ if ((dirent->flags & KERNFS_LINK) == 0) { -+ sysfs_put(dirent); -+ dev_dbg(&ddcci_dev->dev, "won't remove %s/ddcci_backlight: not a symlink\n", dev_name(dev)); -+ return -EINVAL; -+ } -+ -+ if (dirent->symlink.target_kn != ddcci_dev->dev.kobj.sd) { -+ sysfs_put(dirent); -+ dev_dbg(&ddcci_dev->dev, "won't remove %s/ddcci_backlight: we are not the link target\n", dev_name(dev)); -+ return -EINVAL; -+ } -+ -+ sysfs_put(dirent); -+ -+ sysfs_remove_link(&dev->kobj, "ddcci_backlight"); -+ dev_dbg(&ddcci_dev->dev, "removed symlink %s/ddcci_backlight\n", dev_name(dev)); -+ return 0; -+} -+ -+static int ddcci_monitor_probe(struct ddcci_device *dev, -+ const struct ddcci_device_id *id) -+{ -+ struct ddcci_monitor_drv_data *drv_data; -+ struct backlight_properties props; -+ struct backlight_device *bl = NULL; -+ int ret = 0; -+ bool support_luminance, support_bl_white; -+ unsigned short brightness = 0, max_brightness = 0; -+ const char *vcps; -+ -+ dev_dbg(&dev->dev, "probing monitor backlight device\n"); -+ -+ /* Get VCP list */ -+ vcps = ddcci_find_capstr_item(dev->capabilities, "vcp", NULL); -+ if (IS_ERR(vcps)) { -+ dev_info(&dev->dev, -+ "monitor doesn't provide a list of supported controls.\n"); 
-+ support_bl_white = support_luminance = true;
-+ } else {
-+ /* Check VCP list for supported VCPs */
-+ support_bl_white = ddcci_monitor_find_vcp(DDCCI_MONITOR_BL_WHITE, vcps);
-+ support_luminance = ddcci_monitor_find_vcp(DDCCI_MONITOR_LUMINANCE, vcps);
-+ /* Fall back to trying if no support is found */
-+ if (!support_bl_white && !support_luminance) {
-+ dev_info(&dev->dev,
-+ "monitor doesn't announce support for backlight or luminance controls.\n");
-+ support_bl_white = support_luminance = true;
-+ }
-+ }
-+
-+ /* Initialize driver data structure */
-+ drv_data = devm_kzalloc(&dev->dev, sizeof(struct ddcci_monitor_drv_data),
-+ GFP_KERNEL);
-+ if (!drv_data)
-+ return -ENOMEM;
-+ drv_data->device = dev;
-+
-+ if (support_bl_white) {
-+ /* Try getting backlight level */
-+ dev_dbg(&dev->dev,
-+ "trying to access \"backlight level white\" control\n");
-+ ret = ddcci_monitor_readctrl(drv_data->device, DDCCI_MONITOR_BL_WHITE,
-+ &brightness, &max_brightness);
-+ if (ret < 0) {
-+ if (ret == -ENOTSUPP)
-+ dev_info(&dev->dev,
-+ "monitor does not support reading backlight level\n");
-+ else
-+ goto err_free;
-+ } else {
-+ drv_data->used_vcp = DDCCI_MONITOR_BL_WHITE;
-+ }
-+ }
-+
-+ if (support_luminance && !drv_data->used_vcp) {
-+ /* Try getting luminance */
-+ dev_dbg(&dev->dev,
-+ "trying to access \"luminance\" control\n");
-+ ret = ddcci_monitor_readctrl(drv_data->device, DDCCI_MONITOR_LUMINANCE,
-+ &brightness, &max_brightness);
-+ if (ret < 0) {
-+ if (ret == -ENOTSUPP)
-+ dev_info(&dev->dev,
-+ "monitor does not support reading luminance\n");
-+ else
-+ goto err_free;
-+ } else {
-+ drv_data->used_vcp = DDCCI_MONITOR_LUMINANCE;
-+ }
-+ }
-+
-+ if (!drv_data->used_vcp)
-+ goto err_free;
-+
-+ /* Create brightness device */
-+ memset(&props, 0, sizeof(props));
-+ props.type = BACKLIGHT_RAW;
-+ props.max_brightness = max_brightness;
-+ props.brightness = brightness;
-+ bl = devm_backlight_device_register(&dev->dev, dev_name(&dev->dev),
-+ &dev->dev, drv_data,
-+ &ddcci_backlight_ops, &props);
-+ drv_data->bl_dev = bl;
-+ if (IS_ERR(bl)) {
-+ dev_err(&dev->dev, "failed to register backlight\n");
-+ return PTR_ERR(bl);
-+ }
-+ dev_info(&dev->dev, "registered %s as backlight device %s\n",
-+ ddcci_monitor_vcp_name(drv_data->used_vcp),
-+ dev_name(&dev->dev));
-+
-+ if (convenience_symlink) {
-+ ddcci_backlight_create_symlink(dev);
-+ }
-+
-+ goto end;
-+err_free:
-+ devm_kfree(&dev->dev, drv_data);
-+end:
-+ return ret;
-+}
-+
-+static int ddcci_monitor_remove(struct ddcci_device *dev)
-+{
-+ dev_dbg(&dev->dev, "removing device\n");
-+ ddcci_backlight_remove_symlink(dev);
-+ return 0;
-+}
-+
-+static struct ddcci_device_id ddcci_monitor_idtable[] = {
-+ { "monitor", DDCCI_ANY_ID, DDCCI_ANY_ID, DDCCI_ANY_ID, DDCCI_ANY_ID, 0 },
-+ {}
-+};
-+
-+static struct ddcci_driver ddcci_backlight_driver = {
-+ .driver = {
-+ .name = "ddcci-backlight",
-+ .owner = THIS_MODULE,
-+ },
-+
-+ .id_table = ddcci_monitor_idtable,
-+ .probe = ddcci_monitor_probe,
-+ .remove = ddcci_monitor_remove,
-+};
-+
-+module_ddcci_driver(ddcci_backlight_driver);
-+
-+/* Module parameter description */
-+module_param(convenience_symlink, bool, S_IRUGO|S_IWUSR);
-+MODULE_PARM_DESC(convenience_symlink, "add convenience symlink \"ddcci_backlight\" to ancestor device in sysfs (default true)");
-+
-+MODULE_AUTHOR("Christoph Grenz");
-+MODULE_DESCRIPTION("DDC/CI generic monitor backlight driver");
-+MODULE_VERSION("0.4.2");
-+MODULE_LICENSE("GPL");
-+ 
-+MODULE_ALIAS("ddcci:monitor-*-*-*-*"); -diff --git a/include/linux/ddcci.h b/include/linux/ddcci.h -new file mode 100644 -index 000000000000..a219f031e584 ---- /dev/null -+++ b/include/linux/ddcci.h -@@ -0,0 +1,164 @@ -+/* -+ * DDC/CI bus driver -+ * -+ * Copyright (c) 2015 Christoph Grenz -+ */ -+ -+/* -+ * This program is free software; you can redistribute it and/or modify it -+ * under the terms of the GNU General Public License as published by the Free -+ * Software Foundation; either version 2 of the License, or (at your option) -+ * any later version. -+ */ -+ -+#ifndef _DDCCI_H -+#define _DDCCI_H -+ -+#include -+#include -+#include -+ -+#define DDCCI_MODULE_PREFIX "ddcci:" -+ -+/* Special addresses */ -+ -+/* default device address (even) */ -+#define DDCCI_DEFAULT_DEVICE_ADDR 0x6E -+/* receiving host address for communication with default device address */ -+#define DDCCI_HOST_ADDR_EVEN 0x50 -+/* sending host address for communication with default device address */ -+#define DDCCI_HOST_ADDR_ODD 0x51 -+ -+/* Command codes */ -+ -+/* Identification Request */ -+#define DDCCI_COMMAND_ID 0xf1 -+/* Identification Reply */ -+#define DDCCI_REPLY_ID 0xe1 -+/* Capabilities Request */ -+#define DDCCI_COMMAND_CAPS 0xf3 -+/* Capabilities Reply */ -+#define DDCCI_REPLY_CAPS 0xe3 -+ -+/* Quirks */ -+ -+/* Device always responds with unset protocol flag */ -+#define DDCCI_QUIRK_NO_PFLAG BIT(1) -+/* Device needs writing one byte at a time */ -+#define DDCCI_QUIRK_WRITE_BYTEWISE BIT(2) -+/* Device repeats first byte on read */ -+#define DDCCI_QUIRK_SKIP_FIRST_BYTE BIT(3) -+ -+/* Flags */ -+ -+#define DDCCI_FLAG_REMOVED BIT(1) -+#define DDCCI_FLAG_DEPENDENT BIT(2) -+#define DDCCI_FLAG_EXTERNAL BIT(3) -+ -+extern struct bus_type ddcci_bus_type; -+ -+struct ddcci_bus_drv_data; -+ -+/* struct ddcci_device_id - identifies DDC/CI devices for probing */ -+struct ddcci_device_id { -+ char prot[9]; -+ char type[9]; -+ char model[9]; -+ char vendor[9]; -+ char module[9]; -+ kernel_ulong_t driver_data; /* Data private to the driver */ -+}; -+#define DDCCI_ANY_ID "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF" -+ -+/** -+ * struct ddcci_device - represent an DDC/CI device -+ * @outer_addr: Outer device address (I2C address << 1). -+ * @inner_addr: Inner device address. -+ * @flags: Device flags. -+ * @capabilities: Device capability string. -+ * @capabilities_len: Length of capability string. -+ * @i2c_client: Parent I2C device. -+ * @bus_drv_data: Driver internal data structure. -+ * @dev: Driver model device node for the slave. -+ * @cdev: Character device structure -+ * @cdev_sem: RW semaphore for exclusive access on character device. 
-+ * @prot: Device class ("protocol", from capability string) -+ * @type: Device subclass ("type", from capability string) -+ * @model: Device model (from capability string) -+ * @vendor: Device vendor (from identification command response) -+ * @module: Device module (from identification command response) -+ * @device_number: Device serial (from identification command response) -+ */ -+struct ddcci_device { -+ unsigned short outer_addr; -+ unsigned short inner_addr; -+ int flags; -+ char *capabilities; -+ size_t capabilities_len; -+ struct i2c_client *i2c_client; -+ struct ddcci_bus_drv_data *bus_drv_data; -+ struct device dev; -+ struct cdev cdev; -+ struct rw_semaphore cdev_sem; -+ char prot[9]; -+ char type[9]; -+ char model[9]; -+ char vendor[9]; -+ char module[9]; -+ int device_number; -+}; -+#define to_ddcci_device(d) container_of(d, struct ddcci_device, dev) -+ -+/** -+ * struct ddcci_driver - represent an DDC/CI device driver -+ * @probe: Callback for device binding -+ * @remove: Callback for device unbinding -+ * @driver: Device driver model driver -+ * @id_table: List of DDC/CI devices supported by this driver -+ * -+ * The driver.owner field should be set to the module owner of this driver. -+ * The driver.name field should be set to the name of this driver. -+ */ -+struct ddcci_driver { -+ int (*probe)(struct ddcci_device *, const struct ddcci_device_id *); -+ int (*remove)(struct ddcci_device *); -+ struct device_driver driver; -+ struct ddcci_device_id *id_table; -+}; -+#define to_ddcci_driver(d) container_of(d, struct ddcci_driver, driver) -+ -+int ddcci_register_driver(struct module *owner, struct ddcci_driver *driver); -+#define ddcci_add_driver(driver) \ -+ ddcci_register_driver(THIS_MODULE, driver) -+void ddcci_del_driver(struct ddcci_driver *driver); -+ -+struct ddcci_device *ddcci_verify_device(struct device *dev); -+ -+#define module_ddcci_driver(__ddcci_driver) \ -+ module_driver(__ddcci_driver, ddcci_add_driver, \ -+ ddcci_del_driver) -+ -+int ddcci_device_write(struct ddcci_device *, bool p_flag, unsigned char *data, -+ unsigned char length); -+int ddcci_device_read(struct ddcci_device *, bool p_flag, unsigned char *buffer, -+ unsigned char length); -+int ddcci_device_writeread(struct ddcci_device *, bool p_flag, -+ unsigned char *buffer, unsigned char length, -+ unsigned char maxlength); -+ -+static inline void *ddcci_get_drvdata(const struct ddcci_device *dev) -+{ -+ return dev_get_drvdata(&dev->dev); -+} -+ -+static inline void ddcci_set_drvdata(struct ddcci_device *dev, void *data) -+{ -+ dev_set_drvdata(&dev->dev, data); -+} -+ -+unsigned long ddcci_quirks(struct ddcci_device *dev); -+ -+const char *ddcci_find_capstr_item(const char *capabilities, const char *tag, -+ size_t *length); -+ -+#endif --- -2.41.0 - -From 87a0f4f9c6f362b5fb226e811b018ec597ee9ee7 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Sat, 1 Jul 2023 15:11:45 +0200 -Subject: [PATCH 4/8] fixes +Date: Tue, 11 Jul 2023 19:25:14 +0200 +Subject: [PATCH 3/7] fixes Signed-off-by: Peter Jung --- @@ -12472,9 +9635,10 @@ Signed-off-by: Peter Jung lib/decompress_unxz.c | 2 + lib/decompress_unzstd.c | 2 + mm/mmap.c | 9 +- + net/netfilter/nf_tables_api.c | 2 + scripts/Makefile.vmlinux_o | 2 +- sound/pci/hda/cs35l41_hda.c | 2 +- - 16 files changed, 1494 insertions(+), 11 deletions(-) + 17 files changed, 1496 insertions(+), 11 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev create mode 100644 Documentation/leds/ledtrig-blkdev.rst create mode 100644 
drivers/leds/trigger/ledtrig-blkdev.c @@ -14102,10 +11266,10 @@ index a512b99ae16a..bba2c0bb10cb 100644 #include diff --git a/mm/mmap.c b/mm/mmap.c -index bc510361acec..d3d939d8410a 100644 +index 30bf7772d4ac..5c5a917b261e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c -@@ -2476,7 +2476,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, +@@ -2480,7 +2480,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, } vma_start_write(next); mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1); @@ -14115,7 +11279,7 @@ index bc510361acec..d3d939d8410a 100644 goto munmap_gather_failed; vma_mark_detached(next, true); if (next->vm_flags & VM_LOCKED) -@@ -2525,12 +2526,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, +@@ -2529,12 +2530,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, BUG_ON(count != test_count); } #endif @@ -14131,6 +11295,19 @@ index bc510361acec..d3d939d8410a 100644 mm->locked_vm -= locked_vm; mm->map_count -= count; /* +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 4c7937fd803f..1d64c163076a 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -5343,6 +5343,8 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, + nft_set_trans_unbind(ctx, set); + if (nft_set_is_anonymous(set)) + nft_deactivate_next(ctx->net, set); ++ else ++ list_del_rcu(&binding->list); + + set->use--; + break; diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index 0edfdb40364b..ae52d3b3f063 100644 --- a/scripts/Makefile.vmlinux_o @@ -14160,10 +11337,10 @@ index b5210abb5141..4d8936e1f769 100644 -- 2.41.0 -From d6acc3c1d3c68f7b733f4927dbd37ccb42da1d72 Mon Sep 17 00:00:00 2001 +From cf10d13e4dc6b189366dd15713486e50d71aa718 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Thu, 1 Jun 2023 16:35:38 +0200 -Subject: [PATCH 5/8] ksm +Date: Tue, 11 Jul 2023 19:25:28 +0200 +Subject: [PATCH 4/7] ksm Signed-off-by: Peter Jung --- @@ -14612,10 +11789,10 @@ index 860b2dcf3ac4..96fe36a6d0f5 100644 -- 2.41.0 -From 4a9bbe5211278e439c56852a36415d34e406362d Mon Sep 17 00:00:00 2001 +From 6118edead4a8f108c5069f9629816f4d52b2a131 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 26 Jun 2023 17:01:13 +0200 -Subject: [PATCH 6/8] kvm-lru +Date: Tue, 11 Jul 2023 19:25:38 +0200 +Subject: [PATCH 5/7] kvm-lru Signed-off-by: Peter Jung --- @@ -16060,26 +13237,31 @@ index 65f94f592ff8..9db05880b6b9 100644 -- 2.41.0 -From cc7bb29d63840bc98e06aff203fab5d2bf9f0691 Mon Sep 17 00:00:00 2001 +From a87c6bebcb1e942bbc824d451e0a93efb954116c Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Thu, 1 Jun 2023 16:35:57 +0200 -Subject: [PATCH 7/8] sched +Date: Tue, 11 Jul 2023 19:25:53 +0200 +Subject: [PATCH 6/7] sched Signed-off-by: Peter Jung --- - arch/x86/kernel/itmt.c | 23 +-- - arch/x86/kernel/smpboot.c | 4 +- + arch/x86/kernel/itmt.c | 23 +- + arch/x86/kernel/smpboot.c | 7 +- + include/linux/cgroup-defs.h | 2 + + include/linux/sched.h | 2 + include/linux/sched/sd_flags.h | 5 +- - kernel/cgroup/cgroup.c | 12 ++ - kernel/sched/core.c | 59 ++++++-- + include/linux/sched/task.h | 38 ++- + kernel/cgroup/cgroup.c | 46 ++++ + kernel/fork.c | 8 + + kernel/sched/core.c | 116 +++++++++- kernel/sched/deadline.c | 7 - - kernel/sched/debug.c | 2 +- - kernel/sched/fair.c | 255 +++++++++++++++++++-------------- + kernel/sched/debug.c | 3 +- + kernel/sched/fair.c | 409 ++++++++++++++++++++++++--------- kernel/sched/features.h | 1 + - 
kernel/sched/psi.c | 19 +-- - kernel/sched/sched.h | 29 ++-- - kernel/sched/topology.c | 7 +- - 12 files changed, 253 insertions(+), 170 deletions(-) + kernel/sched/psi.c | 21 +- + kernel/sched/sched.h | 32 ++- + kernel/sched/topology.c | 21 +- + kernel/softirq.c | 2 +- + 17 files changed, 568 insertions(+), 175 deletions(-) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 670eb08b972a..ee4fe8cdb857 100644 @@ -16124,7 +13306,7 @@ index 670eb08b972a..ee4fe8cdb857 100644 + per_cpu(sched_core_priority, cpu) = prio; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c -index 483df0427678..bc64fd5ca69e 100644 +index 483df0427678..a81f2b0dbbad 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -571,7 +571,7 @@ static int x86_core_flags(void) @@ -16136,7 +13318,13 @@ index 483df0427678..bc64fd5ca69e 100644 } #endif #ifdef CONFIG_SCHED_CLUSTER -@@ -602,7 +602,7 @@ static struct sched_domain_topology_level x86_hybrid_topology[] = { +@@ -599,10 +599,13 @@ static struct sched_domain_topology_level x86_hybrid_topology[] = { + #ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, + #endif ++#ifdef CONFIG_SCHED_CLUSTER ++ { cpu_clustergroup_mask, x86_cluster_flags, SD_INIT_NAME(CLS) }, ++#endif #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif @@ -16145,6 +13333,35 @@ index 483df0427678..bc64fd5ca69e 100644 { NULL, }, }; +diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h +index 8a0d5466c7be..ae20dbb885d6 100644 +--- a/include/linux/cgroup-defs.h ++++ b/include/linux/cgroup-defs.h +@@ -661,6 +661,8 @@ struct cgroup_subsys { + void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); + int (*css_extra_stat_show)(struct seq_file *seq, + struct cgroup_subsys_state *css); ++ int (*css_local_stat_show)(struct seq_file *seq, ++ struct cgroup_subsys_state *css); + + int (*can_attach)(struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup_taskset *tset); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index eed5d65b8d1f..8473324705ca 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -2437,9 +2437,11 @@ extern void sched_core_free(struct task_struct *tsk); + extern void sched_core_fork(struct task_struct *p); + extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, + unsigned long uaddr); ++extern int sched_core_idle_cpu(int cpu); + #else + static inline void sched_core_free(struct task_struct *tsk) { } + static inline void sched_core_fork(struct task_struct *p) { } ++static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } + #endif + + extern void sched_set_stop_task(int cpu, struct task_struct *stop); diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index 57bde66d95f7..fad77b5172e2 100644 --- a/include/linux/sched/sd_flags.h @@ -16163,11 +13380,101 @@ index 57bde66d95f7..fad77b5172e2 100644 /* * Prefer to place tasks in a sibling domain +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index e0f5ac90a228..b53909027771 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -118,11 +118,47 @@ static inline struct task_struct *get_task_struct(struct task_struct *t) + } + + extern void __put_task_struct(struct task_struct *t); ++extern void __put_task_struct_rcu_cb(struct rcu_head *rhp); + + static inline void put_task_struct(struct task_struct *t) + { +- if (refcount_dec_and_test(&t->usage)) ++ if 
(!refcount_dec_and_test(&t->usage)) ++ return; ++ ++ /* ++ * In !RT, it is always safe to call __put_task_struct(). ++ * Under RT, we can only call it in preemptible context. ++ */ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) || preemptible()) { ++ static DEFINE_WAIT_OVERRIDE_MAP(put_task_map, LD_WAIT_SLEEP); ++ ++ lock_map_acquire_try(&put_task_map); + __put_task_struct(t); ++ lock_map_release(&put_task_map); ++ return; ++ } ++ ++ /* ++ * under PREEMPT_RT, we can't call put_task_struct ++ * in atomic context because it will indirectly ++ * acquire sleeping locks. ++ * ++ * call_rcu() will schedule delayed_put_task_struct_rcu() ++ * to be called in process context. ++ * ++ * __put_task_struct() is called when ++ * refcount_dec_and_test(&t->usage) succeeds. ++ * ++ * This means that it can't "conflict" with ++ * put_task_struct_rcu_user() which abuses ->rcu the same ++ * way; rcu_users has a reference so task->usage can't be ++ * zero after rcu_users 1 -> 0 transition. ++ * ++ * delayed_free_task() also uses ->rcu, but it is only called ++ * when it fails to fork a process. Therefore, there is no ++ * way it can conflict with put_task_struct(). ++ */ ++ call_rcu(&t->rcu, __put_task_struct_rcu_cb); + } + + static inline void put_task_struct_many(struct task_struct *t, int nr) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c -index 4d42f0cbc11e..8f917f682f52 100644 +index 4d42f0cbc11e..b782ae876c84 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c -@@ -3891,6 +3891,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, +@@ -3740,6 +3740,36 @@ static int cpu_stat_show(struct seq_file *seq, void *v) + return ret; + } + ++static int __maybe_unused cgroup_local_stat_show(struct seq_file *seq, ++ struct cgroup *cgrp, int ssid) ++{ ++ struct cgroup_subsys *ss = cgroup_subsys[ssid]; ++ struct cgroup_subsys_state *css; ++ int ret; ++ ++ if (!ss->css_local_stat_show) ++ return 0; ++ ++ css = cgroup_tryget_css(cgrp, ss); ++ if (!css) ++ return 0; ++ ++ ret = ss->css_local_stat_show(seq, css); ++ css_put(css); ++ return ret; ++} ++ ++static int cpu_local_stat_show(struct seq_file *seq, void *v) ++{ ++ struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; ++ int ret = 0; ++ ++#ifdef CONFIG_CGROUP_SCHED ++ ret = cgroup_local_stat_show(seq, cgrp, cpu_cgrp_id); ++#endif ++ return ret; ++} ++ + #ifdef CONFIG_PSI + static int cgroup_io_pressure_show(struct seq_file *seq, void *v) + { +@@ -3891,6 +3921,14 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } @@ -16182,7 +13489,18 @@ index 4d42f0cbc11e..8f917f682f52 100644 static void cgroup_pressure_release(struct kernfs_open_file *of) { struct cgroup_file_ctx *ctx = of->priv; -@@ -5290,6 +5298,7 @@ static struct cftype cgroup_psi_files[] = { +@@ -5282,6 +5320,10 @@ static struct cftype cgroup_base_files[] = { + .name = "cpu.stat", + .seq_show = cpu_stat_show, + }, ++ { ++ .name = "cpu.stat.local", ++ .seq_show = cpu_local_stat_show, ++ }, + { } /* terminate */ + }; + +@@ -5290,6 +5332,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "io.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]), @@ -16190,7 +13508,7 @@ index 4d42f0cbc11e..8f917f682f52 100644 .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, -@@ -5298,6 +5307,7 @@ static struct cftype cgroup_psi_files[] = { +@@ -5298,6 +5341,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "memory.pressure", 
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]), @@ -16198,7 +13516,7 @@ index 4d42f0cbc11e..8f917f682f52 100644 .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, -@@ -5306,6 +5316,7 @@ static struct cftype cgroup_psi_files[] = { +@@ -5306,6 +5350,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "cpu.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]), @@ -16206,7 +13524,7 @@ index 4d42f0cbc11e..8f917f682f52 100644 .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, -@@ -5315,6 +5326,7 @@ static struct cftype cgroup_psi_files[] = { +@@ -5315,6 +5360,7 @@ static struct cftype cgroup_psi_files[] = { { .name = "irq.pressure", .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]), @@ -16214,8 +13532,27 @@ index 4d42f0cbc11e..8f917f682f52 100644 .seq_show = cgroup_irq_pressure_show, .write = cgroup_irq_pressure_write, .poll = cgroup_pressure_poll, +diff --git a/kernel/fork.c b/kernel/fork.c +index f405763e06ae..47a1967b6a55 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -993,6 +993,14 @@ void __put_task_struct(struct task_struct *tsk) + } + EXPORT_SYMBOL_GPL(__put_task_struct); + ++void __put_task_struct_rcu_cb(struct rcu_head *rhp) ++{ ++ struct task_struct *task = container_of(rhp, struct task_struct, rcu); ++ ++ __put_task_struct(task); ++} ++EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb); ++ + void __init __weak arch_task_cache_init(void) { } + + /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index a68d1276bab0..bcb3a7e684ca 100644 +index a68d1276bab0..1b971c69d3a2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3341,6 +3341,39 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, @@ -16327,6 +13664,98 @@ index a68d1276bab0..bcb3a7e684ca 100644 } /* +@@ -7342,6 +7379,19 @@ struct task_struct *idle_task(int cpu) + return cpu_rq(cpu)->idle; + } + ++#ifdef CONFIG_SCHED_CORE ++int sched_core_idle_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (sched_core_enabled(rq) && rq->curr == rq->idle) ++ return 1; ++ ++ return idle_cpu(cpu); ++} ++ ++#endif ++ + #ifdef CONFIG_SMP + /* + * This function computes an effective utilization for the given CPU, to be +@@ -11103,6 +11153,27 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v) + + return 0; + } ++ ++static u64 throttled_time_self(struct task_group *tg) ++{ ++ int i; ++ u64 total = 0; ++ ++ for_each_possible_cpu(i) { ++ total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); ++ } ++ ++ return total; ++} ++ ++static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v) ++{ ++ struct task_group *tg = css_tg(seq_css(sf)); ++ ++ seq_printf(sf, "throttled_time %llu\n", throttled_time_self(tg)); ++ ++ return 0; ++} + #endif /* CONFIG_CFS_BANDWIDTH */ + #endif /* CONFIG_FAIR_GROUP_SCHED */ + +@@ -11179,6 +11250,10 @@ static struct cftype cpu_legacy_files[] = { + .name = "stat", + .seq_show = cpu_cfs_stat_show, + }, ++ { ++ .name = "stat.local", ++ .seq_show = cpu_cfs_local_stat_show, ++ }, + #endif + #ifdef CONFIG_RT_GROUP_SCHED + { +@@ -11235,6 +11310,24 @@ static int cpu_extra_stat_show(struct seq_file *sf, + return 0; + } + ++static int cpu_local_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++#ifdef CONFIG_CFS_BANDWIDTH ++ { ++ struct task_group *tg = css_tg(css); ++ u64 throttled_self_usec; ++ ++ throttled_self_usec = throttled_time_self(tg); ++ do_div(throttled_self_usec, NSEC_PER_USEC); ++ ++ 
seq_printf(sf, "throttled_usec %llu\n", ++ throttled_self_usec); ++ } ++#endif ++ return 0; ++} ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +@@ -11413,6 +11506,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { + .css_released = cpu_cgroup_css_released, + .css_free = cpu_cgroup_css_free, + .css_extra_stat_show = cpu_extra_stat_show, ++ .css_local_stat_show = cpu_local_stat_show, + #ifdef CONFIG_RT_GROUP_SCHED + .can_attach = cpu_cgroup_can_attach, + #endif diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5a9a4b81c972..f827067ad03b 100644 --- a/kernel/sched/deadline.c @@ -16346,10 +13775,18 @@ index 5a9a4b81c972..f827067ad03b 100644 { raw_spin_lock_init(&dl_b->lock); diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 0b2340a79b65..066ff1c8ae4e 100644 +index 0b2340a79b65..aeeba46a096b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c -@@ -777,7 +777,7 @@ static void print_cpu(struct seq_file *m, int cpu) +@@ -427,6 +427,7 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) + #undef SDM + + debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); ++ debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); + } + + void update_sched_domain_debugfs(void) +@@ -777,7 +778,7 @@ static void print_cpu(struct seq_file *m, int cpu) #define P(x) \ do { \ if (sizeof(rq->x) == 4) \ @@ -16359,7 +13796,7 @@ index 0b2340a79b65..066ff1c8ae4e 100644 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\ } while (0) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 9671df93d1f5..9fe8288b1b1f 100644 +index 9671df93d1f5..64cbea29b007 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1082,6 +1082,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -16410,7 +13847,87 @@ index 9671df93d1f5..9fe8288b1b1f 100644 struct task_numa_env { struct task_struct *p; -@@ -7045,6 +7045,37 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool +@@ -4805,6 +4805,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + } + + static void check_enqueue_throttle(struct cfs_rq *cfs_rq); ++static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); + + static inline bool cfs_bandwidth_used(void); + +@@ -4891,8 +4892,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + if (cfs_rq->nr_running == 1) { + check_enqueue_throttle(cfs_rq); +- if (!throttled_hierarchy(cfs_rq)) ++ if (!throttled_hierarchy(cfs_rq)) { + list_add_leaf_cfs_rq(cfs_rq); ++ } else { ++#ifdef CONFIG_CFS_BANDWIDTH ++ struct rq *rq = rq_of(cfs_rq); ++ ++ if (cfs_rq_throttled(cfs_rq) && !cfs_rq->throttled_clock) ++ cfs_rq->throttled_clock = rq_clock(rq); ++ if (!cfs_rq->throttled_clock_self) ++ cfs_rq->throttled_clock_self = rq_clock(rq); ++#endif ++ } + } + } + +@@ -5395,6 +5406,17 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); ++ ++ if (cfs_rq->throttled_clock_self) { ++ u64 delta = rq_clock(rq) - cfs_rq->throttled_clock_self; ++ ++ cfs_rq->throttled_clock_self = 0; ++ ++ if (SCHED_WARN_ON((s64)delta < 0)) ++ delta = 0; ++ ++ cfs_rq->throttled_clock_self_time += delta; ++ } + } + + return 0; +@@ -5409,6 +5431,10 @@ static int tg_throttle_down(struct task_group *tg, void *data) + if 
(!cfs_rq->throttle_count) { + cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq); + list_del_leaf_cfs_rq(cfs_rq); ++ ++ SCHED_WARN_ON(cfs_rq->throttled_clock_self); ++ if (cfs_rq->nr_running) ++ cfs_rq->throttled_clock_self = rq_clock(rq); + } + cfs_rq->throttle_count++; + +@@ -5498,7 +5524,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + * throttled-list. rq->lock protects completion. + */ + cfs_rq->throttled = 1; +- cfs_rq->throttled_clock = rq_clock(rq); ++ SCHED_WARN_ON(cfs_rq->throttled_clock); ++ if (cfs_rq->nr_running) ++ cfs_rq->throttled_clock = rq_clock(rq); + return true; + } + +@@ -5516,7 +5544,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + update_rq_clock(rq); + + raw_spin_lock(&cfs_b->lock); +- cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; ++ if (cfs_rq->throttled_clock) { ++ cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; ++ cfs_rq->throttled_clock = 0; ++ } + list_del_rcu(&cfs_rq->throttled_list); + raw_spin_unlock(&cfs_b->lock); + +@@ -7045,6 +7076,37 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return idle_cpu; } @@ -16448,7 +13965,7 @@ index 9671df93d1f5..9fe8288b1b1f 100644 /* * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which * the task fits. If no CPU is big enough, but there are idle ones, try to -@@ -7217,6 +7248,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) +@@ -7217,6 +7279,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; @@ -16461,7 +13978,37 @@ index 9671df93d1f5..9fe8288b1b1f 100644 return target; } -@@ -9349,96 +9386,61 @@ group_type group_classify(unsigned int imbalance_pct, +@@ -8358,6 +8426,11 @@ enum group_type { + * more powerful CPU. + */ + group_misfit_task, ++ /* ++ * Balance SMT group that's fully busy. Can benefit from migration ++ * a task on SMT with busy sibling to another CPU on idle core. ++ */ ++ group_smt_balance, + /* + * SD_ASYM_PACKING only: One local CPU with higher capacity is available, + * and the task should be migrated to it instead of running on the +@@ -9066,6 +9139,7 @@ struct sg_lb_stats { + unsigned int group_weight; + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ ++ unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; +@@ -9339,6 +9413,9 @@ group_type group_classify(unsigned int imbalance_pct, + if (sgs->group_asym_packing) + return group_asym_packing; + ++ if (sgs->group_smt_balance) ++ return group_smt_balance; ++ + if (sgs->group_misfit_task_load) + return group_misfit_task; + +@@ -9349,98 +9426,128 @@ group_type group_classify(unsigned int imbalance_pct, } /** @@ -16491,31 +14038,31 @@ index 9671df93d1f5..9fe8288b1b1f 100644 * @sds: Load-balancing data with statistics of the local group * @sgs: Load-balancing statistics of the candidate busiest group - * @sg: The candidate busiest group -+ * @group: The candidate busiest group - * +- * - * Check the state of the SMT siblings of both @sds::local and @sg and decide - * if @dst_cpu can pull tasks. -+ * @env::dst_cpu can do asym_packing if it has higher priority than the -+ * preferred CPU of @group. 
++ * @group: The candidate busiest group * - * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of - * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks - * only if @dst_cpu has higher priority. -+ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu -+ * can do asym_packing balance only if all its SMT siblings are idle. Also, it -+ * can only do it if @group is an SMT group and has exactly on busy CPU. Larger -+ * imbalances in the number of CPUS are dealt with in find_busiest_group(). ++ * @env::dst_cpu can do asym_packing if it has higher priority than the ++ * preferred CPU of @group. * - * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more - * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority. - * Bigger imbalances in the number of busy CPUs will be dealt with in - * update_sd_pick_busiest(). -+ * If we are balancing load within an SMT core, or at DIE domain level, always -+ * proceed. ++ * SMT is a special case. If we are balancing load between cores, @env::dst_cpu ++ * can do asym_packing balance only if all its SMT siblings are idle. Also, it ++ * can only do it if @group is an SMT group and has exactly on busy CPU. Larger ++ * imbalances in the number of CPUS are dealt with in find_busiest_group(). * - * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings - * of @dst_cpu are idle and @sg has lower priority. -- * ++ * If we are balancing load within an SMT core, or at DIE domain level, always ++ * proceed. + * - * Return: true if @dst_cpu can pull tasks, false otherwise. + * Return: true if @env::dst_cpu can do with asym_packing load balance. False + * otherwise. @@ -16544,7 +14091,10 @@ index 9671df93d1f5..9fe8288b1b1f 100644 - */ - if (sg_busy_cpus >= 2) /* implies sg_is_smt */ - return true; -- ++ /* Ensure that the whole local core is idle, if applicable. */ ++ if (!sched_use_asym_prio(env->sd, env->dst_cpu)) ++ return false; + - /* - * @dst_cpu does not have SMT siblings. @sg may have SMT - * siblings and only one is busy. In such case, @dst_cpu @@ -16552,20 +14102,40 @@ index 9671df93d1f5..9fe8288b1b1f 100644 - * it has no running tasks). - */ - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); -- } -- ++ /* ++ * CPU priorities does not make sense for SMT cores with more than one ++ * busy sibling. ++ */ ++ if (group->flags & SD_SHARE_CPUCAPACITY) { ++ if (sgs->group_weight - sgs->idle_cpus != 1) ++ return false; + } + - /* @dst_cpu has SMT siblings. */ -- ++ return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); ++} + - if (sg_is_smt) { - int local_busy_cpus = sds->local->group_weight - - sds->local_stat.idle_cpus; - int busy_cpus_delta = sg_busy_cpus - local_busy_cpus; -- ++/* One group has more than one SMT CPU while the other group does not */ ++static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1, ++ struct sched_group *sg2) ++{ ++ if (!sg1 || !sg2) ++ return false; + - if (busy_cpus_delta == 1) - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); -- -+ /* Ensure that the whole local core is idle, if applicable. 
*/ -+ if (!sched_use_asym_prio(env->sd, env->dst_cpu)) ++ return (sg1->flags & SD_SHARE_CPUCAPACITY) != ++ (sg2->flags & SD_SHARE_CPUCAPACITY); ++} + ++static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ if (env->idle == CPU_NOT_IDLE) return false; - } @@ -16573,35 +14143,88 @@ index 9671df93d1f5..9fe8288b1b1f 100644 - * @sg does not have SMT siblings. Ensure that @sds::local does not end - * up with more than one busy SMT sibling and only pull tasks if there - * are not busy CPUs (i.e., no CPU has running tasks). -+ * CPU priorities does not make sense for SMT cores with more than one -+ * busy sibling. ++ * For SMT source group, it is better to move a task ++ * to a CPU that doesn't have multiple tasks sharing its CPU capacity. ++ * Note that if a group has a single SMT, SD_SHARE_CPUCAPACITY ++ * will not be on. */ - if (!sds->local_stat.sum_nr_running) - return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu); -- -- return false; ++ if (group->flags & SD_SHARE_CPUCAPACITY && ++ sgs->sum_h_nr_running > 1) ++ return true; + + return false; -#else - /* Always return false so that callers deal with non-SMT cases. */ - return false; -#endif --} -- + } + -static inline bool -sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs, - struct sched_group *group) --{ ++static inline long sibling_imbalance(struct lb_env *env, ++ struct sd_lb_stats *sds, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *local) + { - /* Only do SMT checks if either local or candidate have SMT siblings */ - if ((sds->local->flags & SD_SHARE_CPUCAPACITY) || - (group->flags & SD_SHARE_CPUCAPACITY)) - return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group); -+ if (group->flags & SD_SHARE_CPUCAPACITY) { -+ if (sgs->group_weight - sgs->idle_cpus != 1) -+ return false; -+ } ++ int ncores_busiest, ncores_local; ++ long imbalance; - return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); +- return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu); ++ if (env->idle == CPU_NOT_IDLE || !busiest->sum_nr_running) ++ return 0; ++ ++ ncores_busiest = sds->busiest->cores; ++ ncores_local = sds->local->cores; ++ ++ if (ncores_busiest == ncores_local) { ++ imbalance = busiest->sum_nr_running; ++ lsub_positive(&imbalance, local->sum_nr_running); ++ return imbalance; ++ } ++ ++ /* Balance such that nr_running/ncores ratio are same on both groups */ ++ imbalance = ncores_local * busiest->sum_nr_running; ++ lsub_positive(&imbalance, ncores_busiest * local->sum_nr_running); ++ /* Normalize imbalance and do rounding on normalization */ ++ imbalance = 2 * imbalance + ncores_local + ncores_busiest; ++ imbalance /= ncores_local + ncores_busiest; ++ ++ /* Take advantage of resource in an empty sched group */ ++ if (imbalance == 0 && local->sum_nr_running == 0 && ++ busiest->sum_nr_running > 1) ++ imbalance = 2; ++ ++ return imbalance; } -@@ -9628,10 +9630,22 @@ static bool update_sd_pick_busiest(struct lb_env *env, + + static inline bool +@@ -9535,6 +9642,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, + sgs->group_asym_packing = 1; + } + ++ /* Check for loaded SMT group to be balanced to dst CPU */ ++ if (!local_group && smt_balance(env, sgs, group)) ++ sgs->group_smt_balance = 1; ++ + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + /* Computing avg_load makes sense only when group is overloaded */ +@@ -9619,6 +9730,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, + return false; + 
break; + ++ case group_smt_balance: + case group_fully_busy: + /* + * Select the fully busy group with highest avg_load. In +@@ -9628,13 +9740,37 @@ static bool update_sd_pick_busiest(struct lb_env *env, * contention when accessing shared HW resources. * * XXX for now avg_load is not computed and always 0 so we @@ -16626,7 +14249,38 @@ index 9671df93d1f5..9fe8288b1b1f 100644 break; case group_has_spare: -@@ -10106,7 +10120,6 @@ static void update_idle_cpu_scan(struct lb_env *env, ++ /* ++ * Do not pick sg with SMT CPUs over sg with pure CPUs, ++ * as we do not want to pull task off SMT core with one task ++ * and make the core idle. ++ */ ++ if (smt_vs_nonsmt_groups(sds->busiest, sg)) { ++ if (sg->flags & SD_SHARE_CPUCAPACITY && sgs->sum_h_nr_running <= 1) ++ return false; ++ else ++ return true; ++ } ++ + /* + * Select not overloaded group with lowest number of idle cpus + * and highest number of running tasks. We could also compare +@@ -9831,6 +9967,7 @@ static bool update_pick_idlest(struct sched_group *idlest, + + case group_imbalanced: + case group_asym_packing: ++ case group_smt_balance: + /* Those types are not used in the slow wakeup path */ + return false; + +@@ -9962,6 +10099,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) + + case group_imbalanced: + case group_asym_packing: ++ case group_smt_balance: + /* Those type are not used in the slow wakeup path */ + return NULL; + +@@ -10106,7 +10244,6 @@ static void update_idle_cpu_scan(struct lb_env *env, static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds) { @@ -16634,7 +14288,7 @@ index 9671df93d1f5..9fe8288b1b1f 100644 struct sched_group *sg = env->sd->groups; struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; -@@ -10147,8 +10160,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd +@@ -10147,8 +10284,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd sg = sg->next; } while (sg != env->sd->groups); @@ -16650,7 +14304,37 @@ index 9671df93d1f5..9fe8288b1b1f 100644 if (env->sd->flags & SD_NUMA) -@@ -10458,7 +10476,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env) +@@ -10212,6 +10354,13 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + ++ if (busiest->group_type == group_smt_balance) { ++ /* Reduce number of tasks sharing CPU capacity */ ++ env->migration_type = migrate_task; ++ env->imbalance = 1; ++ return; ++ } ++ + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +@@ -10259,14 +10408,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + } + + if (busiest->group_weight == 1 || sds->prefer_sibling) { +- unsigned int nr_diff = busiest->sum_nr_running; + /* + * When prefer sibling, evenly spread running tasks on + * groups. + */ + env->migration_type = migrate_task; +- lsub_positive(&nr_diff, local->sum_nr_running); +- env->imbalance = nr_diff; ++ env->imbalance = sibling_imbalance(env, sds, busiest, local); + } else { + + /* +@@ -10458,22 +10605,32 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto out_balanced; } @@ -16660,9 +14344,50 @@ index 9671df93d1f5..9fe8288b1b1f 100644 + * group's child domain. 
+ */ if (sds.prefer_sibling && local->group_type == group_has_spare && - busiest->sum_nr_running > local->sum_nr_running + 1) +- busiest->sum_nr_running > local->sum_nr_running + 1) ++ sibling_imbalance(env, &sds, busiest, local) > 1) goto force_balance; -@@ -10560,8 +10581,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, + + if (busiest->group_type != group_overloaded) { +- if (env->idle == CPU_NOT_IDLE) ++ if (env->idle == CPU_NOT_IDLE) { + /* + * If the busiest group is not overloaded (and as a + * result the local one too) but this CPU is already + * busy, let another idle CPU try to pull task. + */ + goto out_balanced; ++ } ++ ++ if (busiest->group_type == group_smt_balance && ++ smt_vs_nonsmt_groups(sds.local, sds.busiest)) { ++ /* Let non SMT CPU pull from SMT CPU sharing with sibling */ ++ goto force_balance; ++ } + + if (busiest->group_weight > 1 && +- local->idle_cpus <= (busiest->idle_cpus + 1)) ++ local->idle_cpus <= (busiest->idle_cpus + 1)) { + /* + * If the busiest group is not overloaded + * and there is no imbalance between this and busiest +@@ -10484,12 +10641,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env) + * there is more than 1 CPU per group. + */ + goto out_balanced; ++ } + +- if (busiest->sum_h_nr_running == 1) ++ if (busiest->sum_h_nr_running == 1) { + /* + * busiest doesn't have any tasks waiting to run + */ + goto out_balanced; ++ } + } + + force_balance: +@@ -10560,8 +10719,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, nr_running == 1) continue; @@ -16679,7 +14404,7 @@ index 9671df93d1f5..9fe8288b1b1f 100644 sched_asym_prefer(i, env->dst_cpu) && nr_running == 1) continue; -@@ -10650,12 +10678,19 @@ static inline bool +@@ -10650,12 +10816,19 @@ static inline bool asym_active_balance(struct lb_env *env) { /* @@ -16703,7 +14428,41 @@ index 9671df93d1f5..9fe8288b1b1f 100644 } static inline bool -@@ -10762,7 +10797,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, +@@ -10709,7 +10882,7 @@ static int active_load_balance_cpu_stop(void *data); + static int should_we_balance(struct lb_env *env) + { + struct sched_group *sg = env->sd->groups; +- int cpu; ++ int cpu, idle_smt = -1; + + /* + * Ensure the balancing environment is consistent; can happen +@@ -10736,10 +10909,24 @@ static int should_we_balance(struct lb_env *env) + if (!idle_cpu(cpu)) + continue; + ++ /* ++ * Don't balance to idle SMT in busy core right away when ++ * balancing cores, but remember the first idle SMT CPU for ++ * later consideration. Find CPU on an idle core first. ++ */ ++ if (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && !is_core_idle(cpu)) { ++ if (idle_smt == -1) ++ idle_smt = cpu; ++ continue; ++ } ++ + /* Are we the first idle CPU? */ + return cpu == env->dst_cpu; + } + ++ if (idle_smt == env->dst_cpu) ++ return true; ++ + /* Are we the first CPU of this group ? */ + return group_balance_cpu(sg) == env->dst_cpu; + } +@@ -10762,7 +10949,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, .sd = sd, .dst_cpu = this_cpu, .dst_rq = this_rq, @@ -16712,7 +14471,7 @@ index 9671df93d1f5..9fe8288b1b1f 100644 .idle = idle, .loop_break = SCHED_NR_MIGRATE_BREAK, .cpus = cpus, -@@ -11389,9 +11424,13 @@ static void nohz_balancer_kick(struct rq *rq) +@@ -11389,9 +11576,13 @@ static void nohz_balancer_kick(struct rq *rq) * When ASYM_PACKING; see if there's a more preferred CPU * currently idle; in which case, kick the ILB to move tasks * around. 
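
/*
 * Illustrative sketch (not part of the patch): the should_we_balance()
 * change above prefers a CPU whose whole core is idle and only falls back
 * to an idle SMT sibling of a busy core when no idle core exists. The toy
 * scan below restates that policy over plain arrays; idle[] and
 * core_idle[] stand in for idle_cpu() and is_core_idle().
 */
#include <stdio.h>

static int pick_balance_cpu(const int *idle, const int *core_idle, int nr_cpus)
{
	int cpu, idle_smt = -1;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (!idle[cpu])
			continue;

		/* Remember the first idle sibling of a busy core... */
		if (!core_idle[cpu]) {
			if (idle_smt == -1)
				idle_smt = cpu;
			continue;
		}

		/* ...but prefer the first CPU on a fully idle core. */
		return cpu;
	}

	return idle_smt;	/* may be -1: no idle CPU at all */
}

int main(void)
{
	/* CPU1 is an idle sibling of a busy core; CPUs 2-3 form an idle core. */
	int idle[]      = { 0, 1, 1, 1 };
	int core_idle[] = { 0, 0, 1, 1 };

	printf("%d\n", pick_balance_cpu(idle, core_idle, 4));	/* 2, not 1 */
	return 0;
}
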
@@ -16740,9 +14499,18 @@ index ee7f23c76bd3..9e390eb82e38 100644 /* * Issue a WARN when we do multiple update_rq_clock() calls diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c -index e072f6b31bf3..81fca77397f6 100644 +index e072f6b31bf3..2ccb0b2ebd78 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c +@@ -140,7 +140,7 @@ + static int psi_bug __read_mostly; + + DEFINE_STATIC_KEY_FALSE(psi_disabled); +-DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); ++static DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); + + #ifdef CONFIG_PSI_DEFAULT_DISABLED + static bool psi_enable; @@ -160,7 +160,6 @@ __setup("psi=", setup_psi); #define EXP_300s 2034 /* 1/exp(2s/300s) */ @@ -16784,7 +14552,7 @@ index e072f6b31bf3..81fca77397f6 100644 if (group->rtpoll_states == 0) { group->rtpoll_until = 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index ec7b3e0a2b20..d8ba81c66579 100644 +index ec7b3e0a2b20..0605fb53816d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -286,12 +286,6 @@ struct rt_bandwidth { @@ -16800,7 +14568,16 @@ index ec7b3e0a2b20..d8ba81c66579 100644 static inline int dl_bandwidth_enabled(void) { return sysctl_sched_rt_runtime >= 0; -@@ -1772,6 +1766,13 @@ queue_balance_callback(struct rq *rq, +@@ -642,6 +636,8 @@ struct cfs_rq { + u64 throttled_clock; + u64 throttled_clock_pelt; + u64 throttled_clock_pelt_time; ++ u64 throttled_clock_self; ++ u64 throttled_clock_self_time; + int throttled; + int throttle_count; + struct list_head throttled_list; +@@ -1772,6 +1768,13 @@ queue_balance_callback(struct rq *rq, for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ __sd; __sd = __sd->parent) @@ -16814,7 +14591,7 @@ index ec7b3e0a2b20..d8ba81c66579 100644 /** * highest_flag_domain - Return highest sched_domain containing flag. * @cpu: The CPU whose highest level of sched domain is to -@@ -1779,16 +1780,25 @@ queue_balance_callback(struct rq *rq, +@@ -1779,16 +1782,25 @@ queue_balance_callback(struct rq *rq, * @flag: The flag to check for the highest sched_domain * for the given CPU. 
* @@ -16843,7 +14620,15 @@ index ec7b3e0a2b20..d8ba81c66579 100644 } return hsd; -@@ -2378,7 +2388,6 @@ extern struct rt_bandwidth def_rt_bandwidth; +@@ -1844,6 +1856,7 @@ struct sched_group { + atomic_t ref; + + unsigned int group_weight; ++ unsigned int cores; + struct sched_group_capacity *sgc; + int asym_prefer_cpu; /* CPU of highest priority in group */ + int flags; +@@ -2378,7 +2391,6 @@ extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); @@ -16852,7 +14637,7 @@ index ec7b3e0a2b20..d8ba81c66579 100644 extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 6682535e37c8..ca4472281c28 100644 +index 6682535e37c8..8c7c0b64e615 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -719,8 +719,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) @@ -16870,13 +14655,54 @@ index 6682535e37c8..ca4472281c28 100644 /* * Transfer SD_PREFER_SIBLING down in case of a * degenerate parent; the spans match for this +@@ -1270,14 +1275,26 @@ build_sched_groups(struct sched_domain *sd, int cpu) + static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + { + struct sched_group *sg = sd->groups; ++ struct cpumask *mask = sched_domains_tmpmask2; + + WARN_ON(!sg); + + do { +- int cpu, max_cpu = -1; ++ int cpu, cores = 0, max_cpu = -1; + + sg->group_weight = cpumask_weight(sched_group_span(sg)); + ++ cpumask_copy(mask, sched_group_span(sg)); ++ for_each_cpu(cpu, mask) { ++ cores++; ++#ifdef CONFIG_SCHED_SMT ++ cpumask_andnot(mask, mask, cpu_smt_mask(cpu)); ++#else ++ __cpumask_clear_cpu(cpu, mask); ++#endif ++ } ++ sg->cores = cores; ++ + if (!(sd->flags & SD_ASYM_PACKING)) + goto next; + +diff --git a/kernel/softirq.c b/kernel/softirq.c +index 1b725510dd0f..a5758661875c 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -630,7 +630,7 @@ static inline void tick_irq_exit(void) + int cpu = smp_processor_id(); + + /* Make sure that timer wheel updates are propagated */ +- if ((idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { ++ if ((sched_core_idle_cpu(cpu) && !need_resched()) || tick_nohz_full_cpu(cpu)) { + if (!in_hardirq()) + tick_nohz_irq_exit(); + } -- 2.41.0 -From ad2d7caffe845c4f984702220297e98accf7d4fb Mon Sep 17 00:00:00 2001 +From da924e0790a68d55b9e03f6892a1cd82c98b660b Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Tue, 9 May 2023 18:40:05 +0200 -Subject: [PATCH 8/8] zstd 1.5.5 +Date: Tue, 11 Jul 2023 19:26:20 +0200 +Subject: [PATCH 7/7] zstd 1.5.5 Signed-off-by: Peter Jung --- diff --git a/patches/0002-eevdf.patch b/patches/0002-eevdf.patch index ae65e0d..63175c9 100644 --- a/patches/0002-eevdf.patch +++ b/patches/0002-eevdf.patch @@ -1,6 +1,6 @@ -From 5e4ded34523fcaf5aea5c77d45239b6dd33f1c91 Mon Sep 17 00:00:00 2001 +From d5ebb5aa8f44f2a81002becad5f85b6e70801575 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Thu, 1 Jun 2023 16:37:55 +0200 +Date: Tue, 11 Jul 2023 19:27:06 +0200 Subject: [PATCH] EEVDF Signed-off-by: Peter Jung @@ -13,14 +13,14 @@ Signed-off-by: Peter Jung init/init_task.c | 3 +- kernel/sched/core.c | 65 +- kernel/sched/debug.c | 49 +- - kernel/sched/fair.c | 1152 +++++++++++------------ + kernel/sched/fair.c | 1157 +++++++++++------------ kernel/sched/features.h | 24 +- kernel/sched/sched.h | 22 +- tools/include/uapi/linux/sched.h | 4 +- - 12 files changed, 726 insertions(+), 
660 deletions(-) + 12 files changed, 733 insertions(+), 658 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst -index f67c0829350b..a39dfda3d032 100644 +index e592a9364473..c826ab4e2e1a 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1121,6 +1121,16 @@ All time durations are in microseconds. @@ -78,7 +78,7 @@ index 7ee7ed5de722..6dbc5a1bf6a8 100644 * Template for declaring augmented rbtree callbacks (generic case) * diff --git a/include/linux/sched.h b/include/linux/sched.h -index eed5d65b8d1f..63ac38d66ec6 100644 +index 8473324705ca..88c3e7ba8992 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -550,13 +550,18 @@ struct sched_entity { @@ -196,7 +196,7 @@ index ff6c4b9bfe6b..511cbcf3510d 100644 .rt = { .run_list = LIST_HEAD_INIT(init_task.rt.run_list), diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index bcb3a7e684ca..3bcb77b00e5b 100644 +index 1b971c69d3a2..df2f22a9729c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1305,6 +1305,12 @@ static void set_load_weight(struct task_struct *p, bool update_load) @@ -232,7 +232,7 @@ index bcb3a7e684ca..3bcb77b00e5b 100644 /* * We don't need the reset flag anymore after the fork. It has -@@ -7512,7 +7522,7 @@ static struct task_struct *find_process_by_pid(pid_t pid) +@@ -7525,7 +7535,7 @@ static struct task_struct *find_process_by_pid(pid_t pid) #define SETPARAM_POLICY -1 static void __setscheduler_params(struct task_struct *p, @@ -241,7 +241,7 @@ index bcb3a7e684ca..3bcb77b00e5b 100644 { int policy = attr->sched_policy; -@@ -7536,6 +7546,13 @@ static void __setscheduler_params(struct task_struct *p, +@@ -7549,6 +7559,13 @@ static void __setscheduler_params(struct task_struct *p, set_load_weight(p, true); } @@ -255,7 +255,7 @@ index bcb3a7e684ca..3bcb77b00e5b 100644 /* * Check the target process has a UID that matches the current process's: */ -@@ -7676,6 +7693,13 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7682,6 +7699,13 @@ static int __sched_setscheduler(struct task_struct *p, return retval; } @@ -266,10 +266,10 @@ index bcb3a7e684ca..3bcb77b00e5b 100644 + return -EINVAL; + } + - if (pi) - cpuset_read_lock(); - -@@ -7710,6 +7734,9 @@ static int __sched_setscheduler(struct task_struct *p, + /* Update task specific "requested" clamps */ + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { + retval = uclamp_validate(p, attr); +@@ -7723,6 +7747,9 @@ static int __sched_setscheduler(struct task_struct *p, goto change; if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) goto change; @@ -279,7 +279,7 @@ index bcb3a7e684ca..3bcb77b00e5b 100644 p->sched_reset_on_fork = reset_on_fork; retval = 0; -@@ -7798,6 +7825,7 @@ static int __sched_setscheduler(struct task_struct *p, +@@ -7811,6 +7838,7 @@ static int __sched_setscheduler(struct task_struct *p, __setscheduler_params(p, attr); __setscheduler_prio(p, newprio); } @@ -287,7 +287,7 @@ index bcb3a7e684ca..3bcb77b00e5b 100644 __setscheduler_uclamp(p, attr); if (queued) { -@@ -8008,6 +8036,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a +@@ -8021,6 +8049,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a size < SCHED_ATTR_SIZE_VER1) return -EINVAL; @@ -297,7 +297,7 @@ index bcb3a7e684ca..3bcb77b00e5b 100644 /* * XXX: Do we want to be lenient like existing syscalls; or do we want * to be strict and return an error on out-of-bounds values? 
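
/*
 * Illustrative sketch (not part of the patch): the fair.c hunks later in
 * this EEVDF patch only pick entities whose "lag" is non-negative, i.e.
 * whose vruntime does not exceed the load-weighted average vruntime of
 * the queue. The toy below restates that eligibility test for a flat
 * array of entities; it ignores the min_vruntime-relative keys the
 * kernel uses to keep the weighted sums small.
 */
#include <stdio.h>

struct toy_entity {
	long long	vruntime;
	unsigned long	weight;
};

static int entity_eligible_sketch(const struct toy_entity *q, int nr, int i)
{
	long long wsum = 0;
	long long wtot = 0;
	int j;

	for (j = 0; j < nr; j++) {
		wsum += q[j].vruntime * (long long)q[j].weight;
		wtot += q[j].weight;
	}

	/* lag_i = V - v_i >= 0, with V = sum(w_j * v_j) / sum(w_j) */
	return q[i].vruntime * wtot <= wsum;
}

int main(void)
{
	struct toy_entity q[] = { { 100, 1024 }, { 220, 1024 } };

	/* V = 160: entity 0 (v=100) is eligible, entity 1 (v=220) is not. */
	printf("%d %d\n", entity_eligible_sketch(q, 2, 0),
			  entity_eligible_sketch(q, 2, 1));
	return 0;
}
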
-@@ -8245,6 +8276,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, +@@ -8258,6 +8289,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, get_params(p, &kattr); kattr.sched_flags &= SCHED_FLAG_ALL; @@ -306,7 +306,7 @@ index bcb3a7e684ca..3bcb77b00e5b 100644 #ifdef CONFIG_UCLAMP_TASK /* * This could race with another potential updater, but this is fine -@@ -11181,6 +11214,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, +@@ -11215,6 +11248,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, { return sched_group_set_idle(css_tg(css), idle); } @@ -332,7 +332,7 @@ index bcb3a7e684ca..3bcb77b00e5b 100644 #endif static struct cftype cpu_legacy_files[] = { -@@ -11195,6 +11247,11 @@ static struct cftype cpu_legacy_files[] = { +@@ -11229,6 +11281,11 @@ static struct cftype cpu_legacy_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, @@ -344,7 +344,7 @@ index bcb3a7e684ca..3bcb77b00e5b 100644 #endif #ifdef CONFIG_CFS_BANDWIDTH { -@@ -11412,6 +11469,12 @@ static struct cftype cpu_files[] = { +@@ -11468,6 +11525,12 @@ static struct cftype cpu_files[] = { .read_s64 = cpu_idle_read_s64, .write_s64 = cpu_idle_write_s64, }, @@ -358,7 +358,7 @@ index bcb3a7e684ca..3bcb77b00e5b 100644 #ifdef CONFIG_CFS_BANDWIDTH { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 066ff1c8ae4e..e7e83181fbb6 100644 +index aeeba46a096b..5c743bcb340d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -347,10 +347,7 @@ static __init int sched_init_debug(void) @@ -373,7 +373,7 @@ index 066ff1c8ae4e..e7e83181fbb6 100644 debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -@@ -581,9 +578,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -582,9 +579,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) else SEQ_printf(m, " %c", task_state_to_char(p)); @@ -388,7 +388,7 @@ index 066ff1c8ae4e..e7e83181fbb6 100644 (long long)(p->nvcsw + p->nivcsw), p->prio); -@@ -626,10 +627,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) +@@ -627,10 +628,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { @@ -401,7 +401,7 @@ index 066ff1c8ae4e..e7e83181fbb6 100644 unsigned long flags; #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -643,26 +643,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +@@ -644,26 +644,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); @@ -441,7 +441,7 @@ index 066ff1c8ae4e..e7e83181fbb6 100644 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); -@@ -863,10 +862,7 @@ static void sched_debug_header(struct seq_file *m) +@@ -864,10 +863,7 @@ static void sched_debug_header(struct seq_file *m) SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) @@ -453,7 +453,7 @@ index 066ff1c8ae4e..e7e83181fbb6 100644 P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN -@@ -1089,6 +1085,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, +@@ -1090,6 +1086,7 @@ void 
proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, #endif P(policy); P(prio); @@ -462,7 +462,7 @@ index 066ff1c8ae4e..e7e83181fbb6 100644 P(dl.runtime); P(dl.deadline); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 9fe8288b1b1f..97678b9b4023 100644 +index 64cbea29b007..36dcf4770830 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -47,6 +47,7 @@ @@ -908,8 +908,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 + + while (node) { + struct sched_entity *se = __node_2_se(node); - -- return __node_2_se(next); ++ + /* + * If this entity is not eligible, try the left subtree. + */ @@ -928,7 +927,8 @@ index 9fe8288b1b1f..97678b9b4023 100644 + if (best->deadline == best->min_deadline) + break; + } -+ + +- return __node_2_se(next); + /* + * If the earlest deadline in this subtree is in the fully + * eligible left half of our space, go there. @@ -984,12 +984,12 @@ index 9fe8288b1b1f..97678b9b4023 100644 { - if (unlikely(se->load.weight != NICE_0_LOAD)) - delta = __calc_delta(delta, NICE_0_LOAD, &se->load); +- +- return delta; +-} + u32 weight = sched_prio_to_weight[prio]; + u64 base = sysctl_sched_base_slice; -- return delta; --} -- -/* - * The idea is to set a period in which each task runs once. - * @@ -1149,7 +1149,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 } void reweight_task(struct task_struct *p, int prio) -@@ -4710,158 +4918,151 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} +@@ -4710,98 +4918,140 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -1168,11 +1168,12 @@ index 9fe8288b1b1f..97678b9b4023 100644 - -static inline bool entity_is_long_sleeper(struct sched_entity *se) +static inline bool -+entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) ++entity_has_slept(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 vslice, int flags) { - struct cfs_rq *cfs_rq; - u64 sleep_time; -+ u64 now; ++ u64 now, vdelta; ++ s64 delta; - if (se->exec_start == 0) + if (!(flags & ENQUEUE_WAKEUP)) @@ -1181,19 +1182,25 @@ index 9fe8288b1b1f..97678b9b4023 100644 - cfs_rq = cfs_rq_of(se); - - sleep_time = rq_clock_task(rq_of(cfs_rq)); -- ++ if (flags & ENQUEUE_MIGRATED) ++ return true; + - /* Happen while migrating because of clock task divergence */ - if (sleep_time <= se->exec_start) -- return false; -- ++ now = rq_clock_task(rq_of(cfs_rq)); ++ delta = now - se->exec_start; ++ if (delta < 0) + return false; + - sleep_time -= se->exec_start; - if (sleep_time > ((1ULL << 63) / scale_load_down(NICE_0_LOAD))) -+ if (flags & ENQUEUE_MIGRATED) - return true; +- return true; ++ vdelta = __calc_delta(delta, NICE_0_LOAD, &cfs_rq->load); ++ if (vdelta < vslice) ++ return false; - return false; -+ now = rq_clock_task(rq_of(cfs_rq)); -+ return (s64)(se->exec_start - now) >= se->slice; ++ return true; } static void @@ -1239,7 +1246,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 + */ + if (sched_feat(PLACE_FUDGE) && + (cfs_rq->avg_slice > se->slice * cfs_rq->avg_load) && -+ entity_has_slept(cfs_rq, se, flags)) { ++ entity_has_slept(cfs_rq, se, vslice, flags)) { + lag += vslice; + if (lag > 0) + lag = 0; @@ -1355,6 +1362,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +@@ -4809,60 +5059,20 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); static inline bool cfs_bandwidth_used(void); @@ -1417,7 +1425,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 /* * When enqueuing a sched_entity, we must: * - 
Update loads to have both entity and cfs_rq synced with now. -@@ -4873,18 +5074,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4874,18 +5084,28 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) */ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH); se_update_runnable(se); @@ -1449,7 +1457,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; -@@ -4896,17 +5107,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4907,17 +5127,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) } } @@ -1467,7 +1475,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 static void __clear_buddies_next(struct sched_entity *se) { for_each_sched_entity(se) { -@@ -4918,27 +5118,10 @@ static void __clear_buddies_next(struct sched_entity *se) +@@ -4929,27 +5138,10 @@ static void __clear_buddies_next(struct sched_entity *se) } } @@ -1495,7 +1503,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 } static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -4972,20 +5155,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -4983,20 +5175,12 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) clear_buddies(cfs_rq, se); @@ -1517,7 +1525,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); -@@ -5004,52 +5179,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5015,52 +5199,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_idle_cfs_rq_clock_pelt(cfs_rq); } @@ -1570,7 +1578,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { -@@ -5088,9 +5217,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -5099,9 +5237,6 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -1580,7 +1588,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups -@@ -5101,50 +5227,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +@@ -5112,50 +5247,14 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { @@ -1611,7 +1619,10 @@ index 9fe8288b1b1f..97678b9b4023 100644 - if (!second || (curr && entity_before(curr, second))) - second = curr; - } -- ++ if (sched_feat(NEXT_BUDDY) && ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) ++ return cfs_rq->next; + - if (second && wakeup_preempt_entity(second, left) < 1) - se = second; - } @@ -1627,16 +1638,13 @@ index 9fe8288b1b1f..97678b9b4023 100644 - */ - se = cfs_rq->last; - } -+ if (sched_feat(NEXT_BUDDY) && -+ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) -+ return cfs_rq->next; - +- - return se; + return pick_eevdf(cfs_rq); } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -5161,8 +5251,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) +@@ -5172,8 +5271,6 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); @@ -1645,7 +1653,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 if 
(prev->on_rq) { update_stats_wait_start_fair(cfs_rq, prev); /* Put 'current' back into the tree. */ -@@ -5203,9 +5291,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +@@ -5214,9 +5311,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) return; #endif @@ -1655,7 +1663,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 } -@@ -6210,13 +6295,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} +@@ -6241,13 +6335,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { struct sched_entity *se = &p->se; @@ -1670,7 +1678,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 s64 delta = slice - ran; if (delta < 0) { -@@ -6240,8 +6324,7 @@ static void hrtick_update(struct rq *rq) +@@ -6271,8 +6364,7 @@ static void hrtick_update(struct rq *rq) if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; @@ -1680,7 +1688,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 } #else /* !CONFIG_SCHED_HRTICK */ static inline void -@@ -6282,17 +6365,6 @@ static int sched_idle_rq(struct rq *rq) +@@ -6313,17 +6405,6 @@ static int sched_idle_rq(struct rq *rq) rq->nr_running); } @@ -1698,7 +1706,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 #ifdef CONFIG_SMP static int sched_idle_cpu(int cpu) { -@@ -7778,18 +7850,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) +@@ -7809,18 +7890,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) { struct sched_entity *se = &p->se; @@ -1717,7 +1725,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 if (!task_on_rq_migrating(p)) { remove_entity_load_avg(se); -@@ -7827,66 +7887,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +@@ -7858,66 +7927,6 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } #endif /* CONFIG_SMP */ @@ -1784,7 +1792,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { -@@ -7898,12 +7898,6 @@ static void set_next_buddy(struct sched_entity *se) +@@ -7929,12 +7938,6 @@ static void set_next_buddy(struct sched_entity *se) } } @@ -1797,7 +1805,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 /* * Preempt the current task with a newly woken task if needed: */ -@@ -7912,7 +7906,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7943,7 +7946,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(curr); @@ -1805,7 +1813,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 int next_buddy_marked = 0; int cse_is_idle, pse_is_idle; -@@ -7928,7 +7921,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -7959,7 +7961,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; @@ -1814,7 +1822,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 set_next_buddy(pse); next_buddy_marked = 1; } -@@ -7973,35 +7966,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -8004,35 +8006,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ if (cse_is_idle != pse_is_idle) return; @@ -1857,7 +1865,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 } #ifdef 
CONFIG_SMP -@@ -8202,8 +8179,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +@@ -8233,8 +8219,6 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) /* * sched_yield() is very simple @@ -1866,7 +1874,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 */ static void yield_task_fair(struct rq *rq) { -@@ -8219,21 +8194,19 @@ static void yield_task_fair(struct rq *rq) +@@ -8250,21 +8234,19 @@ static void yield_task_fair(struct rq *rq) clear_buddies(cfs_rq, se); @@ -1900,7 +1908,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) -@@ -8476,8 +8449,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) +@@ -8512,8 +8494,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) * Buddy candidates are cache hot: */ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && @@ -1910,7 +1918,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 return 1; if (sysctl_sched_migration_cost == -1) -@@ -11987,8 +11959,8 @@ static void rq_offline_fair(struct rq *rq) +@@ -12139,8 +12120,8 @@ static void rq_offline_fair(struct rq *rq) static inline bool __entity_slice_used(struct sched_entity *se, int min_nr_tasks) { @@ -1920,7 +1928,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 return (rtime * min_nr_tasks > slice); } -@@ -12144,8 +12116,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) +@@ -12296,8 +12277,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { @@ -1930,7 +1938,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 struct rq *rq = this_rq(); struct rq_flags rf; -@@ -12154,22 +12126,9 @@ static void task_fork_fair(struct task_struct *p) +@@ -12306,22 +12287,9 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; @@ -1955,7 +1963,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 rq_unlock(rq, &rf); } -@@ -12198,34 +12157,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +@@ -12350,34 +12318,6 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) check_preempt_curr(rq, p, 0); } @@ -1990,7 +1998,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 #ifdef CONFIG_FAIR_GROUP_SCHED /* * Propagate the changes of the sched_entity across the tg tree to make it -@@ -12296,16 +12227,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) +@@ -12448,16 +12388,6 @@ static void attach_entity_cfs_rq(struct sched_entity *se) static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -2007,7 +2015,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 detach_entity_cfs_rq(se); } -@@ -12313,12 +12234,8 @@ static void detach_task_cfs_rq(struct task_struct *p) +@@ -12465,12 +12395,8 @@ static void detach_task_cfs_rq(struct task_struct *p) static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; @@ -2020,7 +2028,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 } static void switched_from_fair(struct rq *rq, struct task_struct *p) -@@ -12429,6 +12346,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +@@ -12581,6 +12507,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) goto err; tg->shares = NICE_0_LOAD; @@ -2028,7 +2036,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 init_cfs_bandwidth(tg_cfs_bandwidth(tg)); -@@ -12527,6 +12445,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct 
cfs_rq *cfs_rq, +@@ -12679,6 +12606,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, } se->my_q = cfs_rq; @@ -2038,7 +2046,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 /* guarantee group entities always have weight */ update_load_set(&se->load, NICE_0_LOAD); se->parent = parent; -@@ -12657,6 +12578,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) +@@ -12809,6 +12739,29 @@ int sched_group_set_idle(struct task_group *tg, long idle) return 0; } @@ -2068,7 +2076,7 @@ index 9fe8288b1b1f..97678b9b4023 100644 #else /* CONFIG_FAIR_GROUP_SCHED */ void free_fair_sched_group(struct task_group *tg) { } -@@ -12683,7 +12627,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task +@@ -12835,7 +12788,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task * idle runqueue: */ if (rq->cfs.load.weight) @@ -2125,7 +2133,7 @@ index 9e390eb82e38..ca95044a7479 100644 -SCHED_FEAT(ALT_PERIOD, true) -SCHED_FEAT(BASE_SLICE, true) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index d8ba81c66579..0ea13cfac95b 100644 +index 0605fb53816d..96b1ae519f20 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -372,6 +372,8 @@ struct task_group { @@ -2166,7 +2174,7 @@ index d8ba81c66579..0ea13cfac95b 100644 #ifdef CONFIG_SCHED_DEBUG unsigned int nr_spread_over; -@@ -2167,6 +2173,7 @@ extern const u32 sched_prio_to_wmult[40]; +@@ -2170,6 +2176,7 @@ extern const u32 sched_prio_to_wmult[40]; #else #define ENQUEUE_MIGRATED 0x00 #endif @@ -2174,7 +2182,7 @@ index d8ba81c66579..0ea13cfac95b 100644 #define RETRY_TASK ((void *)-1UL) -@@ -2471,11 +2478,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +@@ -2474,11 +2481,9 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; @@ -2188,7 +2196,7 @@ index d8ba81c66579..0ea13cfac95b 100644 extern int sysctl_resched_latency_warn_ms; extern int sysctl_resched_latency_warn_once; -@@ -2488,6 +2493,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; +@@ -2491,6 +2496,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_numa_balancing_hot_threshold; #endif @@ -2197,7 +2205,7 @@ index d8ba81c66579..0ea13cfac95b 100644 #ifdef CONFIG_SCHED_HRTICK /* -@@ -3496,4 +3503,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } +@@ -3499,4 +3506,7 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } static inline void init_sched_mm_cid(struct task_struct *t) { } #endif diff --git a/patches/0002-eevdfbore.patch b/patches/0002-eevdfbore.patch index 0684d8b..e9c6776 100644 --- a/patches/0002-eevdfbore.patch +++ b/patches/0002-eevdfbore.patch @@ -1,9 +1,9 @@ -From 5e3bbb489086974a823af55f23cc17d2ea032f8b Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Sun, 18 Jun 2023 11:05:43 +0200 +From 4a346951e2b3c7de65511c95f74fdd7197e3d2e5 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Tue, 11 Jul 2023 19:31:15 +0200 Subject: [PATCH] bore-eevdf -Signed-off-by: Piotr Gorski +Signed-off-by: Peter Jung --- include/linux/sched.h | 10 +++ init/Kconfig | 20 +++++ @@ -15,7 +15,7 @@ Signed-off-by: Piotr Gorski 7 files changed, 286 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h -index 63ac38d66..63a2205a5 100644 +index 88c3e7ba8992..6b4c553aea75 100644 --- a/include/linux/sched.h 
+++ b/include/linux/sched.h @@ -560,6 +560,12 @@ struct sched_entity { @@ -43,10 +43,10 @@ index 63ac38d66..63a2205a5 100644 /* * 'ptraced' is the list of tasks this task is using ptrace() on. diff --git a/init/Kconfig b/init/Kconfig -index 0147b4a33..4ab7e154b 100644 +index b6d38eccca10..e90546df3182 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -1290,6 +1290,26 @@ config CHECKPOINT_RESTORE +@@ -1277,6 +1277,26 @@ config CHECKPOINT_RESTORE If unsure, say N here. @@ -74,7 +74,7 @@ index 0147b4a33..4ab7e154b 100644 bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 3bcb77b00..65469bc43 100644 +index df2f22a9729c..4995243a2ba4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4490,6 +4490,57 @@ int wake_up_state(struct task_struct *p, unsigned int state) @@ -118,11 +118,11 @@ index 3bcb77b00..65469bc43 100644 +static void update_task_initial_burst_time(struct task_struct *task) { + struct sched_entity *se = &task->se; + struct task_struct *par = task->real_parent; -+ u64 ktime = ktime_to_ns(ktime_get()); ++ u64 now = ktime_get_ns(); + + if (likely(par)) { -+ if (par->child_burst_last_cached + sched_burst_cache_lifetime < ktime) { -+ par->child_burst_last_cached = ktime; ++ if (par->child_burst_last_cached + sched_burst_cache_lifetime < now) { ++ par->child_burst_last_cached = now; + update_task_child_burst_time_cache(par); + } + se->prev_burst_time = max(se->prev_burst_time, par->child_burst_cache); @@ -155,20 +155,20 @@ index 3bcb77b00..65469bc43 100644 /* * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external -@@ -9955,6 +10012,11 @@ void __init sched_init(void) +@@ -9968,6 +10025,11 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 2.4.1 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 2.4.2 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index e7e83181f..c29500314 100644 +index 5c743bcb340d..755ef4c8d34b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -348,6 +348,7 @@ static __init int sched_init_debug(void) @@ -179,7 +179,7 @@ index e7e83181f..c29500314 100644 debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); -@@ -594,6 +595,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -595,6 +596,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); @@ -190,7 +190,7 @@ index e7e83181f..c29500314 100644 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 97678b9b4..c3d632800 100644 +index 36dcf4770830..30080b227866 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -247,7 +247,7 @@ index 97678b9b4..c3d632800 100644 +#ifdef CONFIG_SCHED_BORE +unsigned int __read_mostly sched_bore = 1; +unsigned int __read_mostly sched_burst_cache_lifetime = 15000000; -+unsigned int __read_mostly 
sched_burst_penalty_offset = 12; ++unsigned int __read_mostly sched_burst_penalty_offset = 18; +unsigned int __read_mostly sched_burst_penalty_scale = 1292; +unsigned int __read_mostly sched_burst_smoothness = 1; +static int three = 3; @@ -414,7 +414,7 @@ index 97678b9b4..c3d632800 100644 curr->vruntime += calc_delta_fair(delta_exec, curr); update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); -@@ -5217,6 +5351,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -5237,6 +5371,9 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -424,7 +424,7 @@ index 97678b9b4..c3d632800 100644 /* * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups -@@ -5227,14 +5364,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +@@ -5247,14 +5384,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static struct sched_entity * pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) { @@ -443,7 +443,7 @@ index 97678b9b4..c3d632800 100644 } static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); -@@ -6464,6 +6603,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6504,6 +6643,38 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } @@ -482,7 +482,7 @@ index 97678b9b4..c3d632800 100644 static void set_next_buddy(struct sched_entity *se); /* -@@ -6482,6 +6653,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6522,6 +6693,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); for_each_sched_entity(se) { @@ -492,7 +492,7 @@ index 97678b9b4..c3d632800 100644 cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); -@@ -7972,7 +8146,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ +@@ -8012,7 +8186,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ /* * XXX pick_eevdf(cfs_rq) != se ? */ @@ -501,7 +501,7 @@ index 97678b9b4..c3d632800 100644 goto preempt; return; -@@ -8185,6 +8359,9 @@ static void yield_task_fair(struct rq *rq) +@@ -8225,6 +8399,9 @@ static void yield_task_fair(struct rq *rq) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -512,7 +512,7 @@ index 97678b9b4..c3d632800 100644 /* * Are we the only task in the tree? 
diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ca95044a7..a7d34d1b2 100644 +index ca95044a7479..a7d34d1b28c5 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -13,7 +13,11 @@ SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) @@ -528,10 +528,10 @@ index ca95044a7..a7d34d1b2 100644 /* * Consider buddies to be cache hot, decreases the likeliness of a diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 0ea13cfac..34cb2fbbb 100644 +index 96b1ae519f20..cc0a17fb23c2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -2479,6 +2479,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; +@@ -2482,6 +2482,7 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_base_slice; diff --git a/patches/0003-bcachefs.patch b/patches/0003-bcachefs.patch index ec764b6..58bd87c 100644 --- a/patches/0003-bcachefs.patch +++ b/patches/0003-bcachefs.patch @@ -1,128 +1,142 @@ -From a14a7b200d823d9ba18f1dbae21705e8a3e88f22 Mon Sep 17 00:00:00 2001 +From 53d26f1e843c6117e14bf9d0b41ca7f986f4ff5b Mon Sep 17 00:00:00 2001 From: Piotr Gorski -Date: Thu, 29 Jun 2023 08:18:38 +0200 +Date: Sun, 16 Jul 2023 11:24:25 +0200 Subject: [PATCH] bcachefs Signed-off-by: Piotr Gorski --- - MAINTAINERS | 39 + + Documentation/admin-guide/sysctl/vm.rst | 16 + + Documentation/filesystems/proc.rst | 28 + + MAINTAINERS | 55 + + arch/arm64/include/asm/spectre.h | 4 +- + arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- + arch/x86/kernel/amd_gart_64.c | 2 +- block/bdev.c | 2 +- block/bio.c | 18 +- block/blk-core.c | 1 + block/blk.h | 1 - + drivers/block/virtio_blk.c | 4 +- + drivers/gpu/drm/gud/gud_drv.c | 2 +- + drivers/iommu/dma-iommu.c | 2 +- drivers/md/bcache/Kconfig | 10 +- drivers/md/bcache/Makefile | 4 +- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/super.c | 1 - drivers/md/bcache/util.h | 3 +- + drivers/mmc/core/block.c | 4 +- + drivers/mtd/spi-nor/debugfs.c | 6 +- + .../ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 4 +- + drivers/scsi/sd.c | 8 +- + drivers/xen/grant-dma-ops.c | 2 +- + drivers/xen/swiotlb-xen.c | 2 +- fs/Kconfig | 1 + fs/Makefile | 1 + fs/aio.c | 70 +- - fs/bcachefs/Kconfig | 75 + + fs/bcachefs/Kconfig | 77 + fs/bcachefs/Makefile | 74 + - fs/bcachefs/acl.c | 414 ++ + fs/bcachefs/acl.c | 412 ++ fs/bcachefs/acl.h | 58 + - fs/bcachefs/alloc_background.c | 2218 +++++++++ - fs/bcachefs/alloc_background.h | 251 ++ - fs/bcachefs/alloc_foreground.c | 1535 +++++++ + fs/bcachefs/alloc_background.c | 2209 +++++++++ + fs/bcachefs/alloc_background.h | 257 ++ + fs/bcachefs/alloc_foreground.c | 1536 +++++++ fs/bcachefs/alloc_foreground.h | 224 + fs/bcachefs/alloc_types.h | 126 + - fs/bcachefs/backpointers.c | 886 ++++ + fs/bcachefs/backpointers.c | 889 ++++ fs/bcachefs/backpointers.h | 131 + fs/bcachefs/bbpos.h | 48 + - fs/bcachefs/bcachefs.h | 1139 +++++ - fs/bcachefs/bcachefs_format.h | 2243 ++++++++++ + fs/bcachefs/bcachefs.h | 1185 +++++ + fs/bcachefs/bcachefs_format.h | 2319 ++++++++++ fs/bcachefs/bcachefs_ioctl.h | 368 ++ fs/bcachefs/bkey.c | 1063 +++++ fs/bcachefs/bkey.h | 774 ++++ fs/bcachefs/bkey_buf.h | 61 + fs/bcachefs/bkey_cmp.h | 129 + - fs/bcachefs/bkey_methods.c | 520 +++ - fs/bcachefs/bkey_methods.h | 169 + + fs/bcachefs/bkey_methods.c | 519 +++ + fs/bcachefs/bkey_methods.h | 191 + fs/bcachefs/bkey_sort.c | 201 + fs/bcachefs/bkey_sort.h | 44 + - fs/bcachefs/bset.c | 1588 +++++++ + fs/bcachefs/bset.c | 1587 +++++++ fs/bcachefs/bset.h | 541 +++ - 
fs/bcachefs/btree_cache.c | 1273 ++++++ - fs/bcachefs/btree_cache.h | 106 + - fs/bcachefs/btree_gc.c | 2130 +++++++++ + fs/bcachefs/btree_cache.c | 1277 ++++++ + fs/bcachefs/btree_cache.h | 130 + + fs/bcachefs/btree_gc.c | 2144 +++++++++ fs/bcachefs/btree_gc.h | 112 + - fs/bcachefs/btree_io.c | 2261 ++++++++++ + fs/bcachefs/btree_io.c | 2266 ++++++++++ fs/bcachefs/btree_io.h | 228 + - fs/bcachefs/btree_iter.c | 3214 ++++++++++++++ - fs/bcachefs/btree_iter.h | 916 ++++ - fs/bcachefs/btree_key_cache.c | 1087 +++++ + fs/bcachefs/btree_iter.c | 3214 +++++++++++++ + fs/bcachefs/btree_iter.h | 924 ++++ + fs/bcachefs/btree_key_cache.c | 1088 +++++ fs/bcachefs/btree_key_cache.h | 48 + - fs/bcachefs/btree_locking.c | 804 ++++ + fs/bcachefs/btree_locking.c | 797 ++++ fs/bcachefs/btree_locking.h | 424 ++ - fs/bcachefs/btree_types.h | 737 +++ + fs/bcachefs/btree_types.h | 742 +++ fs/bcachefs/btree_update.h | 357 ++ - fs/bcachefs/btree_update_interior.c | 2477 +++++++++++ + fs/bcachefs/btree_update_interior.c | 2488 ++++++++++ fs/bcachefs/btree_update_interior.h | 328 ++ - fs/bcachefs/btree_update_leaf.c | 2050 +++++++++ - fs/bcachefs/btree_write_buffer.c | 345 ++ + fs/bcachefs/btree_update_leaf.c | 2065 +++++++++ + fs/bcachefs/btree_write_buffer.c | 346 ++ fs/bcachefs/btree_write_buffer.h | 14 + fs/bcachefs/btree_write_buffer_types.h | 44 + - fs/bcachefs/buckets.c | 2200 +++++++++ - fs/bcachefs/buckets.h | 371 ++ + fs/bcachefs/buckets.c | 2171 +++++++++ + fs/bcachefs/buckets.h | 357 ++ fs/bcachefs/buckets_types.h | 92 + fs/bcachefs/buckets_waiting_for_journal.c | 166 + fs/bcachefs/buckets_waiting_for_journal.h | 15 + .../buckets_waiting_for_journal_types.h | 23 + fs/bcachefs/chardev.c | 769 ++++ fs/bcachefs/chardev.h | 31 + - fs/bcachefs/checksum.c | 712 +++ - fs/bcachefs/checksum.h | 215 + + fs/bcachefs/checksum.c | 709 +++ + fs/bcachefs/checksum.h | 209 + fs/bcachefs/clock.c | 193 + fs/bcachefs/clock.h | 38 + fs/bcachefs/clock_types.h | 37 + - fs/bcachefs/compress.c | 638 +++ - fs/bcachefs/compress.h | 18 + + fs/bcachefs/compress.c | 712 +++ + fs/bcachefs/compress.h | 55 + fs/bcachefs/counters.c | 107 + fs/bcachefs/counters.h | 17 + fs/bcachefs/darray.h | 87 + - fs/bcachefs/data_update.c | 564 +++ + fs/bcachefs/data_update.c | 562 +++ fs/bcachefs/data_update.h | 43 + fs/bcachefs/debug.c | 957 ++++ fs/bcachefs/debug.h | 32 + - fs/bcachefs/dirent.c | 564 +++ - fs/bcachefs/dirent.h | 68 + - fs/bcachefs/disk_groups.c | 548 +++ - fs/bcachefs/disk_groups.h | 101 + - fs/bcachefs/ec.c | 1957 ++++++++ - fs/bcachefs/ec.h | 261 ++ + fs/bcachefs/dirent.c | 565 +++ + fs/bcachefs/dirent.h | 70 + + fs/bcachefs/disk_groups.c | 555 +++ + fs/bcachefs/disk_groups.h | 106 + + fs/bcachefs/ec.c | 1960 ++++++++ + fs/bcachefs/ec.h | 263 ++ fs/bcachefs/ec_types.h | 41 + fs/bcachefs/errcode.c | 63 + - fs/bcachefs/errcode.h | 243 + + fs/bcachefs/errcode.h | 246 + fs/bcachefs/error.c | 297 ++ - fs/bcachefs/error.h | 213 + + fs/bcachefs/error.h | 206 + fs/bcachefs/extent_update.c | 173 + fs/bcachefs/extent_update.h | 12 + - fs/bcachefs/extents.c | 1390 ++++++ - fs/bcachefs/extents.h | 758 ++++ + fs/bcachefs/extents.c | 1394 ++++++ + fs/bcachefs/extents.h | 757 ++++ fs/bcachefs/extents_types.h | 40 + fs/bcachefs/eytzinger.h | 281 ++ fs/bcachefs/fifo.h | 127 + fs/bcachefs/fs-common.c | 501 +++ fs/bcachefs/fs-common.h | 43 + - fs/bcachefs/fs-io.c | 3948 +++++++++++++++++ + fs/bcachefs/fs-io.c | 3982 +++++++++++++++++ fs/bcachefs/fs-io.h | 54 + fs/bcachefs/fs-ioctl.c | 556 +++ fs/bcachefs/fs-ioctl.h | 81 + fs/bcachefs/fs.c | 1943 
++++++++ - fs/bcachefs/fs.h | 206 + - fs/bcachefs/fsck.c | 2505 +++++++++++ - fs/bcachefs/fsck.h | 8 + - fs/bcachefs/inode.c | 868 ++++ - fs/bcachefs/inode.h | 192 + + fs/bcachefs/fs.h | 208 + + fs/bcachefs/fsck.c | 2452 ++++++++++ + fs/bcachefs/fsck.h | 14 + + fs/bcachefs/inode.c | 872 ++++ + fs/bcachefs/inode.h | 196 + fs/bcachefs/io.c | 3056 +++++++++++++ fs/bcachefs/io.h | 202 + fs/bcachefs/io_types.h | 165 + - fs/bcachefs/journal.c | 1448 ++++++ + fs/bcachefs/journal.c | 1438 ++++++ fs/bcachefs/journal.h | 526 +++ - fs/bcachefs/journal_io.c | 1858 ++++++++ + fs/bcachefs/journal_io.c | 1863 ++++++++ fs/bcachefs/journal_io.h | 64 + - fs/bcachefs/journal_reclaim.c | 863 ++++ + fs/bcachefs/journal_reclaim.c | 873 ++++ fs/bcachefs/journal_reclaim.h | 86 + fs/bcachefs/journal_sb.c | 219 + fs/bcachefs/journal_sb.h | 24 + @@ -133,10 +147,10 @@ Signed-off-by: Piotr Gorski fs/bcachefs/keylist.h | 74 + fs/bcachefs/keylist_types.h | 16 + fs/bcachefs/lru.c | 178 + - fs/bcachefs/lru.h | 63 + + fs/bcachefs/lru.h | 64 + fs/bcachefs/migrate.c | 182 + fs/bcachefs/migrate.h | 7 + - fs/bcachefs/move.c | 1162 +++++ + fs/bcachefs/move.c | 1168 +++++ fs/bcachefs/move.h | 96 + fs/bcachefs/move_types.h | 36 + fs/bcachefs/movinggc.c | 421 ++ @@ -144,33 +158,33 @@ Signed-off-by: Piotr Gorski fs/bcachefs/nocow_locking.c | 123 + fs/bcachefs/nocow_locking.h | 49 + fs/bcachefs/nocow_locking_types.h | 20 + - fs/bcachefs/opts.c | 550 +++ - fs/bcachefs/opts.h | 542 +++ + fs/bcachefs/opts.c | 592 +++ + fs/bcachefs/opts.h | 563 +++ fs/bcachefs/printbuf.c | 415 ++ fs/bcachefs/printbuf.h | 284 ++ - fs/bcachefs/quota.c | 980 ++++ - fs/bcachefs/quota.h | 72 + + fs/bcachefs/quota.c | 981 ++++ + fs/bcachefs/quota.h | 74 + fs/bcachefs/quota_types.h | 43 + - fs/bcachefs/rebalance.c | 363 ++ + fs/bcachefs/rebalance.c | 364 ++ fs/bcachefs/rebalance.h | 28 + fs/bcachefs/rebalance_types.h | 26 + - fs/bcachefs/recovery.c | 1653 +++++++ - fs/bcachefs/recovery.h | 58 + - fs/bcachefs/reflink.c | 388 ++ - fs/bcachefs/reflink.h | 79 + - fs/bcachefs/replicas.c | 1056 +++++ + fs/bcachefs/recovery.c | 1669 +++++++ + fs/bcachefs/recovery.h | 60 + + fs/bcachefs/reflink.c | 399 ++ + fs/bcachefs/reflink.h | 81 + + fs/bcachefs/replicas.c | 1059 +++++ fs/bcachefs/replicas.h | 91 + fs/bcachefs/replicas_types.h | 27 + fs/bcachefs/seqmutex.h | 48 + fs/bcachefs/siphash.c | 173 + fs/bcachefs/siphash.h | 87 + fs/bcachefs/str_hash.h | 370 ++ - fs/bcachefs/subvolume.c | 1505 +++++++ - fs/bcachefs/subvolume.h | 167 + - fs/bcachefs/subvolume_types.h | 22 + - fs/bcachefs/super-io.c | 1616 +++++++ - fs/bcachefs/super-io.h | 134 + - fs/bcachefs/super.c | 1995 +++++++++ + fs/bcachefs/subvolume.c | 1734 +++++++ + fs/bcachefs/subvolume.h | 251 ++ + fs/bcachefs/subvolume_types.h | 31 + + fs/bcachefs/super-io.c | 1711 +++++++ + fs/bcachefs/super-io.h | 142 + + fs/bcachefs/super.c | 2006 +++++++++ fs/bcachefs/super.h | 266 ++ fs/bcachefs/super_types.h | 51 + fs/bcachefs/sysfs.c | 1064 +++++ @@ -182,12 +196,12 @@ Signed-off-by: Piotr Gorski fs/bcachefs/two_state_shared_lock.c | 8 + fs/bcachefs/two_state_shared_lock.h | 59 + fs/bcachefs/util.c | 1137 +++++ - fs/bcachefs/util.h | 842 ++++ - fs/bcachefs/varint.c | 121 + + fs/bcachefs/util.h | 846 ++++ + fs/bcachefs/varint.c | 122 + fs/bcachefs/varint.h | 11 + fs/bcachefs/vstructs.h | 63 + fs/bcachefs/xattr.c | 648 +++ - fs/bcachefs/xattr.h | 51 + + fs/bcachefs/xattr.h | 50 + fs/dcache.c | 12 +- fs/inode.c | 218 +- fs/iomap/buffered-io.c | 45 +- @@ -195,54 +209,111 @@ Signed-off-by: Piotr Gorski fs/xfs/xfs_iomap.c | 3 + 
fs/xfs/xfs_mount.h | 2 + fs/xfs/xfs_super.c | 6 +- + include/asm-generic/codetag.lds.h | 15 + + include/asm-generic/vmlinux.lds.h | 3 + + include/linux/alloc_tag.h | 160 + include/linux/bio.h | 7 +- include/linux/blkdev.h | 1 + .../md/bcache => include/linux}/closure.h | 46 +- - include/linux/compiler_attributes.h | 5 + + include/linux/codetag.h | 110 + include/linux/dcache.h | 1 + + include/linux/dma-map-ops.h | 2 +- + include/linux/dynamic_fault.h | 79 + include/linux/exportfs.h | 6 + - include/linux/fs.h | 10 +- + include/linux/fortify-string.h | 5 +- + include/linux/fs.h | 16 +- include/linux/generic-radix-tree.h | 68 +- + include/linux/gfp.h | 111 +- + include/linux/gfp_types.h | 101 +- + include/linux/hrtimer.h | 2 +- include/linux/iomap.h | 1 + include/linux/list_bl.h | 22 + include/linux/lockdep.h | 10 + include/linux/lockdep_types.h | 2 +- include/linux/mean_and_variance.h | 198 + - include/linux/sched.h | 3 +- + include/linux/memcontrol.h | 56 +- + include/linux/mempool.h | 73 +- + include/linux/mm.h | 8 + + include/linux/mm_types.h | 4 +- + include/linux/nodemask.h | 2 +- + include/linux/nodemask_types.h | 9 + + include/linux/page_ext.h | 1 - + include/linux/pagemap.h | 9 +- + include/linux/percpu.h | 19 +- + include/linux/pgalloc_tag.h | 105 + + include/linux/prandom.h | 1 - + include/linux/rhashtable-types.h | 9 +- + include/linux/sched.h | 29 +- include/linux/seq_buf.h | 2 + include/linux/shrinker.h | 9 +- include/linux/six.h | 388 ++ - include/linux/string_helpers.h | 4 +- - include/linux/uio.h | 2 + + include/linux/slab.h | 180 +- + include/linux/slab_def.h | 2 +- + include/linux/slub_def.h | 4 +- + include/linux/string.h | 5 +- + include/linux/string_helpers.h | 13 +- + include/linux/time_namespace.h | 2 + + include/linux/vmalloc.h | 60 +- + init/Kconfig | 4 + init/init_task.c | 1 + kernel/Kconfig.locks | 3 + + kernel/dma/mapping.c | 4 +- kernel/locking/Makefile | 1 + kernel/locking/lockdep.c | 46 + + kernel/locking/osq_lock.c | 2 + kernel/locking/six.c | 893 ++++ + kernel/module/main.c | 25 +- kernel/stacktrace.c | 2 + lib/Kconfig | 3 + - lib/Kconfig.debug | 18 + - lib/Makefile | 4 +- + lib/Kconfig.debug | 54 + + lib/Makefile | 9 +- + lib/alloc_tag.c | 225 + {drivers/md/bcache => lib}/closure.c | 36 +- + lib/codetag.c | 393 ++ + lib/dynamic_fault.c | 371 ++ lib/errname.c | 1 + lib/generic-radix-tree.c | 76 +- - lib/iov_iter.c | 53 +- + lib/iov_iter.c | 43 +- lib/math/Kconfig | 3 + lib/math/Makefile | 2 + lib/math/mean_and_variance.c | 158 + lib/math/mean_and_variance_test.c | 239 + - lib/rhashtable.c | 9 +- + lib/rhashtable.c | 42 +- lib/seq_buf.c | 10 + - lib/string_helpers.c | 8 +- + lib/string.c | 19 + + lib/string_helpers.c | 26 +- + lib/test-string_helpers.c | 4 +- mm/Makefile | 2 +- + mm/compaction.c | 10 +- + mm/filemap.c | 6 +- + mm/huge_memory.c | 2 + + mm/hugetlb.c | 8 +- + mm/kfence/core.c | 14 +- + mm/kfence/kfence.h | 4 +- + mm/madvise.c | 61 + + mm/memcontrol.c | 56 +- + mm/mempolicy.c | 42 +- + mm/mempool.c | 34 +- + mm/mm_init.c | 1 + mm/oom_kill.c | 23 - - {lib => mm}/show_mem.c | 23 + - mm/slab.h | 6 +- - mm/slab_common.c | 52 +- + mm/page_alloc.c | 66 +- + mm/page_ext.c | 13 + + mm/page_owner.c | 2 +- + mm/percpu-internal.h | 26 +- + mm/percpu.c | 120 +- + {lib => mm}/show_mem.c | 37 + + mm/slab.c | 24 +- + mm/slab.h | 252 +- + mm/slab_common.c | 148 +- + mm/slub.c | 26 +- + mm/util.c | 44 +- + mm/vmalloc.c | 88 +- mm/vmscan.c | 99 +- scripts/Kbuild.include | 10 + scripts/Makefile.lib | 2 +- - 237 files changed, 93049 insertions(+), 249 deletions(-) + 
scripts/kallsyms.c | 13 +
+ scripts/module.lds.S | 7 +
+ 308 files changed, 96643 insertions(+), 930 deletions(-)
 create mode 100644 fs/bcachefs/Kconfig
 create mode 100644 fs/bcachefs/Makefile
 create mode 100644 fs/bcachefs/acl.c
@@ -413,17 +484,106 @@ Signed-off-by: Piotr Gorski
 create mode 100644 fs/bcachefs/vstructs.h
 create mode 100644 fs/bcachefs/xattr.c
 create mode 100644 fs/bcachefs/xattr.h
+ create mode 100644 include/asm-generic/codetag.lds.h
+ create mode 100644 include/linux/alloc_tag.h
 rename {drivers/md/bcache => include/linux}/closure.h (93%)
+ create mode 100644 include/linux/codetag.h
+ create mode 100644 include/linux/dynamic_fault.h
 create mode 100644 include/linux/mean_and_variance.h
+ create mode 100644 include/linux/nodemask_types.h
+ create mode 100644 include/linux/pgalloc_tag.h
 create mode 100644 include/linux/six.h
 create mode 100644 kernel/locking/six.c
+ create mode 100644 lib/alloc_tag.c
 rename {drivers/md/bcache => lib}/closure.c (88%)
+ create mode 100644 lib/codetag.c
+ create mode 100644 lib/dynamic_fault.c
 create mode 100644 lib/math/mean_and_variance.c
 create mode 100644 lib/math/mean_and_variance_test.c
- rename {lib => mm}/show_mem.c (69%)
+ rename {lib => mm}/show_mem.c (57%)
+diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
+index 45ba1f4dc..0a012ac13 100644
+--- a/Documentation/admin-guide/sysctl/vm.rst
++++ b/Documentation/admin-guide/sysctl/vm.rst
+@@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm:
+ - legacy_va_layout
+ - lowmem_reserve_ratio
+ - max_map_count
++- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y)
+ - memory_failure_early_kill
+ - memory_failure_recovery
+ - min_free_kbytes
+@@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation.
+ The default value is 65530.
+
+
++mem_profiling
++==============
++
++Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y)
++
++1: Enable memory profiling.
++
++0: Disable memory profiling.
++
++Enabling memory profiling introduces a small performance overhead for all
++memory allocations.
++
++The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT.
++
++
+ memory_failure_early_kill:
+ ==========================
+
+diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
+index 7897a7daf..810f851e6 100644
+--- a/Documentation/filesystems/proc.rst
++++ b/Documentation/filesystems/proc.rst
+@@ -683,6 +683,7 @@ files are there, and which are missing.
+ ============ ===============================================================
+ File Content
+ ============ ===============================================================
++ allocinfo Memory allocations profiling information
+ apm Advanced power management info
+ buddyinfo Kernel memory allocator information (see text) (2.5)
+ bus Directory containing bus specific information
+@@ -942,6 +943,33 @@ also be allocatable although a lot of filesystem metadata may have to be
+ reclaimed to achieve this.
+
+
++allocinfo
++~~~~~~~~~
++
++Provides information about memory allocations at all locations in the code
++base. Each allocation in the code is identified by its source file, line
++number, module and the function calling the allocation. The number of bytes
++allocated at each location is reported.
++
++Example output.
++ ++:: ++ ++ > cat /proc/allocinfo ++ ++ 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page ++ 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order ++ 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts ++ 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact ++ 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one ++ 1.16MiB fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc ++ 1.00MiB mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare ++ 734KiB fs/xfs/kmem.c:20 module:xfs func:kmem_alloc ++ 640KiB kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func ++ 640KiB drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf ++ ... ++ ++ + meminfo + ~~~~~~~ + diff --git a/MAINTAINERS b/MAINTAINERS -index 35e195946..e1ac2fc50 100644 +index 35e195946..48763cc35 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3522,6 +3522,13 @@ W: http://bcache.evilpiepirate.org @@ -455,7 +615,21 @@ index 35e195946..e1ac2fc50 100644 CMPC ACPI DRIVER M: Thadeu Lima de Souza Cascardo M: Daniel Oliveira Nascimento -@@ -8662,6 +8677,13 @@ F: Documentation/devicetree/bindings/power/power?domain* +@@ -5114,6 +5129,13 @@ S: Supported + F: Documentation/process/code-of-conduct-interpretation.rst + F: Documentation/process/code-of-conduct.rst + ++CODE TAGGING ++M: Suren Baghdasaryan ++M: Kent Overstreet ++S: Maintained ++F: include/linux/codetag.h ++F: lib/codetag.c ++ + COMEDI DRIVERS + M: Ian Abbott + M: H Hartley Sweeten +@@ -8662,6 +8684,13 @@ F: Documentation/devicetree/bindings/power/power?domain* F: drivers/base/power/domain*.c F: include/linux/pm_domain.h @@ -469,7 +643,7 @@ index 35e195946..e1ac2fc50 100644 GENERIC RESISTIVE TOUCHSCREEN ADC DRIVER M: Eugen Hristev L: linux-input@vger.kernel.org -@@ -12850,6 +12872,15 @@ F: Documentation/devicetree/bindings/net/ieee802154/mcr20a.txt +@@ -12850,6 +12879,15 @@ F: Documentation/devicetree/bindings/net/ieee802154/mcr20a.txt F: drivers/net/ieee802154/mcr20a.c F: drivers/net/ieee802154/mcr20a.h @@ -485,7 +659,23 @@ index 35e195946..e1ac2fc50 100644 MEASUREMENT COMPUTING CIO-DAC IIO DRIVER M: William Breathitt Gray L: linux-iio@vger.kernel.org -@@ -19376,6 +19407,14 @@ S: Maintained +@@ -13489,6 +13527,15 @@ F: mm/memblock.c + F: mm/mm_init.c + F: tools/testing/memblock/ + ++MEMORY ALLOCATION PROFILING ++M: Suren Baghdasaryan ++M: Kent Overstreet ++S: Maintained ++F: include/linux/alloc_tag.h ++F: include/linux/codetag_ctx.h ++F: lib/alloc_tag.c ++F: lib/pgalloc_tag.c ++ + MEMORY CONTROLLER DRIVERS + M: Krzysztof Kozlowski + L: linux-kernel@vger.kernel.org +@@ -19376,6 +19423,14 @@ S: Maintained W: http://www.winischhofer.at/linuxsisusbvga.shtml F: drivers/usb/misc/sisusbvga/ @@ -500,6 +690,47 @@ index 35e195946..e1ac2fc50 100644 SL28 CPLD MFD DRIVER M: Michael Walle S: Maintained +diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h +index db7b371b3..31823d971 100644 +--- a/arch/arm64/include/asm/spectre.h ++++ b/arch/arm64/include/asm/spectre.h +@@ -13,8 +13,8 @@ + #define __BP_HARDEN_HYP_VECS_SZ ((BP_HARDEN_EL2_SLOTS - 1) * SZ_2K) + + #ifndef __ASSEMBLY__ +- +-#include ++#include ++#include + + #include + #include +diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c +index 2297aa764..4f8d43b74 100644 +--- a/arch/powerpc/mm/book3s64/radix_pgtable.c ++++ b/arch/powerpc/mm/book3s64/radix_pgtable.c +@@ -261,7 +261,7 @@ print_mapping(unsigned long start, unsigned long end, unsigned long 
size, bool e + if (end <= start) + return; + +- string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); ++ string_get_size(size, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); + + pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, + exec ? " (exec)" : ""); +diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c +index 56a917df4..842a0ec5e 100644 +--- a/arch/x86/kernel/amd_gart_64.c ++++ b/arch/x86/kernel/amd_gart_64.c +@@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = { + .get_sgtable = dma_common_get_sgtable, + .dma_supported = dma_direct_supported, + .get_required_mask = dma_direct_get_required_mask, +- .alloc_pages = dma_direct_alloc_pages, ++ .alloc_pages_op = dma_direct_alloc_pages, + .free_pages = dma_direct_free_pages, + }; + diff --git a/block/bdev.c b/block/bdev.c index 21c63bfef..a4d7e8732 100644 --- a/block/bdev.c @@ -601,6 +832,48 @@ index 45547bcf1..f20f9ca03 100644 bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); +diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c +index b47358da9..be10661f1 100644 +--- a/drivers/block/virtio_blk.c ++++ b/drivers/block/virtio_blk.c +@@ -990,9 +990,9 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) + nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9); + + string_get_size(nblocks, queue_logical_block_size(q), +- STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); ++ STRING_SIZE_BASE2, cap_str_2, sizeof(cap_str_2)); + string_get_size(nblocks, queue_logical_block_size(q), +- STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); ++ 0, cap_str_10, sizeof(cap_str_10)); + + dev_notice(&vdev->dev, + "[%s] %s%llu %d-byte logical blocks (%s/%s)\n", +diff --git a/drivers/gpu/drm/gud/gud_drv.c b/drivers/gpu/drm/gud/gud_drv.c +index 9d7bf8ee4..6b1748e1f 100644 +--- a/drivers/gpu/drm/gud/gud_drv.c ++++ b/drivers/gpu/drm/gud/gud_drv.c +@@ -329,7 +329,7 @@ static int gud_stats_debugfs(struct seq_file *m, void *data) + struct gud_device *gdrm = to_gud_device(entry->dev); + char buf[10]; + +- string_get_size(gdrm->bulk_len, 1, STRING_UNITS_2, buf, sizeof(buf)); ++ string_get_size(gdrm->bulk_len, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); + seq_printf(m, "Max buffer size: %s\n", buf); + seq_printf(m, "Number of errors: %u\n", gdrm->stats_num_errors); + +diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c +index 7a9f0b0bd..76a9d5ca4 100644 +--- a/drivers/iommu/dma-iommu.c ++++ b/drivers/iommu/dma-iommu.c +@@ -1556,7 +1556,7 @@ static const struct dma_map_ops iommu_dma_ops = { + .flags = DMA_F_PCI_P2PDMA_SUPPORTED, + .alloc = iommu_dma_alloc, + .free = iommu_dma_free, +- .alloc_pages = dma_common_alloc_pages, ++ .alloc_pages_op = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, + .alloc_noncontiguous = iommu_dma_alloc_noncontiguous, + .free_noncontiguous = iommu_dma_free_noncontiguous, diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 529c9d04e..b2d10063d 100644 --- a/drivers/md/bcache/Kconfig @@ -695,6 +968,117 @@ index 6f3cb7c92..f61ab1bad 100644 struct closure; #ifdef CONFIG_BCACHE_DEBUG +diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c +index d920c4178..ae9ab7816 100644 +--- a/drivers/mmc/core/block.c ++++ b/drivers/mmc/core/block.c +@@ -2503,7 +2503,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, + + blk_queue_write_cache(md->queue.queue, cache_enabled, fua_enabled); + +- string_get_size((u64)size, 512, 
STRING_UNITS_2, ++ string_get_size((u64)size, 512, STRING_SIZE_BASE2, + cap_str, sizeof(cap_str)); + pr_info("%s: %s %s %s %s\n", + md->disk->disk_name, mmc_card_id(card), mmc_card_name(card), +@@ -2699,7 +2699,7 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, + + list_add(&rpmb->node, &md->rpmbs); + +- string_get_size((u64)size, 512, STRING_UNITS_2, ++ string_get_size((u64)size, 512, STRING_SIZE_BASE2, + cap_str, sizeof(cap_str)); + + pr_info("%s: %s %s %s, chardev (%d:%d)\n", +diff --git a/drivers/mtd/spi-nor/debugfs.c b/drivers/mtd/spi-nor/debugfs.c +index e11536fff..9f1ea83e2 100644 +--- a/drivers/mtd/spi-nor/debugfs.c ++++ b/drivers/mtd/spi-nor/debugfs.c +@@ -84,7 +84,7 @@ static int spi_nor_params_show(struct seq_file *s, void *data) + + seq_printf(s, "name\t\t%s\n", info->name); + seq_printf(s, "id\t\t%*ph\n", SPI_NOR_MAX_ID_LEN, nor->id); +- string_get_size(params->size, 1, STRING_UNITS_2, buf, sizeof(buf)); ++ string_get_size(params->size, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); + seq_printf(s, "size\t\t%s\n", buf); + seq_printf(s, "write size\t%u\n", params->writesize); + seq_printf(s, "page size\t%u\n", params->page_size); +@@ -129,14 +129,14 @@ static int spi_nor_params_show(struct seq_file *s, void *data) + struct spi_nor_erase_type *et = &erase_map->erase_type[i]; + + if (et->size) { +- string_get_size(et->size, 1, STRING_UNITS_2, buf, ++ string_get_size(et->size, 1, STRING_SIZE_BASE2, buf, + sizeof(buf)); + seq_printf(s, " %02x (%s) [%d]\n", et->opcode, buf, i); + } + } + + if (!(nor->flags & SNOR_F_NO_OP_CHIP_ERASE)) { +- string_get_size(params->size, 1, STRING_UNITS_2, buf, sizeof(buf)); ++ string_get_size(params->size, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); + seq_printf(s, " %02x (%s)\n", SPINOR_OP_CHIP_ERASE, buf); + } + +diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +index 14e0d989c..7d5fbebd3 100644 +--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c ++++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +@@ -3457,8 +3457,8 @@ static void mem_region_show(struct seq_file *seq, const char *name, + { + char buf[40]; + +- string_get_size((u64)to - from + 1, 1, STRING_UNITS_2, buf, +- sizeof(buf)); ++ string_get_size((u64)to - from + 1, 1, STRING_SIZE_BASE2, ++ buf, sizeof(buf)); + seq_printf(seq, "%-15s %#x-%#x [%s]\n", name, from, to, buf); + } + +diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c +index 1624d528a..bf0a1907b 100644 +--- a/drivers/scsi/sd.c ++++ b/drivers/scsi/sd.c +@@ -2580,10 +2580,10 @@ sd_print_capacity(struct scsi_disk *sdkp, + if (!sdkp->first_scan && old_capacity == sdkp->capacity) + return; + +- string_get_size(sdkp->capacity, sector_size, +- STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); +- string_get_size(sdkp->capacity, sector_size, +- STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); ++ string_get_size(sdkp->capacity, sector_size, STRING_SIZE_BASE2, ++ cap_str_2, sizeof(cap_str_2)); ++ string_get_size(sdkp->capacity, sector_size, 0, ++ cap_str_10, sizeof(cap_str_10)); + + sd_printk(KERN_NOTICE, sdkp, + "%llu %d-byte logical blocks: (%s/%s)\n", +diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c +index 9784a77fa..6c7d984f1 100644 +--- a/drivers/xen/grant-dma-ops.c ++++ b/drivers/xen/grant-dma-ops.c +@@ -282,7 +282,7 @@ static int xen_grant_dma_supported(struct device *dev, u64 mask) + static const struct dma_map_ops xen_grant_dma_ops = { + .alloc = xen_grant_dma_alloc, + .free = xen_grant_dma_free, +- .alloc_pages = 
xen_grant_dma_alloc_pages, ++ .alloc_pages_op = xen_grant_dma_alloc_pages, + .free_pages = xen_grant_dma_free_pages, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, +diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c +index 67aa74d20..5ab261615 100644 +--- a/drivers/xen/swiotlb-xen.c ++++ b/drivers/xen/swiotlb-xen.c +@@ -403,6 +403,6 @@ const struct dma_map_ops xen_swiotlb_dma_ops = { + .dma_supported = xen_swiotlb_dma_supported, + .mmap = dma_common_mmap, + .get_sgtable = dma_common_get_sgtable, +- .alloc_pages = dma_common_alloc_pages, ++ .alloc_pages_op = dma_common_alloc_pages, + .free_pages = dma_common_free_pages, + }; diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec7..b05c45f63 100644 --- a/fs/Kconfig @@ -846,13 +1230,13 @@ index b0b17bd09..b3e14a9fe 100644 diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 -index 000000000..28b585223 +index 000000000..6c698b3b3 --- /dev/null +++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,75 @@ +@@ -0,0 +1,77 @@ + +config BCACHEFS_FS -+ tristate "bcachefs filesystem support" ++ tristate "bcachefs filesystem support (EXPERIMENTAL)" + depends on BLOCK + select EXPORTFS + select CLOSURES @@ -861,6 +1245,8 @@ index 000000000..28b585223 + select FS_POSIX_ACL + select LZ4_COMPRESS + select LZ4_DECOMPRESS ++ select LZ4HC_COMPRESS ++ select LZ4HC_DECOMPRESS + select ZLIB_DEFLATE + select ZLIB_INFLATE + select ZSTD_COMPRESS @@ -1007,10 +1393,10 @@ index 000000000..a71956048 +bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c new file mode 100644 -index 000000000..ce7a460fb +index 000000000..b1a488860 --- /dev/null +++ b/fs/bcachefs/acl.c -@@ -0,0 +1,414 @@ +@@ -0,0 +1,412 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_POSIX_ACL + @@ -1238,6 +1624,7 @@ index 000000000..ce7a460fb + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); ++ struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bkey_s_c_xattr xattr; @@ -1250,9 +1637,7 @@ index 000000000..ce7a460fb + bch2_trans_begin(&trans); + + ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, -+ &hash, inode_inum(inode), -+ &X_SEARCH(acl_to_xattr_type(type), "", 0), -+ 0); ++ &hash, inode_inum(inode), &search, 0); + if (ret) { + if (!bch2_err_matches(ret, ENOENT)) + acl = ERR_PTR(ret); @@ -1377,6 +1762,7 @@ index 000000000..ce7a460fb + struct posix_acl **new_acl) +{ + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); ++ struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); + struct btree_iter iter; + struct bkey_s_c_xattr xattr; + struct bkey_i_xattr *new; @@ -1385,9 +1771,7 @@ index 000000000..ce7a460fb + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, -+ &hash_info, inum, -+ &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), -+ BTREE_ITER_INTENT); ++ &hash_info, inum, &search, BTREE_ITER_INTENT); + if (ret) + return bch2_err_matches(ret, ENOENT) ? 
0 : ret; + @@ -1491,10 +1875,10 @@ index 000000000..bb21d8d69 +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 -index 000000000..7bd472ae5 +index 000000000..8d8481fc1 --- /dev/null +++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,2218 @@ +@@ -0,0 +1,2209 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -1720,7 +2104,8 @@ index 000000000..7bd472ae5 +} + +int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + @@ -1735,7 +2120,8 @@ index 000000000..7bd472ae5 +} + +int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_alloc_unpacked u; + @@ -1748,7 +2134,8 @@ index 000000000..7bd472ae5 +} + +int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_alloc_unpacked u; + @@ -1779,8 +2166,8 @@ index 000000000..7bd472ae5 + } + + if (rw == WRITE && -+ !(flags & BKEY_INVALID_FROM_JOURNAL) && -+ test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { ++ !(flags & BKEY_INVALID_JOURNAL) && ++ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) { + unsigned i, bp_len = 0; + + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) @@ -1830,7 +2217,7 @@ index 000000000..7bd472ae5 + } + + if (!a.v->io_time[READ] && -+ test_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags)) { ++ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { + prt_printf(err, "cached bucket with read_time == 0"); + return -BCH_ERR_invalid_bkey; + } @@ -2045,40 +2432,6 @@ index 000000000..7bd472ae5 + return ERR_PTR(ret); +} + -+int bch2_alloc_read(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter iter; -+ struct bkey_s_c k; -+ struct bch_alloc_v4 a; -+ struct bch_dev *ca; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ /* -+ * Not a fsck error because this is checked/repaired by -+ * bch2_check_alloc_key() which runs later: -+ */ -+ if (!bch2_dev_bucket_exists(c, k.k->p)) -+ continue; -+ -+ ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ -+ *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; -+ } -+ bch2_trans_iter_exit(&trans, &iter); -+ -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ bch_err_fn(c, ret); -+ -+ return ret; -+} -+ +static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) +{ + *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; @@ -2102,7 +2455,8 @@ index 000000000..7bd472ae5 +} + +int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { + prt_printf(err, "bad val size (%lu != %zu)", @@ -2185,45 +2539,67 @@ index 000000000..7bd472ae5 + return ret; +} + -+int bch2_bucket_gens_read(struct bch_fs *c) ++int bch2_alloc_read(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; -+ const struct bch_bucket_gens *g; + struct bch_dev *ca; -+ u64 b; + int ret; + ++ down_read(&c->gc_lock); + 
bch2_trans_init(&trans, c, 0, 0); + -+ for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; -+ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; ++ if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { ++ const struct bch_bucket_gens *g; ++ u64 b; + -+ if (k.k->type != KEY_TYPE_bucket_gens) -+ continue; ++ for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; ++ u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + -+ g = bkey_s_c_to_bucket_gens(k).v; ++ if (k.k->type != KEY_TYPE_bucket_gens) ++ continue; + -+ /* -+ * Not a fsck error because this is checked/repaired by -+ * bch2_check_alloc_key() which runs later: -+ */ -+ if (!bch2_dev_exists2(c, k.k->p.inode)) -+ continue; ++ g = bkey_s_c_to_bucket_gens(k).v; + -+ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ /* ++ * Not a fsck error because this is checked/repaired by ++ * bch2_check_alloc_key() which runs later: ++ */ ++ if (!bch2_dev_exists2(c, k.k->p.inode)) ++ continue; + -+ for (b = max_t(u64, ca->mi.first_bucket, start); -+ b < min_t(u64, ca->mi.nbuckets, end); -+ b++) -+ *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ for (b = max_t(u64, ca->mi.first_bucket, start); ++ b < min_t(u64, ca->mi.nbuckets, end); ++ b++) ++ *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; ++ } ++ bch2_trans_iter_exit(&trans, &iter); ++ } else { ++ struct bch_alloc_v4 a; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ /* ++ * Not a fsck error because this is checked/repaired by ++ * bch2_check_alloc_key() which runs later: ++ */ ++ if (!bch2_dev_bucket_exists(c, k.k->p)) ++ continue; ++ ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ *bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; ++ } ++ bch2_trans_iter_exit(&trans, &iter); + } -+ bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); ++ up_read(&c->gc_lock); + + if (ret) + bch_err_fn(c, ret); @@ -2282,7 +2658,7 @@ index 000000000..7bd472ae5 + return ret; + + if (ca->mi.freespace_initialized && -+ test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags) && ++ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && + bch2_trans_inconsistent_on(old.k->type != old_type, trans, + "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" + " for %s", @@ -2426,7 +2802,7 @@ index 000000000..7bd472ae5 + * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for + * extents style btrees, but works on non-extents btrees: + */ -+struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) ++static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) +{ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + @@ -2497,7 +2873,7 @@ index 000000000..7bd472ae5 + return ca != NULL; +} + -+struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) ++static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) +{ + struct bch_fs *c = iter->trans->c; + struct bkey_s_c k; @@ -2725,8 +3101,7 @@ index 000000000..7bd472ae5 + unsigned i, gens_offset, gens_end_offset; + int ret; + -+ if (c->sb.version < 
bcachefs_metadata_version_bucket_gens && -+ !c->opts.version_upgrade) ++ if (c->sb.version < bcachefs_metadata_version_bucket_gens) + return 0; + + bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); @@ -3169,7 +3544,7 @@ index 000000000..7bd472ae5 + } + + if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { -+ if (test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { ++ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + bch2_trans_inconsistent(trans, + "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" + "%s", @@ -3182,7 +3557,7 @@ index 000000000..7bd472ae5 + } + + if (a->v.data_type != BCH_DATA_need_discard) { -+ if (test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { ++ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + bch2_trans_inconsistent(trans, + "bucket incorrectly set in need_discard btree\n" + "%s", @@ -3350,7 +3725,7 @@ index 000000000..7bd472ae5 + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + + bch_err(c, "%s", buf.buf); -+ if (test_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags)) { ++ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { + bch2_inconsistent_error(c); + ret = -EINVAL; + } @@ -3715,10 +4090,10 @@ index 000000000..7bd472ae5 +} diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 -index 000000000..3c4d6d40b +index 000000000..c0914feb5 --- /dev/null +++ b/fs/bcachefs/alloc_background.h -@@ -0,0 +1,251 @@ +@@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_H +#define _BCACHEFS_ALLOC_BACKGROUND_H @@ -3729,6 +4104,8 @@ index 000000000..3c4d6d40b +#include "debug.h" +#include "super.h" + ++enum bkey_invalid_flags; ++ +/* How out of date a pointer gen is allowed to be: */ +#define BUCKET_GC_GEN_MAX 96U + @@ -3868,10 +4245,14 @@ index 000000000..3c4d6d40b + +int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + -+int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -+int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -+int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -+int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); ++int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); ++int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); ++int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); ++int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_alloc_v4_swab(struct bkey_s); +void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + @@ -3908,7 +4289,8 @@ index 000000000..3c4d6d40b + .min_val_size = 48, \ +}) + -+int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); ++int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ @@ -3926,7 +4308,6 @@ index 000000000..3c4d6d40b +} + +int bch2_alloc_read(struct bch_fs *); -+int bch2_bucket_gens_read(struct bch_fs *); + +int bch2_trans_mark_alloc(struct 
btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); @@ -3972,10 +4353,10 @@ index 000000000..3c4d6d40b +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 -index 000000000..6650c0001 +index 000000000..fcb7311b1 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,1535 @@ +@@ -0,0 +1,1536 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. @@ -4170,6 +4551,7 @@ index 000000000..6650c0001 +{ + switch (watermark) { + case BCH_WATERMARK_reclaim: ++ return 0; + case BCH_WATERMARK_btree: + case BCH_WATERMARK_btree_copygc: + return OPEN_BUCKETS_COUNT / 4; @@ -4301,7 +4683,7 @@ index 000000000..6650c0001 + a = bch2_alloc_to_v4(k, &a_convert); + + if (a->data_type != BCH_DATA_free) { -+ if (!test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { ++ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + ob = NULL; + goto err; + } @@ -4317,7 +4699,7 @@ index 000000000..6650c0001 + } + + if (genbits != (alloc_freespace_genbits(*a) >> 56) && -+ test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { ++ c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" + " freespace key ", + genbits, alloc_freespace_genbits(*a) >> 56); @@ -4327,10 +4709,9 @@ index 000000000..6650c0001 + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; -+ + } + -+ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { ++ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { + struct bch_backpointer bp; + struct bpos bp_pos = POS_MIN; + @@ -4533,7 +4914,7 @@ index 000000000..6650c0001 + if (s.skipped_need_journal_commit * 2 > avail) + bch2_journal_flush_async(&c->journal, NULL); + -+ if (!ob && freespace && !test_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags)) { ++ if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + freespace = false; + goto alloc; + } @@ -5171,6 +5552,7 @@ index 000000000..6650c0001 + bch2_trans_mutex_lock_norelock(trans, &wp->lock); + open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_free_unused(c, ob); ++ wp->ptrs.nr = 0; + mutex_unlock(&wp->lock); + return true; +} @@ -5875,10 +6257,10 @@ index 000000000..c33a29954 +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c new file mode 100644 -index 000000000..a270ff96e +index 000000000..d412bae55 --- /dev/null +++ b/fs/bcachefs/backpointers.c -@@ -0,0 +1,886 @@ +@@ -0,0 +1,889 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bbpos.h" @@ -5919,7 +6301,8 @@ index 000000000..a270ff96e +} + +int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); @@ -5984,7 +6367,7 @@ index 000000000..a270ff96e + bch2_bkey_val_to_text(&buf, c, orig_k); + + bch_err(c, "%s", buf.buf); -+ } else if (test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { ++ } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + prt_printf(&buf, "backpointer not found when deleting"); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); @@ -6005,7 +6388,7 @@ index 000000000..a270ff96e + + printbuf_exit(&buf); + -+ if 
(test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) { ++ if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + bch2_inconsistent_error(c); + return -EIO; + } else { @@ -6138,7 +6521,7 @@ index 000000000..a270ff96e + bch2_backpointer_to_text(&buf, &bp); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); -+ if (!test_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags)) ++ if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) + bch_err_ratelimited(c, "%s", buf.buf); + else + bch2_trans_inconsistent(trans, "%s", buf.buf); @@ -6153,6 +6536,7 @@ index 000000000..a270ff96e + unsigned iter_flags) +{ + struct bch_fs *c = trans->c; ++ struct btree_root *r = bch2_btree_id_root(c, bp.btree_id); + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + struct bkey_s_c k; + @@ -6160,7 +6544,7 @@ index 000000000..a270ff96e + bp.btree_id, + bp.pos, + 0, -+ min(bp.level, c->btree_roots[bp.btree_id].level), ++ min(bp.level, r->level), + iter_flags); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { @@ -6168,8 +6552,8 @@ index 000000000..a270ff96e + return k; + } + -+ if (bp.level == c->btree_roots[bp.btree_id].level + 1) -+ k = bkey_i_to_s_c(&c->btree_roots[bp.btree_id].key); ++ if (bp.level == r->level + 1) ++ k = bkey_i_to_s_c(&r->key); + + if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + return k; @@ -6412,6 +6796,7 @@ index 000000000..a270ff96e + struct bpos_level *last_flushed) +{ + struct bch_fs *c = trans->c; ++ struct btree_root *r = bch2_btree_id_root(c, btree_id); + struct btree_iter iter; + struct btree *b; + struct bkey_s_c k; @@ -6420,8 +6805,7 @@ index 000000000..a270ff96e + const union bch_extent_entry *entry; + int ret; + -+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, -+ c->btree_roots[btree_id].level, 0); ++ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0); + b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) @@ -6470,10 +6854,10 @@ index 000000000..a270ff96e + return div_u64(mem_bytes >> 1, btree_bytes(c)); +} + -+int bch2_get_btree_in_memory_pos(struct btree_trans *trans, -+ unsigned btree_leaf_mask, -+ unsigned btree_interior_mask, -+ struct bbpos start, struct bbpos *end) ++static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, ++ unsigned btree_leaf_mask, ++ unsigned btree_interior_mask, ++ struct bbpos start, struct bbpos *end) +{ + struct btree_iter iter; + struct bkey_s_c k; @@ -6521,12 +6905,13 @@ index 000000000..a270ff96e + struct bpos bucket_start, + struct bpos bucket_end) +{ ++ struct bch_fs *c = trans->c; + struct btree_iter iter; + enum btree_id btree_id; + struct bpos_level last_flushed = { UINT_MAX }; + int ret = 0; + -+ for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { ++ for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { + unsigned depth = btree_type_has_ptrs(btree_id) ? 
0 : 1; + + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, @@ -6570,8 +6955,8 @@ index 000000000..a270ff96e + : bucket; +} + -+int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, -+ struct bpos start, struct bpos *end) ++static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, ++ struct bpos start, struct bpos *end) +{ + struct btree_iter alloc_iter; + struct btree_iter bp_iter; @@ -6767,7 +7152,7 @@ index 000000000..a270ff96e +} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h new file mode 100644 -index 000000000..c52954e2e +index 000000000..87e31aa19 --- /dev/null +++ b/fs/bcachefs/backpointers.h @@ -0,0 +1,131 @@ @@ -6781,7 +7166,7 @@ index 000000000..c52954e2e +#include "super.h" + +int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, -+ unsigned, struct printbuf *); ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); +void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_backpointer_swab(struct bkey_s); @@ -6857,7 +7242,7 @@ index 000000000..c52954e2e + set_bkey_val_u64s(&bp_k->k, 0); + } + -+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i, !insert); ++ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); +} + +static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, @@ -6958,10 +7343,10 @@ index 000000000..1fbed1f83 +#endif /* _BCACHEFS_BBPOS_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 -index 000000000..b8d50fe64 +index 000000000..445d010c8 --- /dev/null +++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,1139 @@ +@@ -0,0 +1,1185 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H @@ -7528,11 +7913,6 @@ index 000000000..b8d50fe64 + + /* fsck passes: */ + BCH_FS_TOPOLOGY_REPAIR_DONE, -+ BCH_FS_INITIAL_GC_DONE, /* kill when we enumerate fsck passes */ -+ BCH_FS_CHECK_ALLOC_DONE, -+ BCH_FS_CHECK_LRUS_DONE, -+ BCH_FS_CHECK_BACKPOINTERS_DONE, -+ BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ + BCH_FS_NEED_ANOTHER_GC, @@ -7625,6 +8005,47 @@ index 000000000..b8d50fe64 + BCH_WRITE_REF_NR, +}; + ++#define PASS_SILENT BIT(0) ++#define PASS_FSCK BIT(1) ++#define PASS_UNCLEAN BIT(2) ++#define PASS_ALWAYS BIT(3) ++ ++#define BCH_RECOVERY_PASSES() \ ++ x(alloc_read, PASS_ALWAYS) \ ++ x(stripes_read, PASS_ALWAYS) \ ++ x(initialize_subvolumes, 0) \ ++ x(snapshots_read, PASS_ALWAYS) \ ++ x(check_allocations, PASS_FSCK) \ ++ x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ ++ x(journal_replay, PASS_ALWAYS) \ ++ x(check_alloc_info, PASS_FSCK) \ ++ x(check_lrus, PASS_FSCK) \ ++ x(check_btree_backpointers, PASS_FSCK) \ ++ x(check_backpointers_to_extents,PASS_FSCK) \ ++ x(check_extents_to_backpointers,PASS_FSCK) \ ++ x(check_alloc_to_lru_refs, PASS_FSCK) \ ++ x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ ++ x(bucket_gens_init, 0) \ ++ x(fs_upgrade_for_subvolumes, 0) \ ++ x(check_snapshot_trees, PASS_FSCK) \ ++ x(check_snapshots, PASS_FSCK) \ ++ x(check_subvols, PASS_FSCK) \ ++ x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN|PASS_SILENT) \ ++ x(check_inodes, PASS_FSCK|PASS_UNCLEAN) \ ++ x(check_extents, PASS_FSCK) \ ++ x(check_dirents, PASS_FSCK) \ ++ x(check_xattrs, PASS_FSCK) \ ++ x(check_root, PASS_FSCK) \ ++ x(check_directory_structure, PASS_FSCK) \ ++ x(check_nlinks, PASS_FSCK) \ ++ 
x(fix_reflink_p, 0) \ ++ ++enum bch_recovery_pass { ++#define x(n, when) BCH_RECOVERY_PASS_##n, ++ BCH_RECOVERY_PASSES() ++#undef x ++}; ++ +struct bch_fs { + struct closure cl; + @@ -7676,6 +8097,7 @@ index 000000000..b8d50fe64 + + u16 version; + u16 version_min; ++ u16 version_upgrade_complete; + + u8 nr_devices; + u8 clean; @@ -7701,9 +8123,10 @@ index 000000000..b8d50fe64 + struct mutex sb_lock; + + /* snapshot.c: */ -+ GENRADIX(struct snapshot_t) snapshots; -+ struct bch_snapshot_table __rcu *snapshot_table; ++ struct snapshot_table __rcu *snapshots; ++ size_t snapshot_table_size; + struct mutex snapshot_table_lock; ++ + struct work_struct snapshot_delete_work; + struct work_struct snapshot_wait_for_pagecache_and_delete_work; + snapshot_id_list snapshots_unlinked; @@ -7713,7 +8136,8 @@ index 000000000..b8d50fe64 + struct bio_set btree_bio; + struct workqueue_struct *io_complete_wq; + -+ struct btree_root btree_roots[BTREE_ID_NR]; ++ struct btree_root btree_roots_known[BTREE_ID_NR]; ++ DARRAY(struct btree_root) btree_roots_extra; + struct mutex btree_root_lock; + + struct btree_cache btree_cache; @@ -7954,6 +8378,13 @@ index 000000000..b8d50fe64 + /* QUOTAS */ + struct bch_memquota_type quotas[QTYP_NR]; + ++ /* RECOVERY */ ++ u64 journal_replay_seq_start; ++ u64 journal_replay_seq_end; ++ enum bch_recovery_pass curr_recovery_pass; ++ /* bitmap of explicitly enabled recovery passes: */ ++ u64 recovery_passes_explicit; ++ + /* DEBUG JUNK */ + struct dentry *fs_debug_dir; + struct dentry *btree_debug_dir; @@ -8103,10 +8534,10 @@ index 000000000..b8d50fe64 +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 -index 000000000..4401d2767 +index 000000000..5c308f842 --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,2243 @@ +@@ -0,0 +1,2319 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H @@ -8804,7 +9235,7 @@ index 000000000..4401d2767 +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ + ((sizeof(struct bch_extent_crc128) + \ -+ sizeof(struct bch_extent_ptr)) / sizeof(u64)) ++ sizeof(struct bch_extent_ptr)) / sizeof(__u64)) + +/* Maximum possible size of an entire extent value: */ +#define BKEY_EXTENT_VAL_U64s_MAX \ @@ -8816,7 +9247,7 @@ index 000000000..4401d2767 +/* Btree pointers don't carry around checksums: */ +#define BKEY_BTREE_PTR_VAL_U64s_MAX \ + ((sizeof(struct bch_btree_ptr_v2) + \ -+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) ++ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) +#define BKEY_BTREE_PTR_U64s_MAX \ + (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) + @@ -8858,7 +9289,7 @@ index 000000000..4401d2767 +} __packed __aligned(8); + +#define INODEv3_FIELDS_START_INITIAL 6 -+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(u64)) ++#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) + +struct bch_inode_generation { + struct bch_val v; @@ -9025,7 +9456,7 @@ index 000000000..4401d2767 +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + -+#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(u64) - \ ++#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(__u64) - \ + sizeof(struct bkey) - \ + offsetof(struct bch_dirent, d_name))) + @@ -9118,7 +9549,7 @@ index 000000000..4401d2767 +} __packed __aligned(8); + +#define BCH_ALLOC_V4_U64s_V0 6 -+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(u64)) ++#define 
BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) + +BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) +BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) @@ -9257,6 +9688,8 @@ index 000000000..4401d2767 + __le32 children[2]; + __le32 subvol; + __le32 tree; ++ __le32 depth; ++ __le32 skip[3]; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) @@ -9396,10 +9829,10 @@ index 000000000..4401d2767 +}; + +#define BCH_KEY_MAGIC \ -+ (((u64) 'b' << 0)|((u64) 'c' << 8)| \ -+ ((u64) 'h' << 16)|((u64) '*' << 24)| \ -+ ((u64) '*' << 32)|((u64) 'k' << 40)| \ -+ ((u64) 'e' << 48)|((u64) 'y' << 56)) ++ (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \ ++ ((__u64) 'h' << 16)|((__u64) '*' << 24)| \ ++ ((__u64) '*' << 32)|((__u64) 'k' << 40)| \ ++ ((__u64) 'e' << 48)|((__u64) 'y' << 56)) + +struct bch_encrypted_key { + __le64 magic; @@ -9480,19 +9913,19 @@ index 000000000..4401d2767 +struct bch_replicas_entry_v0 { + __u8 data_type; + __u8 nr_devs; -+ __u8 devs[]; ++ __u8 devs[0]; +} __packed; + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; -+ struct bch_replicas_entry_v0 entries[]; ++ struct bch_replicas_entry_v0 entries[0]; +} __packed __aligned(8); + +struct bch_replicas_entry { + __u8 data_type; + __u8 nr_devs; + __u8 nr_required; -+ __u8 devs[]; ++ __u8 devs[0]; +} __packed; + +#define replicas_entry_bytes(_i) \ @@ -9500,7 +9933,7 @@ index 000000000..4401d2767 + +struct bch_sb_field_replicas { + struct bch_sb_field field; -+ struct bch_replicas_entry entries[]; ++ struct bch_replicas_entry entries[0]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_quota: */ @@ -9683,38 +10116,70 @@ index 000000000..4401d2767 + * One common version number for all on disk data structures - superblock, btree + * nodes, journal entries + */ ++#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10)) ++#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) ++#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0) + -+#define BCH_METADATA_VERSIONS() \ -+ x(bkey_renumber, 10) \ -+ x(inode_btree_change, 11) \ -+ x(snapshot, 12) \ -+ x(inode_backpointers, 13) \ -+ x(btree_ptr_sectors_written, 14) \ -+ x(snapshot_2, 15) \ -+ x(reflink_p_fix, 16) \ -+ x(subvol_dirent, 17) \ -+ x(inode_v2, 18) \ -+ x(freespace, 19) \ -+ x(alloc_v4, 20) \ -+ x(new_data_types, 21) \ -+ x(backpointers, 22) \ -+ x(inode_v3, 23) \ -+ x(unwritten_extents, 24) \ -+ x(bucket_gens, 25) \ -+ x(lru_v2, 26) \ -+ x(fragmentation_lru, 27) \ -+ x(no_bps_in_alloc_keys, 28) \ -+ x(snapshot_trees, 29) ++#define RECOVERY_PASS_ALL_FSCK (1ULL << 63) ++ ++#define BCH_METADATA_VERSIONS() \ ++ x(bkey_renumber, BCH_VERSION(0, 10), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(inode_btree_change, BCH_VERSION(0, 11), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(snapshot, BCH_VERSION(0, 12), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(inode_backpointers, BCH_VERSION(0, 13), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(snapshot_2, BCH_VERSION(0, 15), \ ++ BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \ ++ BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(reflink_p_fix, BCH_VERSION(0, 16), \ ++ BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \ ++ x(subvol_dirent, BCH_VERSION(0, 17), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(inode_v2, BCH_VERSION(0, 18), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(freespace, BCH_VERSION(0, 19), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(alloc_v4, BCH_VERSION(0, 20), \ ++ 
RECOVERY_PASS_ALL_FSCK) \ ++ x(new_data_types, BCH_VERSION(0, 21), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(backpointers, BCH_VERSION(0, 22), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(inode_v3, BCH_VERSION(0, 23), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(unwritten_extents, BCH_VERSION(0, 24), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(bucket_gens, BCH_VERSION(0, 25), \ ++ BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(lru_v2, BCH_VERSION(0, 26), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(fragmentation_lru, BCH_VERSION(0, 27), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(snapshot_trees, BCH_VERSION(0, 29), \ ++ RECOVERY_PASS_ALL_FSCK) \ ++ x(major_minor, BCH_VERSION(1, 0), \ ++ 0) \ ++ x(snapshot_skiplists, BCH_VERSION(1, 1), \ ++ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) + +enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, -+#define x(t, n) bcachefs_metadata_version_##t = n, ++#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n, + BCH_METADATA_VERSIONS() +#undef x + bcachefs_metadata_version_max +}; + -+static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_snapshot_trees; ++static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; + +#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) + @@ -9822,7 +10287,7 @@ index 000000000..4401d2767 +LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); + +LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); ++LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); +LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); + +LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); @@ -9842,7 +10307,7 @@ index 000000000..4401d2767 +LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); +LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); + -+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, ++LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO, + struct bch_sb, flags[2], 0, 4); +LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); + @@ -9856,6 +10321,37 @@ index 000000000..4401d2767 +LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); +LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); +LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); ++LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56); ++ ++LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60); ++LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, ++ struct bch_sb, flags[4], 60, 64); ++ ++LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, ++ struct bch_sb, flags[5], 0, 16); ++ ++static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) ++{ ++ return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4); ++} ++ ++static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) ++{ ++ SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v); ++ SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4); ++} ++ ++static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb) ++{ ++ return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) | ++ (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4); ++} ++ ++static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) ++{ ++ 
SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v); ++ SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4); ++} + +/* + * Features: @@ -9923,6 +10419,17 @@ index 000000000..4401d2767 + +/* options: */ + ++#define BCH_VERSION_UPGRADE_OPTS() \ ++ x(compatible, 0) \ ++ x(incompatible, 1) \ ++ x(none, 2) ++ ++enum bch_version_upgrade_opts { ++#define x(t, n) BCH_VERSION_UPGRADE_##t = n, ++ BCH_VERSION_UPGRADE_OPTS() ++#undef x ++}; ++ +#define BCH_REPLICAS_MAX 4U + +#define BCH_BKEY_PTRS_MAX 16U @@ -10330,7 +10837,7 @@ index 000000000..4401d2767 + return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4); +} + -+static inline void SET_BTREE_NODE_ID(struct btree_node *n, u64 v) ++static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v) +{ + SET_BTREE_NODE_ID_LO(n, v); + SET_BTREE_NODE_ID_HI(n, v >> 4); @@ -12777,10 +13284,10 @@ index 000000000..5f42a6e69 +#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 -index 000000000..79f3fbe92 +index 000000000..1381166bf --- /dev/null +++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,520 @@ +@@ -0,0 +1,519 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -12901,17 +13408,15 @@ index 000000000..79f3fbe92 +#undef x +}; + ++const struct bkey_ops bch2_bkey_null_ops = { ++ .min_val_size = U8_MAX, ++}; ++ +int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ -+ const struct bkey_ops *ops; -+ -+ if (k.k->type >= KEY_TYPE_MAX) { -+ prt_printf(err, "invalid type (%u >= %u)", k.k->type, KEY_TYPE_MAX); -+ return -BCH_ERR_invalid_bkey; -+ } -+ -+ ops = &bch2_bkey_ops[k.k->type]; ++ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + + if (bkey_val_bytes(k.k) < ops->min_val_size) { + prt_printf(err, "bad val size (%zu < %u)", @@ -12919,6 +13424,9 @@ index 000000000..79f3fbe92 + return -BCH_ERR_invalid_bkey; + } + ++ if (!ops->key_invalid) ++ return 0; ++ + return ops->key_invalid(c, k, flags, err); +} + @@ -12998,14 +13506,16 @@ index 000000000..79f3fbe92 + +int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + if (k.k->u64s < BKEY_U64s) { + prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); + return -BCH_ERR_invalid_bkey; + } + -+ if (!(bch2_key_types_allowed[type] & (1U << k.k->type))) { ++ if (flags & BKEY_INVALID_COMMIT && ++ !(bch2_key_types_allowed[type] & (1U << k.k->type))) { + prt_printf(err, "invalid key type for btree %s (%s)", + bch2_btree_ids[type], bch2_bkey_types[k.k->type]); + return -BCH_ERR_invalid_bkey; @@ -13029,24 +13539,23 @@ index 000000000..79f3fbe92 + } + } + -+ if (type != BKEY_TYPE_btree && -+ !btree_type_has_snapshots(type) && -+ k.k->p.snapshot) { -+ prt_printf(err, "nonzero snapshot"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ if (type != BKEY_TYPE_btree) { ++ if (!btree_type_has_snapshots((enum btree_id) type) && ++ k.k->p.snapshot) { ++ prt_printf(err, "nonzero snapshot"); ++ return -BCH_ERR_invalid_bkey; ++ } + -+ if (type != BKEY_TYPE_btree && -+ btree_type_has_snapshots(type) && -+ !k.k->p.snapshot) { -+ prt_printf(err, "snapshot == 0"); -+ return -BCH_ERR_invalid_bkey; -+ } ++ if (btree_type_has_snapshots((enum btree_id) type) && ++ !k.k->p.snapshot) { ++ prt_printf(err, "snapshot == 0"); ++ return -BCH_ERR_invalid_bkey; ++ } + -+ if (type != BKEY_TYPE_btree && -+ 
bkey_eq(k.k->p, POS_MAX)) { -+ prt_printf(err, "key at POS_MAX"); -+ return -BCH_ERR_invalid_bkey; ++ if (bkey_eq(k.k->p, POS_MAX)) { ++ prt_printf(err, "key at POS_MAX"); ++ return -BCH_ERR_invalid_bkey; ++ } + } + + return 0; @@ -13054,7 +13563,8 @@ index 000000000..79f3fbe92 + +int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + return __bch2_bkey_invalid(c, k, type, flags, err) ?: + bch2_bkey_val_invalid(c, k, flags, err); @@ -13123,14 +13633,10 @@ index 000000000..79f3fbe92 +void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ -+ if (k.k->type < KEY_TYPE_MAX) { -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + -+ if (likely(ops->val_to_text)) -+ ops->val_to_text(out, c, k); -+ } else { -+ prt_printf(out, "(invalid type %u)", k.k->type); -+ } ++ if (likely(ops->val_to_text)) ++ ops->val_to_text(out, c, k); +} + +void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, @@ -13146,7 +13652,7 @@ index 000000000..79f3fbe92 + +void bch2_bkey_swab_val(struct bkey_s k) +{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + + if (ops->swab) + ops->swab(k); @@ -13154,7 +13660,7 @@ index 000000000..79f3fbe92 + +bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) +{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + + return ops->key_normalize + ? ops->key_normalize(c, k) @@ -13163,11 +13669,11 @@ index 000000000..79f3fbe92 + +bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; ++ const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type); + -+ return bch2_bkey_maybe_mergable(l.k, r.k) && ++ return ops->key_merge && ++ bch2_bkey_maybe_mergable(l.k, r.k) && + (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && -+ bch2_bkey_ops[l.k->type].key_merge && + !bch2_key_merging_disabled && + ops->key_merge(c, l, r); +} @@ -13267,7 +13773,7 @@ index 000000000..79f3fbe92 + u->k.p.snapshot = write + ? 
0 : U32_MAX; + } else { -+ u64 min_packed = f->field_offset[BKEY_FIELD_SNAPSHOT]; ++ u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]); + u64 max_packed = min_packed + + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + @@ -13292,7 +13798,7 @@ index 000000000..79f3fbe92 + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_val(u); + -+ ops = &bch2_bkey_ops[k->type]; ++ ops = bch2_bkey_type_ops(k->type); + + if (ops->compat) + ops->compat(btree_id, version, big_endian, write, u); @@ -13303,10 +13809,10 @@ index 000000000..79f3fbe92 +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 -index 000000000..c2bd72f5d +index 000000000..0f3dc156a --- /dev/null +++ b/fs/bcachefs/bkey_methods.h -@@ -0,0 +1,169 @@ +@@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_METHODS_H +#define _BCACHEFS_BKEY_METHODS_H @@ -13320,6 +13826,13 @@ index 000000000..c2bd72f5d +enum btree_node_type; + +extern const char * const bch2_bkey_types[]; ++extern const struct bkey_ops bch2_bkey_null_ops; ++ ++enum bkey_invalid_flags { ++ BKEY_INVALID_WRITE = (1U << 0), ++ BKEY_INVALID_COMMIT = (1U << 1), ++ BKEY_INVALID_JOURNAL = (1U << 2), ++}; + +/* + * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If @@ -13330,7 +13843,7 @@ index 000000000..c2bd72f5d + */ +struct bkey_ops { + int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err); ++ enum bkey_invalid_flags flags, struct printbuf *err); + void (*val_to_text)(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + void (*swab)(struct bkey_s); @@ -13350,7 +13863,12 @@ index 000000000..c2bd72f5d + +extern const struct bkey_ops bch2_bkey_ops[]; + -+#define BKEY_INVALID_FROM_JOURNAL (1 << 1) ++static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) ++{ ++ return likely(type < KEY_TYPE_MAX) ++ ? &bch2_bkey_ops[type] ++ : &bch2_bkey_null_ops; ++} + +int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, @@ -13384,7 +13902,7 @@ index 000000000..c2bd72f5d + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new.k->type]; ++ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); + + return ops->atomic_trigger + ? ops->atomic_trigger(trans, btree, level, old, new, flags) @@ -13419,12 +13937,22 @@ index 000000000..c2bd72f5d +#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) +#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) + ++#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ ++ ((1U << KEY_TYPE_alloc)| \ ++ (1U << KEY_TYPE_alloc_v2)| \ ++ (1U << KEY_TYPE_alloc_v3)| \ ++ (1U << KEY_TYPE_alloc_v4)| \ ++ (1U << KEY_TYPE_stripe)| \ ++ (1U << KEY_TYPE_inode)| \ ++ (1U << KEY_TYPE_inode_v2)| \ ++ (1U << KEY_TYPE_snapshot)) ++ +static inline int bch2_trans_mark_key(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[old.k->type ?: new->k.type]; ++ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type); + + return ops->trans_trigger + ? 
ops->trans_trigger(trans, btree_id, level, old, new, flags) @@ -13735,10 +14263,10 @@ index 000000000..79cf11d1b +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c new file mode 100644 -index 000000000..4d5501155 +index 000000000..bcdf28f39 --- /dev/null +++ b/fs/bcachefs/bset.c -@@ -0,0 +1,1588 @@ +@@ -0,0 +1,1587 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for working with individual keys, and sorted sets of keys with in a @@ -14340,11 +14868,10 @@ index 000000000..4d5501155 + return (u16) v; +} + -+__always_inline -+static inline void make_bfloat(struct btree *b, struct bset_tree *t, -+ unsigned j, -+ struct bkey_packed *min_key, -+ struct bkey_packed *max_key) ++static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) +{ + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); @@ -15876,10 +16403,10 @@ index 000000000..632c2b8c5 +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 -index 000000000..5ffd8db7e +index 000000000..13c88d953 --- /dev/null +++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1273 @@ +@@ -0,0 +1,1277 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -15914,13 +16441,15 @@ index 000000000..5ffd8db7e +{ + unsigned i, reserve = 16; + -+ if (!c->btree_roots[0].b) ++ if (!c->btree_roots_known[0].b) + reserve += 8; + -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].b) -+ reserve += min_t(unsigned, 1, -+ c->btree_roots[i].b->c.level) * 8; ++ for (i = 0; i < btree_id_nr_alive(c); i++) { ++ struct btree_root *r = bch2_btree_id_root(c, i); ++ ++ if (r->b) ++ reserve += min_t(unsigned, 1, r->b->c.level) * 8; ++ } + + c->btree_cache.reserve = reserve; +} @@ -16339,9 +16868,12 @@ index 000000000..5ffd8db7e + + kvpfree(c->verify_ondisk, btree_bytes(c)); + -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].b) -+ list_add(&c->btree_roots[i].b->list, &bc->live); ++ for (i = 0; i < btree_id_nr_alive(c); i++) { ++ struct btree_root *r = bch2_btree_id_root(c, i); ++ ++ if (r->b) ++ list_add(&r->b->list, &bc->live); ++ } + + list_splice(&bc->freeable, &bc->live); + @@ -16382,21 +16914,17 @@ index 000000000..5ffd8db7e + unsigned i; + int ret = 0; + -+ pr_verbose_init(c->opts, ""); -+ + ret = rhashtable_init(&bc->table, &bch_btree_cache_params); + if (ret) -+ goto out; ++ goto err; + + bc->table_init_done = true; + + bch2_recalc_btree_reserve(c); + + for (i = 0; i < bc->reserve; i++) -+ if (!__bch2_btree_node_mem_alloc(c)) { -+ ret = -BCH_ERR_ENOMEM_fs_btree_cache_init; -+ goto out; -+ } ++ if (!__bch2_btree_node_mem_alloc(c)) ++ goto err; + + list_splice_init(&bc->live, &bc->freeable); + @@ -16407,9 +16935,12 @@ index 000000000..5ffd8db7e + bc->shrink.to_text = bch2_btree_cache_shrinker_to_text; + bc->shrink.seeks = 4; + ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name); -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; ++ if (ret) ++ goto err; ++ ++ return 0; ++err: ++ return -BCH_ERR_ENOMEM_fs_btree_cache_init; +} + +void bch2_fs_btree_cache_init_early(struct btree_cache *bc) @@ -16703,7 +17234,7 @@ index 000000000..5ffd8db7e +{ + struct printbuf buf = PRINTBUF; + -+ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + return; + + prt_printf(&buf, @@ -17155,10 +17686,10 @@ index 000000000..5ffd8db7e +} 
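/*
 * A minimal standalone sketch of the lookup pattern these hunks introduce:
 * direct indexing of a fixed c->btree_roots[] array is replaced by the
 * bch2_btree_id_root() accessor, which serves IDs below BTREE_ID_NR from a
 * static table and later, runtime-discovered IDs from a growable "extra"
 * table. The types below (struct root, struct roots) are simplified
 * stand-ins for illustration only; the real struct btree_root and the
 * darray it lives in carry more state.
 */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

#define BTREE_ID_NR 16		/* compiled-in btree IDs */

struct root { void *b; bool alive; };

struct roots {
	struct root known[BTREE_ID_NR];
	struct root *extra;	/* roots discovered at runtime */
	size_t extra_nr;
};

static inline size_t roots_nr_alive(const struct roots *r)
{
	return BTREE_ID_NR + r->extra_nr;
}

static inline struct root *id_root(struct roots *r, unsigned id)
{
	if (id < BTREE_ID_NR)
		return &r->known[id];
	assert(id - BTREE_ID_NR < r->extra_nr);
	return &r->extra[id - BTREE_ID_NR];
}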
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 -index 000000000..4900ed454 +index 000000000..00c9b9218 --- /dev/null +++ b/fs/bcachefs/btree_cache.h -@@ -0,0 +1,106 @@ +@@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_CACHE_H +#define _BCACHEFS_BTREE_CACHE_H @@ -17206,7 +17737,11 @@ index 000000000..4900ed454 + case KEY_TYPE_btree_ptr: + return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); + case KEY_TYPE_btree_ptr_v2: -+ return bkey_i_to_btree_ptr_v2_c(k)->v.seq; ++ /* ++ * The cast/deref is only necessary to avoid sparse endianness ++ * warnings: ++ */ ++ return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq); + default: + return 0; + } @@ -17258,7 +17793,27 @@ index 000000000..4900ed454 + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) + -+#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) ++static inline unsigned btree_id_nr_alive(struct bch_fs *c) ++{ ++ return BTREE_ID_NR + c->btree_roots_extra.nr; ++} ++ ++static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id) ++{ ++ if (likely(id < BTREE_ID_NR)) { ++ return &c->btree_roots_known[id]; ++ } else { ++ unsigned idx = id - BTREE_ID_NR; ++ ++ EBUG_ON(idx >= c->btree_roots_extra.nr); ++ return &c->btree_roots_extra.data[idx]; ++ } ++} ++ ++static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) ++{ ++ return bch2_btree_id_root(c, b->c.btree_id)->b; ++} + +void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, + const struct btree *); @@ -17267,10 +17822,10 @@ index 000000000..4900ed454 +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 -index 000000000..529613f1d +index 000000000..be537b237 --- /dev/null +++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,2130 @@ +@@ -0,0 +1,2144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -17802,8 +18357,13 @@ index 000000000..529613f1d + + bch2_trans_init(&trans, c, 0, 0); + -+ for (i = 0; i < BTREE_ID_NR && !ret; i++) { -+ b = c->btree_roots[i].b; ++ for (i = 0; i < btree_id_nr_alive(c)&& !ret; i++) { ++ struct btree_root *r = bch2_btree_id_root(c, i); ++ ++ if (!r->alive) ++ continue; ++ ++ b = r->b; + if (btree_node_fake(b)) + continue; + @@ -18156,7 +18716,7 @@ index 000000000..529613f1d + return ret; + + mutex_lock(&c->btree_root_lock); -+ b = c->btree_roots[btree_id].b; ++ b = bch2_btree_id_root(c, btree_id)->b; + if (!btree_node_fake(b)) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + @@ -18279,7 +18839,7 @@ index 000000000..529613f1d + struct printbuf buf = PRINTBUF; + int ret = 0; + -+ b = c->btree_roots[btree_id].b; ++ b = bch2_btree_id_root(c, btree_id)->b; + + if (btree_node_fake(b)) + return 0; @@ -18348,6 +18908,15 @@ index 000000000..529613f1d + ? bch2_gc_btree_init(&trans, ids[i], metadata_only) + : bch2_gc_btree(&trans, ids[i], initial, metadata_only); + ++ for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { ++ if (!bch2_btree_id_root(c, i)->alive) ++ continue; ++ ++ ret = initial ++ ? 
bch2_gc_btree_init(&trans, i, metadata_only) ++ : bch2_gc_btree(&trans, i, initial, metadata_only); ++ } ++ + if (ret < 0) + bch_err_fn(c, ret); + @@ -18491,7 +19060,7 @@ index 000000000..529613f1d + for_each_member_device(ca, c, dev) { + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) -+ bch2_acc_percpu_u64s((void *) ca->usage_gc, ++ bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, + dev_usage_u64s()); + + copy_dev_field(buckets_ec, "buckets_ec"); @@ -18507,7 +19076,7 @@ index 000000000..529613f1d + unsigned nr = fs_usage_u64s(c); + struct bch_fs_usage *dst = c->usage_base; + struct bch_fs_usage *src = (void *) -+ bch2_acc_percpu_u64s((void *) c->usage_gc, nr); ++ bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); + + copy_fs_field(hidden, "hidden"); + copy_fs_field(btree, "btree"); @@ -19069,8 +19638,8 @@ index 000000000..529613f1d + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || + (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && -+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags) && -+ c->opts.fix_errors != FSCK_OPT_NO)) { ++ c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations && ++ c->opts.fix_errors != FSCK_FIX_no)) { + bch_info(c, "Starting topology repair pass"); + ret = bch2_repair_topology(c); + if (ret) @@ -19084,7 +19653,7 @@ index 000000000..529613f1d + + if (ret == -BCH_ERR_need_topology_repair && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && -+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { ++ c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) { + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true); + ret = 0; @@ -19521,10 +20090,10 @@ index 000000000..95d803b57 +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 -index 000000000..9985ecd72 +index 000000000..fa1229eb1 --- /dev/null +++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,2261 @@ +@@ -0,0 +1,2266 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -19632,8 +20201,8 @@ index 000000000..9985ecd72 + vpfree(p, size); +} + -+static void *btree_bounce_alloc(struct bch_fs *c, size_t size, -+ bool *used_mempool) ++static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size, ++ bool *used_mempool) +{ + unsigned flags = memalloc_nofs_save(); + void *p; @@ -19641,7 +20210,7 @@ index 000000000..9985ecd72 + BUG_ON(size > btree_bytes(c)); + + *used_mempool = false; -+ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); ++ p = vpmalloc_noprof(size, __GFP_NOWARN|GFP_NOWAIT); + if (!p) { + *used_mempool = true; + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); @@ -19649,6 +20218,8 @@ index 000000000..9985ecd72 + memalloc_nofs_restore(flags); + return p; +} ++#define btree_bounce_alloc(_c, _size, _used_mempool) \ ++ alloc_hooks(btree_bounce_alloc_noprof(_c, _size, _used_mempool)) + +static void sort_bkey_ptrs(const struct btree *bt, + struct bkey_packed **ptrs, unsigned nr) @@ -20044,7 +20615,7 @@ index 000000000..9985ecd72 + prt_printf(out, "%s level %u/%u\n ", + bch2_btree_ids[b->c.btree_id], + b->c.level, -+ c->btree_roots[b->c.btree_id].level); ++ bch2_btree_id_root(c, b->c.btree_id)->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); +} + @@ -20228,7 +20799,9 @@ index 000000000..9985ecd72 + + btree_err_on(!bch2_version_compatible(version), + BTREE_ERR_INCOMPATIBLE, c, ca, b, i, -+ "unsupported bset version %u", version); ++ "unsupported bset version %u.%u", ++ BCH_VERSION_MAJOR(version), ++ BCH_VERSION_MINOR(version)); + + if 
(btree_err_on(version < c->sb.version_min, + BTREE_ERR_FIXABLE, c, NULL, b, i, @@ -20240,7 +20813,8 @@ index 000000000..9985ecd72 + mutex_unlock(&c->sb_lock); + } + -+ if (btree_err_on(version > c->sb.version, ++ if (btree_err_on(BCH_VERSION_MAJOR(version) > ++ BCH_VERSION_MAJOR(c->sb.version), + BTREE_ERR_FIXABLE, c, NULL, b, i, + "bset version %u newer than superblock version %u", + version, c->sb.version)) { @@ -20808,7 +21382,7 @@ index 000000000..9985ecd72 + unsigned nr; + void *buf[BCH_REPLICAS_MAX]; + struct bio *bio[BCH_REPLICAS_MAX]; -+ int err[BCH_REPLICAS_MAX]; ++ blk_status_t err[BCH_REPLICAS_MAX]; +}; + +static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) @@ -21757,7 +22331,7 @@ index 000000000..9985ecd72 + return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); +} + -+const char * const bch2_btree_write_types[] = { ++static const char * const bch2_btree_write_types[] = { +#define x(t, n) [n] = #t, + BCH_BTREE_WRITE_TYPES() + NULL @@ -21788,7 +22362,7 @@ index 000000000..9985ecd72 +} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 -index 000000000..c43fb60b8 +index 000000000..0cadf651e --- /dev/null +++ b/fs/bcachefs/btree_io.h @@ -0,0 +1,228 @@ @@ -21972,7 +22546,7 @@ index 000000000..c43fb60b8 + + f->field_offset[BKEY_FIELD_SNAPSHOT] = write + ? 0 -+ : U32_MAX - max_packed; ++ : cpu_to_le64(U32_MAX - max_packed); + } +} + @@ -21994,7 +22568,7 @@ index 000000000..c43fb60b8 + struct btree_node *bn) +{ + if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && ++ btree_id_is_extents(btree_id) && + !bpos_eq(bn->min_key, POS_MIN) && + write) + bn->min_key = bpos_nosnap_predecessor(bn->min_key); @@ -22011,7 +22585,7 @@ index 000000000..c43fb60b8 + bn->max_key.snapshot = U32_MAX; + + if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && ++ btree_id_is_extents(btree_id) && + !bpos_eq(bn->min_key, POS_MIN) && + !write) + bn->min_key = bpos_nosnap_successor(bn->min_key); @@ -22022,7 +22596,7 @@ index 000000000..c43fb60b8 +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 -index 000000000..3c6ea6a23 +index 000000000..e292c5a2a --- /dev/null +++ b/fs/bcachefs/btree_iter.c @@ -0,0 +1,3214 @@ @@ -22266,7 +22840,7 @@ index 000000000..3c6ea6a23 + for (i = 0; i < (!path->cached ? 
BTREE_MAX_DEPTH : 1); i++) { + if (!path->l[i].b) { + BUG_ON(!path->cached && -+ c->btree_roots[path->btree_id].b->c.level > i); ++ bch2_btree_id_root(c, path->btree_id)->b->c.level > i); + break; + } + @@ -22728,8 +23302,8 @@ index 000000000..3c6ea6a23 + + if (t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(trans, path, b->c.level); -+ six_lock_increment(&b->c.lock, t); -+ mark_btree_node_locked(trans, path, b->c.level, t); ++ six_lock_increment(&b->c.lock, (enum six_lock_type) t); ++ mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); + } + + bch2_btree_path_level_init(trans, path, b); @@ -22760,7 +23334,7 @@ index 000000000..3c6ea6a23 + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; -+ struct btree *b, **rootp = &c->btree_roots[path->btree_id].b; ++ struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; + enum six_lock_type lock_type; + unsigned i; + int ret; @@ -23466,7 +24040,7 @@ index 000000000..3c6ea6a23 + prt_newline(out); +} + -+noinline __cold ++static noinline __cold +void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, + bool nosort) +{ @@ -23486,7 +24060,7 @@ index 000000000..3c6ea6a23 + __bch2_trans_paths_to_text(out, trans, false); +} + -+noinline __cold ++static noinline __cold +void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) +{ + struct printbuf buf = PRINTBUF; @@ -23895,9 +24469,9 @@ index 000000000..3c6ea6a23 + : NULL; +} + -+struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos end_pos) ++static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos end_pos) +{ + struct bkey_i *k; + @@ -25242,10 +25816,10 @@ index 000000000..3c6ea6a23 +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 -index 000000000..13d4e9aac +index 000000000..c472aa8c5 --- /dev/null +++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,916 @@ +@@ -0,0 +1,924 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H @@ -25531,7 +26105,7 @@ index 000000000..13d4e9aac +} + +__always_inline -+static inline int btree_trans_restart_nounlock(struct btree_trans *trans, int err) ++static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) +{ + BUG_ON(err <= 0); + BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); @@ -25542,7 +26116,7 @@ index 000000000..13d4e9aac +} + +__always_inline -+static inline int btree_trans_restart(struct btree_trans *trans, int err) ++static int btree_trans_restart(struct btree_trans *trans, int err) +{ + btree_trans_restart_nounlock(trans, err); + return -err; @@ -26043,6 +26617,14 @@ index 000000000..13d4e9aac + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + ++#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \ ++ _start, _iter_flags, _k, \ ++ _disk_res, _journal_seq, _commit_flags,\ ++ _do) \ ++ for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ ++ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ ++ (_journal_seq), (_commit_flags))) ++ +#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ + _start, _end, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ @@ -26164,10 +26746,10 @@ index 000000000..13d4e9aac +#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 -index 
000000000..a71db7ab7 +index 000000000..f7c001d42 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,1087 @@ +@@ -0,0 +1,1088 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -26951,6 +27533,7 @@ index 000000000..a71db7ab7 + ck->valid = true; + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + set_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_inc(&c->btree_key_cache.nr_dirty); + @@ -27249,7 +27832,7 @@ index 000000000..a71db7ab7 + +int __init bch2_btree_key_cache_init(void) +{ -+ bch2_key_cache = KMEM_CACHE(bkey_cached, 0); ++ bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); + if (!bch2_key_cache) + return -ENOMEM; + @@ -27311,10 +27894,10 @@ index 000000000..be3acde2c +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c new file mode 100644 -index 000000000..70d25ce35 +index 000000000..d7fd87149 --- /dev/null +++ b/fs/bcachefs/btree_locking.c -@@ -0,0 +1,804 @@ +@@ -0,0 +1,797 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -27914,13 +28497,6 @@ index 000000000..70d25ce35 + return 0; +} + -+__flatten -+bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, -+ struct btree_path *path, unsigned long trace_ip) -+{ -+ return btree_path_get_locks(trans, path, true); -+} -+ +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) @@ -28121,7 +28697,7 @@ index 000000000..70d25ce35 +#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 -index 000000000..0ad8fd44a +index 000000000..f3e58aa27 --- /dev/null +++ b/fs/bcachefs/btree_locking.h @@ -0,0 +1,424 @@ @@ -28221,7 +28797,7 @@ index 000000000..0ad8fd44a + unsigned level, + enum six_lock_type type) +{ -+ mark_btree_node_locked_noreset(path, level, type); ++ mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + path->l[level].lock_taken_time = local_clock(); +#endif @@ -28374,7 +28950,7 @@ index 000000000..0ad8fd44a + trans_for_each_path(trans, path) + if (&path->l[level].b->c == b && + btree_node_locked_type(path, level) >= want) { -+ six_lock_increment(&b->lock, want); ++ six_lock_increment(&b->lock, (enum six_lock_type) want); + return true; + } + @@ -28394,7 +28970,7 @@ index 000000000..0ad8fd44a + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + + if (likely(six_trylock_type(&b->lock, type)) || -+ btree_node_lock_increment(trans, b, level, type) || ++ btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || + !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) { +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + path->l[b->level].lock_taken_time = local_clock(); @@ -28551,10 +29127,10 @@ index 000000000..0ad8fd44a +#endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 -index 000000000..d4ff72128 +index 000000000..4efc69492 --- /dev/null +++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,737 @@ +@@ -0,0 +1,742 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H @@ -29245,6 +29821,11 @@ index 000000000..d4ff72128 + return (1U << type) & BTREE_ID_IS_EXTENTS; +} + ++static inline bool btree_id_is_extents(enum btree_id btree) ++{ ++ return btree_node_type_is_extents((enum btree_node_type) btree); 
++} ++ +#define BTREE_ID_HAS_SNAPSHOTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_inodes)| \ @@ -29294,7 +29875,7 @@ index 000000000..d4ff72128 +#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 -index 000000000..93d2e54aa +index 000000000..f794c9d10 --- /dev/null +++ b/fs/bcachefs/btree_update.h @@ -0,0 +1,357 @@ @@ -29412,7 +29993,7 @@ index 000000000..93d2e54aa +int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); +int __must_check bch2_trans_update_buffered(struct btree_trans *, -+ enum btree_id, struct bkey_i *, bool); ++ enum btree_id, struct bkey_i *); + +void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); @@ -29657,10 +30238,10 @@ index 000000000..93d2e54aa +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 -index 000000000..4c8effa4b +index 000000000..3659b2c08 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2477 @@ +@@ -0,0 +1,2488 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -30821,6 +31402,17 @@ index 000000000..4c8effa4b + bch2_err_matches(ret, ENOMEM)) { + struct closure cl; + ++ /* ++ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK ++ * flag ++ */ ++ if (bch2_err_matches(ret, ENOSPC) && ++ (flags & BTREE_INSERT_JOURNAL_RECLAIM) && ++ watermark != BCH_WATERMARK_reclaim) { ++ ret = -BCH_ERR_journal_reclaim_would_deadlock; ++ goto err; ++ } ++ + closure_init_stack(&cl); + + do { @@ -30862,7 +31454,7 @@ index 000000000..4c8effa4b + (b->c.level < btree_node_root(c, b)->c.level || + !btree_node_dying(btree_node_root(c, b)))); + -+ btree_node_root(c, b) = b; ++ bch2_btree_id_root(c, b->c.btree_id)->b = b; + mutex_unlock(&c->btree_root_lock); + + bch2_recalc_btree_reserve(c); @@ -31699,7 +32291,7 @@ index 000000000..4c8effa4b + return ret; +} + -+void async_btree_node_rewrite_work(struct work_struct *work) ++static void async_btree_node_rewrite_work(struct work_struct *work) +{ + struct async_btree_rewrite *a = + container_of(work, struct async_btree_rewrite, work); @@ -32065,7 +32657,7 @@ index 000000000..4c8effa4b + +void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry) +{ -+ struct btree_root *r = &c->btree_roots[entry->btree_id]; ++ struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); + + mutex_lock(&c->btree_root_lock); + @@ -32091,15 +32683,15 @@ index 000000000..4c8effa4b + + mutex_lock(&c->btree_root_lock); + -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].alive && !test_bit(i, &have)) { -+ journal_entry_set(end, -+ BCH_JSET_ENTRY_btree_root, -+ i, c->btree_roots[i].level, -+ &c->btree_roots[i].key, -+ c->btree_roots[i].key.k.u64s); ++ for (i = 0; i < btree_id_nr_alive(c); i++) { ++ struct btree_root *r = bch2_btree_id_root(c, i); ++ ++ if (r->alive && !test_bit(i, &have)) { ++ journal_entry_set(end, BCH_JSET_ENTRY_btree_root, ++ i, r->level, &r->key, r->key.k.u64s); + end = vstruct_next(end); + } ++ } + + mutex_unlock(&c->btree_root_lock); + @@ -32474,10 +33066,10 @@ index 000000000..221b7ad5d +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c new file mode 100644 -index 000000000..b20b8c090 +index 000000000..3638cef21 --- /dev/null +++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,2050 @@ +@@ -0,0 +1,2065 @@ 
+// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -32509,7 +33101,7 @@ index 000000000..b20b8c090 + * bch2_btree_path_peek_slot() for a cached iterator might return a key in a + * different snapshot: + */ -+struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) ++static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) +{ + struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); + @@ -32752,8 +33344,10 @@ index 000000000..b20b8c090 + + bch2_btree_add_journal_pin(c, b, journal_seq); + -+ if (unlikely(!btree_node_dirty(b))) ++ if (unlikely(!btree_node_dirty(b))) { ++ EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + set_btree_node_dirty_acct(c, b); ++ } + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; @@ -32789,7 +33383,7 @@ index 000000000..b20b8c090 + !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && + test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + i->k->k.p.snapshot && -+ bch2_snapshot_internal_node(trans->c, i->k->k.p.snapshot)); ++ bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); +} + +static noinline int @@ -32887,6 +33481,8 @@ index 000000000..b20b8c090 +{ + struct bkey_s_c old = { &i->old_k, i->old_v }; + struct bkey_i *new = i->k; ++ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); ++ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + int ret; + + verify_update_old_key(trans, i); @@ -32897,8 +33493,8 @@ index 000000000..b20b8c090 + if (!btree_node_type_needs_gc(i->btree_id)) + return 0; + -+ if (bch2_bkey_ops[old.k->type].atomic_trigger == -+ bch2_bkey_ops[i->k->k.type].atomic_trigger) { ++ if (old_ops->atomic_trigger == new_ops->atomic_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_mark_key(trans, i->btree_id, i->level, + old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); @@ -32929,6 +33525,8 @@ index 000000000..b20b8c090 + */ + struct bkey old_k = i->old_k; + struct bkey_s_c old = { &old_k, i->old_v }; ++ const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); ++ const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + + verify_update_old_key(trans, i); + @@ -32938,8 +33536,8 @@ index 000000000..b20b8c090 + + if (!i->insert_trigger_run && + !i->overwrite_trigger_run && -+ bch2_bkey_ops[old.k->type].trans_trigger == -+ bch2_bkey_ops[i->k->k.type].trans_trigger) { ++ old_ops->trans_trigger == new_ops->trans_trigger && ++ ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + i->insert_trigger_run = true; + return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, @@ -33332,10 +33930,13 @@ index 000000000..b20b8c090 + struct printbuf buf = PRINTBUF; + + trans_for_each_update(trans, i) { -+ int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? 
READ : WRITE; ++ enum bkey_invalid_flags invalid_flags = 0; ++ ++ if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) ++ invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), -+ i->bkey_type, rw, &buf))) ++ i->bkey_type, invalid_flags, &buf))) + return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); + btree_insert_entry_checks(trans, i); + } @@ -33429,6 +34030,10 @@ index 000000000..b20b8c090 + bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); + break; + case -BCH_ERR_journal_res_get_blocked: ++ /* ++ * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK ++ * flag ++ */ + if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; @@ -33696,7 +34301,7 @@ index 000000000..b20b8c090 + struct bpos pos) +{ + if (!btree_type_has_snapshots(id) || -+ !snapshot_t(trans->c, pos.snapshot)->children[0]) ++ bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + return 0; + + return __check_pos_snapshot_overwritten(trans, id, pos); @@ -34195,14 +34800,21 @@ index 000000000..b20b8c090 + +int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, -+ struct bkey_i *k, -+ bool head) ++ struct bkey_i *k) +{ -+ int ret, pos; ++ struct btree_write_buffered_key *i; ++ int ret; + + EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); + EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + ++ trans_for_each_wb_update(trans, i) { ++ if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { ++ bkey_copy(&i->k, k); ++ return 0; ++ } ++ } ++ + if (!trans->wb_updates || + trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_write_buffered_key *u; @@ -34229,18 +34841,13 @@ index 000000000..b20b8c090 + trans->wb_updates = u; + } + -+ if (head) { -+ memmove(&trans->wb_updates[1], -+ &trans->wb_updates[0], -+ sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); -+ pos = 0; -+ } else { -+ pos = trans->nr_wb_updates; -+ } ++ trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { ++ .btree = btree, ++ }; + -+ trans->wb_updates[pos] = (struct btree_write_buffered_key) { .btree = btree, }; -+ bkey_copy(&trans->wb_updates[pos].k, k); ++ bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); + trans->nr_wb_updates++; ++ + return 0; +} + @@ -34361,7 +34968,7 @@ index 000000000..b20b8c090 + + bkey_init(&k->k); + k->k.p = pos; -+ return bch2_trans_update_buffered(trans, btree, k, false); ++ return bch2_trans_update_buffered(trans, btree, k); +} + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, @@ -34530,10 +35137,10 @@ index 000000000..b20b8c090 +} diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c new file mode 100644 -index 000000000..3a3e36c16 +index 000000000..6c30a72e6 --- /dev/null +++ b/fs/bcachefs/btree_write_buffer.c -@@ -0,0 +1,345 @@ +@@ -0,0 +1,346 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -34665,6 +35272,9 @@ index 000000000..3a3e36c16 + keys = wb->keys[s.idx]; + nr = s.nr; + ++ if (race_fault()) ++ goto slowpath; ++ + /* + * We first sort so that we can detect and skip redundant updates, and + * then we attempt to flush in sorted btree order, as this is most @@ -34817,7 +35427,6 @@ index 000000000..3a3e36c16 + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct btree_write_buffered_key *i; + union btree_write_buffer_state old, new; -+ unsigned offset = 0; + int ret = 0; + u64 v; + 
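/*
 * A minimal standalone sketch of the coalescing added to
 * bch2_trans_update_buffered() above: before appending a buffered update,
 * scan for an earlier update to the same (btree, pos) and overwrite it in
 * place, so a transaction carries at most one buffered key per position.
 * Types here are simplified assumptions (the key payload is reduced to a
 * single u64); the real code copies whole bkeys.
 */
#include <stdint.h>

struct wb_key { unsigned btree; uint64_t pos; uint64_t val; };
struct wb     { struct wb_key *keys; unsigned nr, size; };

/* Returns 0 on success, -1 when full (the caller would grow the buffer). */
static int wb_update(struct wb *b, unsigned btree, uint64_t pos, uint64_t val)
{
	for (unsigned i = 0; i < b->nr; i++)
		if (b->keys[i].btree == btree && b->keys[i].pos == pos) {
			b->keys[i].val = val;	/* overwrite, don't duplicate */
			return 0;
		}

	if (b->nr == b->size)
		return -1;
	b->keys[b->nr++] = (struct wb_key) { btree, pos, val };
	return 0;
}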
@@ -34825,8 +35434,7 @@ index 000000000..3a3e36c16 + EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + i->journal_seq = trans->journal_res.seq; -+ i->journal_offset = trans->journal_res.offset + offset; -+ offset++; ++ i->journal_offset = trans->journal_res.offset; + } + + preempt_disable(); @@ -34951,10 +35559,10 @@ index 000000000..99993ba77 +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 -index 000000000..ed86ad04d +index 000000000..797ef5ece --- /dev/null +++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2200 @@ +@@ -0,0 +1,2171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. @@ -35331,7 +35939,7 @@ index 000000000..ed86ad04d + struct bch_replicas_entry *r, s64 sectors, + unsigned journal_seq, bool gc) +{ -+ struct bch_fs_usage __percpu *fs_usage; ++ struct bch_fs_usage *fs_usage; + int idx, ret = 0; + struct printbuf buf = PRINTBUF; + @@ -35905,12 +36513,14 @@ index 000000000..ed86ad04d + return 0; +} + -+static int __mark_extent(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, unsigned flags) ++int bch2_mark_extent(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; @@ -35986,14 +36596,6 @@ index 000000000..ed86ad04d + return 0; +} + -+int bch2_mark_extent(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ return mem_trigger_run_insert_then_overwrite(__mark_extent, trans, btree_id, level, old, new, flags); -+} -+ +int bch2_mark_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, @@ -36106,7 +36708,7 @@ index 000000000..ed86ad04d + unsigned flags) +{ + struct bch_fs *c = trans->c; -+ struct bch_fs_usage __percpu *fs_usage; ++ struct bch_fs_usage *fs_usage; + u64 journal_seq = trans->journal_res.seq; + + if (flags & BTREE_TRIGGER_INSERT) { @@ -36132,12 +36734,14 @@ index 000000000..ed86ad04d + return 0; +} + -+static int __mark_reservation(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, unsigned flags) ++int bch2_mark_reservation(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ + struct bch_fs *c = trans->c; -+ struct bch_fs_usage __percpu *fs_usage; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old : new; ++ struct bch_fs_usage *fs_usage; + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + @@ -36163,14 +36767,6 @@ index 000000000..ed86ad04d + return 0; +} + -+int bch2_mark_reservation(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ return mem_trigger_run_insert_then_overwrite(__mark_reservation, trans, btree_id, level, old, new, flags); -+} -+ +static s64 __bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 start, u64 end, @@ -36225,11 +36821,13 @@ index 000000000..ed86ad04d + return ret; +} + -+static int __mark_reflink_p(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, unsigned flags) ++int bch2_mark_reflink_p(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned flags) +{ + struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; @@ -36263,14 +36861,6 @@ index 000000000..ed86ad04d + return ret; +} + -+int bch2_mark_reflink_p(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned flags) -+{ -+ return mem_trigger_run_insert_then_overwrite(__mark_reflink_p, trans, btree_id, level, old, new, flags); -+} -+ +void bch2_trans_fs_usage_revert(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ @@ -36472,11 +37062,15 @@ index 000000000..ed86ad04d + return ret; +} + -+static int __trans_mark_extent(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, unsigned flags) ++int bch2_trans_mark_extent(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, struct bkey_i *new, ++ unsigned flags) +{ + struct bch_fs *c = trans->c; ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; @@ -36533,14 +37127,6 @@ index 000000000..ed86ad04d + return ret; +} + -+int bch2_trans_mark_extent(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, struct bkey_i *new, -+ unsigned flags) -+{ -+ return trigger_run_insert_then_overwrite(__trans_mark_extent, trans, btree_id, level, old, new, flags); -+} -+ +static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) @@ -36715,10 +37301,15 @@ index 000000000..ed86ad04d + return 0; +} + -+static int __trans_mark_reservation(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, unsigned flags) ++int bch2_trans_mark_reservation(struct btree_trans *trans, ++ enum btree_id btree_id, unsigned level, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ unsigned flags) +{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? 
old ++ : bkey_i_to_s_c(new); + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + struct replicas_delta_list *d; @@ -36740,16 +37331,7 @@ index 000000000..ed86ad04d + return 0; +} + -+int bch2_trans_mark_reservation(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c old, -+ struct bkey_i *new, -+ unsigned flags) -+{ -+ return trigger_run_insert_then_overwrite(__trans_mark_reservation, trans, btree_id, level, old, new, flags); -+} -+ -+static int trans_mark_reflink_p_segment(struct btree_trans *trans, ++static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags) +{ @@ -36816,36 +37398,33 @@ index 000000000..ed86ad04d + return ret; +} + -+static int __trans_mark_reflink_p(struct btree_trans *trans, -+ enum btree_id btree_id, unsigned level, -+ struct bkey_s_c k, unsigned flags) -+{ -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ u64 idx, end_idx; -+ int ret = 0; -+ -+ idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); -+ end_idx = le64_to_cpu(p.v->idx) + p.k->size + -+ le32_to_cpu(p.v->back_pad); -+ -+ while (idx < end_idx && !ret) -+ ret = trans_mark_reflink_p_segment(trans, p, &idx, flags); -+ return ret; -+} -+ +int bch2_trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ++ ? old ++ : bkey_i_to_s_c(new); ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ u64 idx, end_idx; ++ int ret = 0; ++ + if (flags & BTREE_TRIGGER_INSERT) { -+ struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v; ++ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + + v->front_pad = v->back_pad = 0; + } + -+ return trigger_run_insert_then_overwrite(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags); ++ idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); ++ end_idx = le64_to_cpu(p.v->idx) + p.k->size + ++ le32_to_cpu(p.v->back_pad); ++ ++ while (idx < end_idx && !ret) ++ ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); ++ ++ return ret; +} + +static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, @@ -37157,10 +37736,10 @@ index 000000000..ed86ad04d +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 -index 000000000..400d105fd +index 000000000..f9d7dda07 --- /dev/null +++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,371 @@ +@@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. 
@@ -37437,20 +38016,6 @@ index 000000000..400d105fd +int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); + -+#define mem_trigger_run_insert_then_overwrite(_fn, _trans, _btree_id, _level, _old, _new, _flags)\ -+({ \ -+ int ret = 0; \ -+ \ -+ if (_new.k->type) \ -+ ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \ -+ if (_old.k->type && !ret) \ -+ ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \ -+ ret; \ -+}) -+ -+#define trigger_run_insert_then_overwrite(_fn, _trans, _btree_id, _level, _old, _new, _flags) \ -+ mem_trigger_run_insert_then_overwrite(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags) -+ +void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); +int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + @@ -38666,10 +39231,10 @@ index 000000000..3a4890d39 +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 -index 000000000..843e13886 +index 000000000..a08997a5b --- /dev/null +++ b/fs/bcachefs/checksum.c -@@ -0,0 +1,712 @@ +@@ -0,0 +1,709 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" @@ -39352,8 +39917,6 @@ index 000000000..843e13886 + struct bch_key key; + int ret = 0; + -+ pr_verbose_init(c->opts, ""); -+ + c->sha256 = crypto_alloc_shash("sha256", 0, 0); + ret = PTR_ERR_OR_ZERO(c->sha256); + if (ret) { @@ -39379,15 +39942,14 @@ index 000000000..843e13886 + goto out; +out: + memzero_explicit(&key, sizeof(key)); -+ pr_verbose_init(c->opts, "ret %i", ret); + return ret; +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 -index 000000000..409ad534d +index 000000000..1ad1d5f03 --- /dev/null +++ b/fs/bcachefs/checksum.h -@@ -0,0 +1,215 @@ +@@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CHECKSUM_H +#define _BCACHEFS_CHECKSUM_H @@ -39510,12 +40072,6 @@ index 000000000..409ad534d + return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); +} + -+static const unsigned bch2_compression_opt_to_type[] = { -+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+}; -+ +static inline bool bch2_checksum_type_valid(const struct bch_fs *c, + unsigned type) +{ @@ -39891,10 +40447,10 @@ index 000000000..5fae0012d +#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 -index 000000000..38a3475b1 +index 000000000..560214c15 --- /dev/null +++ b/fs/bcachefs/compress.c -@@ -0,0 +1,638 @@ +@@ -0,0 +1,712 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" @@ -40193,21 +40749,32 @@ index 000000000..38a3475b1 + void *workspace, + void *dst, size_t dst_len, + void *src, size_t src_len, -+ enum bch_compression_type compression_type) ++ struct bch_compression_opt compression) +{ ++ enum bch_compression_type compression_type = ++ __bch2_compression_opt_to_type[compression.type]; ++ + switch (compression_type) { -+ case BCH_COMPRESSION_TYPE_lz4: { -+ int len = src_len; -+ int ret = LZ4_compress_destSize( -+ src, dst, -+ &len, dst_len, -+ workspace); ++ case BCH_COMPRESSION_TYPE_lz4: ++ if (compression.level < LZ4HC_MIN_CLEVEL) { ++ int len = src_len; ++ int ret = 
LZ4_compress_destSize( ++ src, dst, ++ &len, dst_len, ++ workspace); ++ if (len < src_len) ++ return -len; + -+ if (len < src_len) -+ return -len; ++ return ret; ++ } else { ++ int ret = LZ4_compress_HC( ++ src, dst, ++ src_len, dst_len, ++ compression.level, ++ workspace); + -+ return ret; -+ } ++ return ret ?: -1; ++ } + case BCH_COMPRESSION_TYPE_gzip: { + z_stream strm = { + .next_in = src, @@ -40217,7 +40784,11 @@ index 000000000..38a3475b1 + }; + + zlib_set_workspace(&strm, workspace); -+ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, ++ zlib_deflateInit2(&strm, ++ compression.level ++ ? clamp_t(unsigned, compression.level, ++ Z_BEST_SPEED, Z_BEST_COMPRESSION) ++ : Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, + Z_DEFAULT_STRATEGY); + @@ -40230,8 +40801,14 @@ index 000000000..38a3475b1 + return strm.total_out; + } + case BCH_COMPRESSION_TYPE_zstd: { ++ /* ++ * rescale: ++ * zstd max compression level is 22, our max level is 15 ++ */ ++ unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); ++ ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); + ZSTD_CCtx *ctx = zstd_init_cctx(workspace, -+ zstd_cctx_workspace_bound(&c->zstd_params.cParams)); ++ zstd_cctx_workspace_bound(¶ms.cParams)); + + /* + * ZSTD requires that when we decompress we pass in the exact @@ -40262,10 +40839,12 @@ index 000000000..38a3475b1 +static unsigned __bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, -+ enum bch_compression_type compression_type) ++ struct bch_compression_opt compression) +{ + struct bbuf src_data = { NULL }, dst_data = { NULL }; + void *workspace; ++ enum bch_compression_type compression_type = ++ __bch2_compression_opt_to_type[compression.type]; + unsigned pad; + int ret = 0; + @@ -40297,7 +40876,7 @@ index 000000000..38a3475b1 + ret = attempt_compress(c, workspace, + dst_data.b, *dst_len, + src_data.b, *src_len, -+ compression_type); ++ compression); + if (ret > 0) { + *dst_len = ret; + ret = 0; @@ -40344,22 +40923,24 @@ index 000000000..38a3475b1 + BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); + BUG_ON(*dst_len & (block_bytes(c) - 1)); + BUG_ON(*src_len & (block_bytes(c) - 1)); ++ ret = compression_type; +out: + bio_unmap_or_unbounce(c, src_data); + bio_unmap_or_unbounce(c, dst_data); -+ return compression_type; ++ return ret; +err: -+ compression_type = BCH_COMPRESSION_TYPE_incompressible; ++ ret = BCH_COMPRESSION_TYPE_incompressible; + goto out; +} + +unsigned bch2_bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, -+ unsigned compression_type) ++ unsigned compression_opt) +{ + unsigned orig_dst = dst->bi_iter.bi_size; + unsigned orig_src = src->bi_iter.bi_size; ++ unsigned compression_type; + + /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ + src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, @@ -40367,11 +40948,9 @@ index 000000000..38a3475b1 + /* Don't generate a bigger output than input: */ + dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + -+ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) -+ compression_type = BCH_COMPRESSION_TYPE_lz4; -+ + compression_type = -+ __bio_compress(c, dst, dst_len, src, src_len, compression_type); ++ __bio_compress(c, dst, dst_len, src, src_len, ++ bch2_compression_decode(compression_opt)); + + dst->bi_iter.bi_size = orig_dst; + src->bi_iter.bi_size = orig_src; @@ -40418,8 +40997,10 @@ index 000000000..38a3475b1 +} + +int 
bch2_check_set_has_compressed_data(struct bch_fs *c, -+ unsigned compression_type) ++ unsigned compression_opt) +{ ++ unsigned compression_type = bch2_compression_decode(compression_opt).type; ++ + BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); + + return compression_type @@ -40439,18 +41020,20 @@ index 000000000..38a3475b1 + mempool_exit(&c->compression_bounce[READ]); +} + -+static int _bch2_fs_compress_init(struct bch_fs *c, u64 features) ++static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) +{ + size_t decompress_workspace_size = 0; + bool decompress_workspace_needed; -+ ZSTD_parameters params = zstd_get_params(0, c->opts.encoded_extent_max); ++ ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), ++ c->opts.encoded_extent_max); + struct { -+ unsigned feature; -+ unsigned type; -+ size_t compress_workspace; -+ size_t decompress_workspace; ++ unsigned feature; ++ enum bch_compression_type type; ++ size_t compress_workspace; ++ size_t decompress_workspace; + } compression_types[] = { -+ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, ++ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, ++ max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, + { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, + zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), + zlib_inflate_workspacesize(), }, @@ -40509,42 +41092,118 @@ index 000000000..38a3475b1 + return 0; +} + -+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) ++static u64 compression_opt_to_feature(unsigned v) +{ -+ int ret; -+ -+ pr_verbose_init(c->opts, ""); -+ ret = _bch2_fs_compress_init(c, features); -+ pr_verbose_init(c->opts, "ret %i", ret); -+ -+ return ret; ++ unsigned type = bch2_compression_decode(v).type; ++ return 1ULL << bch2_compression_opt_to_feature[type]; +} + +int bch2_fs_compress_init(struct bch_fs *c) +{ + u64 f = c->sb.features; + -+ if (c->opts.compression) -+ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; -+ -+ if (c->opts.background_compression) -+ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; ++ f |= compression_opt_to_feature(c->opts.compression); ++ f |= compression_opt_to_feature(c->opts.background_compression); + + return __bch2_fs_compress_init(c, f); ++} + ++int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, ++ struct printbuf *err) ++{ ++ char *val = kstrdup(_val, GFP_KERNEL); ++ char *p = val, *type_str, *level_str; ++ struct bch_compression_opt opt = { 0 }; ++ int ret; ++ ++ if (!val) ++ return -ENOMEM; ++ ++ type_str = strsep(&p, ":"); ++ level_str = p; ++ ++ ret = match_string(bch2_compression_opts, -1, type_str); ++ if (ret < 0 && err) ++ prt_str(err, "invalid compression type"); ++ if (ret < 0) ++ goto err; ++ ++ opt.type = ret; ++ ++ if (level_str) { ++ unsigned level; ++ ++ ret = kstrtouint(level_str, 10, &level); ++ if (!ret && !opt.type && level) ++ ret = -EINVAL; ++ if (!ret && level > 15) ++ ret = -EINVAL; ++ if (ret < 0 && err) ++ prt_str(err, "invalid compression level"); ++ if (ret < 0) ++ goto err; ++ ++ opt.level = level; ++ } ++ ++ *res = bch2_compression_encode(opt); ++err: ++ kfree(val); ++ return ret; ++} ++ ++void bch2_opt_compression_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_sb *sb, ++ u64 v) ++{ ++ struct bch_compression_opt opt = bch2_compression_decode(v); ++ ++ prt_str(out, bch2_compression_opts[opt.type]); ++ if (opt.level) ++ prt_printf(out, ":%u", opt.level); +} diff --git a/fs/bcachefs/compress.h 
b/fs/bcachefs/compress.h new file mode 100644 -index 000000000..4bab1f61b +index 000000000..052ea3032 --- /dev/null +++ b/fs/bcachefs/compress.h -@@ -0,0 +1,18 @@ +@@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_COMPRESS_H +#define _BCACHEFS_COMPRESS_H + +#include "extents_types.h" + ++struct bch_compression_opt { ++ u8 type:4, ++ level:4; ++}; ++ ++static inline struct bch_compression_opt bch2_compression_decode(unsigned v) ++{ ++ return (struct bch_compression_opt) { ++ .type = v & 15, ++ .level = v >> 4, ++ }; ++} ++ ++static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) ++{ ++ return opt.type|(opt.level << 4); ++} ++ ++static const unsigned __bch2_compression_opt_to_type[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ ++static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) ++{ ++ return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; ++} ++ +int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, + struct bch_extent_crc_unpacked *); +int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, @@ -40556,10 +41215,18 @@ index 000000000..4bab1f61b +void bch2_fs_compress_exit(struct bch_fs *); +int bch2_fs_compress_init(struct bch_fs *); + ++int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); ++void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); ++ ++#define bch2_opt_compression (struct bch_opt_fn) { \ ++ .parse = bch2_opt_compression_parse, \ ++ .to_text = bch2_opt_compression_to_text, \ ++} ++ +#endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c new file mode 100644 -index 000000000..e5587bc5a +index 000000000..442a9b806 --- /dev/null +++ b/fs/bcachefs/counters.c @@ -0,0 +1,107 @@ @@ -40570,7 +41237,7 @@ index 000000000..e5587bc5a + +/* BCH_SB_FIELD_counters */ + -+const char * const bch2_counter_names[] = { ++static const char * const bch2_counter_names[] = { +#define x(t, n, ...) 
(#t), + BCH_PERSISTENT_COUNTERS() +#undef x @@ -40592,7 +41259,7 @@ index 000000000..e5587bc5a + return 0; +}; + -+void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, ++static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_counters *ctrs = field_to_type(f, counters); @@ -40788,10 +41455,10 @@ index 000000000..d4485fa01 +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c new file mode 100644 -index 000000000..3c918368b +index 000000000..cfc624463 --- /dev/null +++ b/fs/bcachefs/data_update.c -@@ -0,0 +1,564 @@ +@@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -41249,9 +41916,7 @@ index 000000000..3c918368b + BCH_WRITE_DATA_ENCODED| + BCH_WRITE_MOVE| + m->data_opts.write_flags; -+ m->op.compression_type = -+ bch2_compression_opt_to_type[io_opts.background_compression ?: -+ io_opts.compression]; ++ m->op.compression_opt = io_opts.background_compression ?: io_opts.compression; + m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; + + bkey_for_each_ptr(ptrs, ptr) @@ -42408,10 +43073,10 @@ index 000000000..2c37143b5 +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 -index 000000000..610dd7425 +index 000000000..065ea59ee --- /dev/null +++ b/fs/bcachefs/dirent.c -@@ -0,0 +1,564 @@ +@@ -0,0 +1,565 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -42499,7 +43164,8 @@ index 000000000..610dd7425 +}; + +int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + unsigned len; @@ -42633,7 +43299,7 @@ index 000000000..610dd7425 + int ret = 0; + + if (d.v->d_type == DT_SUBVOL && -+ d.v->d_parent_subvol != dir.subvol) ++ le32_to_cpu(d.v->d_parent_subvol) != dir.subvol) + return 1; + + if (likely(d.v->d_type != DT_SUBVOL)) { @@ -42978,19 +43644,21 @@ index 000000000..610dd7425 +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h new file mode 100644 -index 000000000..bf9ea2e35 +index 000000000..b42f4a13b --- /dev/null +++ b/fs/bcachefs/dirent.h -@@ -0,0 +1,68 @@ +@@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DIRENT_H +#define _BCACHEFS_DIRENT_H + +#include "str_hash.h" + ++enum bkey_invalid_flags; +extern const struct bch_hash_desc bch2_dirent_hash_desc; + -+int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); ++int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_dirent ((struct bkey_ops) { \ @@ -43052,10 +43720,10 @@ index 000000000..bf9ea2e35 +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c new file mode 100644 -index 000000000..52b640077 +index 000000000..de14ca3a9 --- /dev/null +++ b/fs/bcachefs/disk_groups.c -@@ -0,0 +1,548 @@ +@@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "disk_groups.h" @@ -43518,30 +44186,37 @@ index 000000000..52b640077 + return ret; +} + -+int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) ++int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, ++ struct printbuf *err) +{ + struct 
bch_dev *ca; + int g; + -+ if (!strlen(buf) || !strcmp(buf, "none")) { -+ *v = 0; ++ if (!val) ++ return -EINVAL; ++ ++ if (!c) ++ return 0; ++ ++ if (!strlen(val) || !strcmp(val, "none")) { ++ *res = 0; + return 0; + } + + /* Is it a device? */ -+ ca = bch2_dev_lookup(c, buf); ++ ca = bch2_dev_lookup(c, val); + if (!IS_ERR(ca)) { -+ *v = dev_to_target(ca->dev_idx); ++ *res = dev_to_target(ca->dev_idx); + percpu_ref_put(&ca->ref); + return 0; + } + + mutex_lock(&c->sb_lock); -+ g = bch2_disk_path_find(&c->disk_sb, buf); ++ g = bch2_disk_path_find(&c->disk_sb, val); + mutex_unlock(&c->sb_lock); + + if (g >= 0) { -+ *v = group_to_target(g); ++ *res = group_to_target(g); + return 0; + } + @@ -43606,10 +44281,10 @@ index 000000000..52b640077 +} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h new file mode 100644 -index 000000000..ec12584ce +index 000000000..bd7711767 --- /dev/null +++ b/fs/bcachefs/disk_groups.h -@@ -0,0 +1,101 @@ +@@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_H +#define _BCACHEFS_DISK_GROUPS_H @@ -43697,9 +44372,14 @@ index 000000000..ec12584ce + +void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); + -+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); ++int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); +void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + ++#define bch2_opt_target (struct bch_opt_fn) { \ ++ .parse = bch2_opt_target_parse, \ ++ .to_text = bch2_opt_target_to_text, \ ++} ++ +int bch2_sb_disk_groups_to_cpu(struct bch_fs *); + +int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); @@ -43713,10 +44393,10 @@ index 000000000..ec12584ce +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 -index 000000000..8d091c4a0 +index 000000000..efbb7cf7a --- /dev/null +++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1957 @@ +@@ -0,0 +1,1960 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ @@ -43824,7 +44504,8 @@ index 000000000..8d091c4a0 +/* Stripes btree keys: */ + +int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + @@ -44104,7 +44785,7 @@ index 000000000..8d091c4a0 +} + +static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, -+ unsigned rw, unsigned idx, struct closure *cl) ++ blk_opf_t opf, unsigned idx, struct closure *cl) +{ + struct bch_stripe *v = &buf->key.v; + unsigned offset = 0, bytes = buf->size << 9; @@ -44113,6 +44794,7 @@ index 000000000..8d091c4a0 + enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant + ? 
BCH_DATA_user + : BCH_DATA_parity; ++ int rw = op_is_write(opf); + + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(c, @@ -44138,7 +44820,7 @@ index 000000000..8d091c4a0 + + ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, + nr_iovecs, -+ rw, ++ opf, + GFP_KERNEL, + &c->ec_bioset), + struct ec_bio, bio); @@ -44741,7 +45423,7 @@ index 000000000..8d091c4a0 + int ret; + + if (!bch2_dev_get_ioref(ca, WRITE)) { -+ s->err = -EROFS; ++ s->err = -BCH_ERR_erofs_no_writes; + return; + } + @@ -45099,11 +45781,12 @@ index 000000000..8d091c4a0 + mutex_unlock(&h->lock); +} + -+struct ec_stripe_head *__bch2_ec_stripe_head_get(struct btree_trans *trans, -+ unsigned target, -+ unsigned algo, -+ unsigned redundancy, -+ enum bch_watermark watermark) ++static struct ec_stripe_head * ++__bch2_ec_stripe_head_get(struct btree_trans *trans, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy, ++ enum bch_watermark watermark) +{ + struct bch_fs *c = trans->c; + struct ec_stripe_head *h; @@ -45117,7 +45800,7 @@ index 000000000..8d091c4a0 + return ERR_PTR(ret); + + if (test_bit(BCH_FS_GOING_RO, &c->flags)) { -+ h = ERR_PTR(-EROFS); ++ h = ERR_PTR(-BCH_ERR_erofs_no_writes); + goto found; + } + @@ -45289,7 +45972,7 @@ index 000000000..8d091c4a0 + } + + BUG_ON(h->s->existing_stripe.size != h->blocksize); -+ BUG_ON(h->s->existing_stripe.size != h->s->existing_stripe.key.v.sectors); ++ BUG_ON(h->s->existing_stripe.size != le16_to_cpu(h->s->existing_stripe.key.v.sectors)); + + /* + * Free buckets we initially allocated - they might conflict with @@ -45490,7 +46173,7 @@ index 000000000..8d091c4a0 + } + goto unlock; +found: -+ h->s->err = -EROFS; ++ h->s->err = -BCH_ERR_erofs_no_writes; + ec_stripe_set_pending(c, h); +unlock: + mutex_unlock(&h->lock); @@ -45676,10 +46359,10 @@ index 000000000..8d091c4a0 +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 -index 000000000..64ca277ca +index 000000000..1b1848e5f --- /dev/null +++ b/fs/bcachefs/ec.h -@@ -0,0 +1,261 @@ +@@ -0,0 +1,263 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H @@ -45688,8 +46371,10 @@ index 000000000..64ca277ca +#include "buckets_types.h" +#include "extents_types.h" + ++enum bkey_invalid_flags; ++ +int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, -+ unsigned, struct printbuf *); ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + @@ -46059,10 +46744,10 @@ index 000000000..dc906fc91 +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h new file mode 100644 -index 000000000..1e06d95f3 +index 000000000..d5277ec73 --- /dev/null +++ b/fs/bcachefs/errcode.h -@@ -0,0 +1,243 @@ +@@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H @@ -46240,6 +46925,9 @@ index 000000000..1e06d95f3 + x(EROFS, erofs_no_writes) \ + x(EROFS, erofs_journal_err) \ + x(EROFS, erofs_sb_err) \ ++ x(EROFS, erofs_unfixed_errors) \ ++ x(EROFS, erofs_norecovery) \ ++ x(EROFS, erofs_nochanges) \ + x(EROFS, insufficient_devices) \ + x(0, operation_blocked) \ + x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ @@ -46308,7 +46996,7 @@ index 000000000..1e06d95f3 +#endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c new file mode 100644 -index 000000000..b08cd23de +index 000000000..685464b8c --- /dev/null +++ b/fs/bcachefs/error.c @@ -0,0 +1,297 @@ @@ -46518,7 +47206,7 @@ index 000000000..b08cd23de + 
prt_str(out, ", continuing"); + ret = -BCH_ERR_fsck_ignore; + } -+ } else if (c->opts.fix_errors == FSCK_OPT_EXIT) { ++ } else if (c->opts.fix_errors == FSCK_FIX_exit) { + prt_str(out, ", exiting"); + ret = -BCH_ERR_fsck_errors_not_fixed; + } else if (flags & FSCK_CAN_FIX) { @@ -46526,7 +47214,7 @@ index 000000000..b08cd23de + ? s->fix + : c->opts.fix_errors; + -+ if (fix == FSCK_OPT_ASK) { ++ if (fix == FSCK_FIX_ask) { + int ask; + + prt_str(out, ": fix?"); @@ -46537,13 +47225,13 @@ index 000000000..b08cd23de + + if (ask >= YN_ALLNO && s) + s->fix = ask == YN_ALLNO -+ ? FSCK_OPT_NO -+ : FSCK_OPT_YES; ++ ? FSCK_FIX_no ++ : FSCK_FIX_yes; + + ret = ask & 1 + ? -BCH_ERR_fsck_fix + : -BCH_ERR_fsck_ignore; -+ } else if (fix == FSCK_OPT_YES || ++ } else if (fix == FSCK_FIX_yes || + (c->opts.nochanges && + !(flags & FSCK_CAN_IGNORE))) { + prt_str(out, ", fixing"); @@ -46558,7 +47246,7 @@ index 000000000..b08cd23de + } + + if (ret == -BCH_ERR_fsck_ignore && -+ (c->opts.fix_errors == FSCK_OPT_EXIT || ++ (c->opts.fix_errors == FSCK_FIX_exit || + !(flags & FSCK_CAN_IGNORE))) + ret = -BCH_ERR_fsck_errors_not_fixed; + @@ -46611,10 +47299,10 @@ index 000000000..b08cd23de +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h new file mode 100644 -index 000000000..edf124438 +index 000000000..7ce954005 --- /dev/null +++ b/fs/bcachefs/error.h -@@ -0,0 +1,213 @@ +@@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERROR_H +#define _BCACHEFS_ERROR_H @@ -46708,13 +47396,6 @@ index 000000000..edf124438 + * be able to repair: + */ + -+enum fsck_err_opts { -+ FSCK_OPT_EXIT, -+ FSCK_OPT_YES, -+ FSCK_OPT_NO, -+ FSCK_OPT_ASK, -+}; -+ +struct fsck_err_state { + struct list_head list; + const char *fmt; @@ -47027,10 +47708,10 @@ index 000000000..6f5cf4493 +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 -index 000000000..753a846ea +index 000000000..c13e0afc6 --- /dev/null +++ b/fs/bcachefs/extents.c -@@ -0,0 +1,1390 @@ +@@ -0,0 +1,1394 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet @@ -47196,7 +47877,8 @@ index 000000000..753a846ea +/* KEY_TYPE_btree_ptr: */ + +int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { + prt_printf(err, "value too big (%zu > %u)", @@ -47214,7 +47896,8 @@ index 000000000..753a846ea +} + +int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { + prt_printf(err, "value too big (%zu > %zu)", @@ -47249,7 +47932,7 @@ index 000000000..753a846ea + compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); + + if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && ++ btree_id_is_extents(btree_id) && + !bkey_eq(bp.v->min_key, POS_MIN)) + bp.v->min_key = write + ? 
bpos_nosnap_predecessor(bp.v->min_key) @@ -47404,7 +48087,8 @@ index 000000000..753a846ea +/* KEY_TYPE_reservation: */ + +int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + @@ -47547,7 +48231,7 @@ index 000000000..753a846ea + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); -+ dst->crc32.csum = *((__le32 *) &src.csum.lo); ++ memcpy(&dst->crc32.csum, &src.csum.lo, sizeof(dst->crc32.csum)); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); @@ -48136,7 +48820,8 @@ index 000000000..753a846ea +} + +int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; @@ -48423,10 +49108,10 @@ index 000000000..753a846ea +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 -index 000000000..c573a40d3 +index 000000000..d359b3fda --- /dev/null +++ b/fs/bcachefs/extents.h -@@ -0,0 +1,758 @@ +@@ -0,0 +1,757 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_H +#define _BCACHEFS_EXTENTS_H @@ -48437,6 +49122,7 @@ index 000000000..c573a40d3 + +struct bch_fs; +struct btree_trans; ++enum bkey_invalid_flags; + +/* extent entries: */ + @@ -48583,11 +49269,7 @@ index 000000000..c573a40d3 + common_fields(crc->crc32), + }; + -+ *((__le32 *) &ret.csum.lo) = crc->crc32.csum; -+ -+ memcpy(&ret.csum.lo, &crc->crc32.csum, -+ sizeof(crc->crc32.csum)); -+ ++ memcpy(&ret.csum.lo, &crc->crc32.csum, sizeof(crc->crc32.csum)); + return ret; + } + case BCH_EXTENT_ENTRY_crc64: { @@ -48597,8 +49279,8 @@ index 000000000..c573a40d3 + .csum.lo = (__force __le64) crc->crc64.csum_lo, + }; + -+ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; -+ ++ u16 hi = crc->crc64.csum_hi; ++ memcpy(&ret.csum.hi, &hi, sizeof(hi)); + return ret; + } + case BCH_EXTENT_ENTRY_crc128: { @@ -48815,11 +49497,13 @@ index 000000000..c573a40d3 + +/* KEY_TYPE_btree_ptr: */ + -+int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); ++int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + -+int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); ++int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + int, struct bkey_s); @@ -48859,7 +49543,7 @@ index 000000000..c573a40d3 +/* KEY_TYPE_reservation: */ + +int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, -+ unsigned, struct printbuf *); ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + @@ -50209,10 +50893,10 @@ index 000000000..dde237859 +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c new file mode 100644 -index 000000000..45858174c +index 000000000..6b691b2b5 
--- /dev/null +++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,3948 @@ +@@ -0,0 +1,3982 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + @@ -50250,6 +50934,8 @@ index 000000000..45858174c + +#include + ++static void bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned); ++ +struct folio_vec { + struct folio *fv_folio; + size_t fv_offset; @@ -50673,7 +51359,7 @@ index 000000000..45858174c +#undef x +}; + -+const char * const bch2_folio_sector_states[] = { ++static const char * const bch2_folio_sector_states[] = { +#define x(n) #n, + BCH_FOLIO_SECTOR_STATE() +#undef x @@ -51212,7 +51898,7 @@ index 000000000..45858174c + struct address_space *mapping = file->f_mapping; + struct address_space *fdm = faults_disabled_mapping(); + struct bch_inode_info *inode = file_bch_inode(file); -+ int ret; ++ vm_fault_t ret; + + if (fdm == mapping) + return VM_FAULT_SIGBUS; @@ -51254,7 +51940,7 @@ index 000000000..45858174c + struct bch2_folio_reservation res; + unsigned len; + loff_t isize; -+ int ret; ++ vm_fault_t ret; + + bch2_folio_reservation_init(c, inode, &res); + @@ -52187,7 +52873,7 @@ index 000000000..45858174c + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; -+ unsigned f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter); ++ unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + + if (!f_copied) { + folios_trunc(&folios, fi); @@ -53588,6 +54274,8 @@ index 000000000..45858174c + struct quota_res quota_res = { 0 }; + struct bkey_s_c k; + unsigned sectors; ++ bool is_allocation; ++ u64 hole_start, hole_end; + u32 snapshot; + + bch2_trans_begin(&trans); @@ -53603,6 +54291,10 @@ index 000000000..45858174c + if ((ret = bkey_err(k))) + goto bkey_err; + ++ hole_start = iter.pos.offset; ++ hole_end = bpos_min(k.k->p, end_pos).offset; ++ is_allocation = bkey_extent_is_allocation(k.k); ++ + /* already reserved */ + if (bkey_extent_is_reservation(k) && + bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { @@ -53616,17 +54308,26 @@ index 000000000..45858174c + continue; + } + -+ /* -+ * XXX: for nocow mode, we should promote shared extents to -+ * unshared here -+ */ ++ if (!(mode & FALLOC_FL_ZERO_RANGE)) { ++ ret = drop_locks_do(&trans, ++ (bch2_clamp_data_hole(&inode->v, ++ &hole_start, ++ &hole_end, ++ opts.data_replicas), 0)); ++ bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); + -+ sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset; ++ if (ret) ++ goto bkey_err; + -+ if (!bkey_extent_is_allocation(k.k)) { ++ if (hole_start == hole_end) ++ continue; ++ } ++ ++ sectors = hole_end - hole_start; ++ ++ if (!is_allocation) { + ret = bch2_quota_reservation_add(c, inode, -+ "a_res, -+ sectors, true); ++ "a_res, sectors, true); + if (unlikely(ret)) + goto bkey_err; + } @@ -53638,15 +54339,15 @@ index 000000000..45858174c + goto bkey_err; + + i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++ ++ drop_locks_do(&trans, ++ (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0)); +bkey_err: + bch2_quota_reservation_put(c, inode, "a_res); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + } + -+ bch2_trans_unlock(&trans); /* lock ordering, before taking pagecache locks: */ -+ mark_pagecache_reserved(inode, start_sector, iter.pos.offset); -+ + if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; @@ -53894,14 +54595,16 @@ index 000000000..45858174c + +/* fseek: 
*/ + -+static int folio_data_offset(struct folio *folio, loff_t pos) ++static int folio_data_offset(struct folio *folio, loff_t pos, ++ unsigned min_replicas) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); + + if (s) + for (i = folio_pos_to_s(folio, pos); i < sectors; i++) -+ if (s->s[i].state >= SECTOR_dirty) ++ if (s->s[i].state >= SECTOR_dirty && ++ s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) + return i << SECTOR_SHIFT; + + return -1; @@ -53909,7 +54612,8 @@ index 000000000..45858174c + +static loff_t bch2_seek_pagecache_data(struct inode *vinode, + loff_t start_offset, -+ loff_t end_offset) ++ loff_t end_offset, ++ unsigned min_replicas) +{ + struct folio_batch fbatch; + pgoff_t start_index = start_offset >> PAGE_SHIFT; @@ -53928,7 +54632,8 @@ index 000000000..45858174c + + folio_lock(folio); + offset = folio_data_offset(folio, -+ max(folio_pos(folio), start_offset)); ++ max(folio_pos(folio), start_offset), ++ min_replicas); + if (offset >= 0) { + ret = clamp(folio_pos(folio) + offset, + start_offset, end_offset); @@ -53990,7 +54695,7 @@ index 000000000..45858174c + + if (next_data > offset) + next_data = bch2_seek_pagecache_data(&inode->v, -+ offset, next_data); ++ offset, next_data, 0); + + if (next_data >= isize) + return -ENXIO; @@ -53998,7 +54703,8 @@ index 000000000..45858174c + return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); +} + -+static bool folio_hole_offset(struct address_space *mapping, loff_t *offset) ++static bool folio_hole_offset(struct address_space *mapping, loff_t *offset, ++ unsigned min_replicas) +{ + struct folio *folio; + struct bch_folio *s; @@ -54015,7 +54721,8 @@ index 000000000..45858174c + + sectors = folio_sectors(folio); + for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) -+ if (s->s[i].state < SECTOR_dirty) { ++ if (s->s[i].state < SECTOR_dirty || ++ s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { + *offset = max(*offset, + folio_pos(folio) + (i << SECTOR_SHIFT)); + goto unlock; @@ -54030,18 +54737,34 @@ index 000000000..45858174c + +static loff_t bch2_seek_pagecache_hole(struct inode *vinode, + loff_t start_offset, -+ loff_t end_offset) ++ loff_t end_offset, ++ unsigned min_replicas) +{ + struct address_space *mapping = vinode->i_mapping; + loff_t offset = start_offset; + + while (offset < end_offset && -+ !folio_hole_offset(mapping, &offset)) ++ !folio_hole_offset(mapping, &offset, min_replicas)) + ; + + return min(offset, end_offset); +} + ++static void bch2_clamp_data_hole(struct inode *inode, ++ u64 *hole_start, ++ u64 *hole_end, ++ unsigned min_replicas) ++{ ++ *hole_start = bch2_seek_pagecache_hole(inode, ++ *hole_start << 9, *hole_end << 9, min_replicas) >> 9; ++ ++ if (*hole_start == *hole_end) ++ return; ++ ++ *hole_end = bch2_seek_pagecache_data(inode, ++ *hole_start << 9, *hole_end << 9, min_replicas) >> 9; ++} ++ +static loff_t bch2_seek_hole(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); @@ -54071,12 +54794,12 @@ index 000000000..45858174c + BTREE_ITER_SLOTS, k, ret) { + if (k.k->p.inode != inode->v.i_ino) { + next_hole = bch2_seek_pagecache_hole(&inode->v, -+ offset, MAX_LFS_FILESIZE); ++ offset, MAX_LFS_FILESIZE, 0); + break; + } else if (!bkey_extent_is_data(k.k)) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + max(offset, bkey_start_offset(k.k) << 9), -+ k.k->p.offset << 9); ++ k.k->p.offset << 9, 0); + + if (next_hole < k.k->p.offset << 9) + break; @@ -54133,10 +54856,6 @@ index 
000000000..45858174c + +int bch2_fs_fsio_init(struct bch_fs *c) +{ -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ + if (bioset_init(&c->writepage_bioset, + 4, offsetof(struct bch_writepage_io, op.wbio.bio), + BIOSET_NEED_BVECS)) @@ -54156,8 +54875,7 @@ index 000000000..45858174c + 1, offsetof(struct nocow_flush, bio), 0)) + return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; + -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; ++ return 0; +} + +#endif /* NO_BCACHEFS_FS */ @@ -54872,7 +55590,7 @@ index 000000000..f201980ef +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 -index 000000000..b1bafbeae +index 000000000..8d2f388b4 --- /dev/null +++ b/fs/bcachefs/fs.c @@ -0,0 +1,1943 @@ @@ -56804,7 +57522,7 @@ index 000000000..b1bafbeae +{ + int ret = -ENOMEM; + -+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); ++ bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); + if (!bch2_inode_cache) + goto err; + @@ -56821,10 +57539,10 @@ index 000000000..b1bafbeae +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 -index 000000000..2e63cb660 +index 000000000..6170d214d --- /dev/null +++ b/fs/bcachefs/fs.h -@@ -0,0 +1,206 @@ +@@ -0,0 +1,208 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_H +#define _BCACHEFS_FS_H @@ -57023,6 +57741,8 @@ index 000000000..2e63cb660 + +#else + ++#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) do {} while (0) ++ +static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, + snapshot_id_list *s) {} +static inline void bch2_vfs_exit(void) {} @@ -57033,10 +57753,10 @@ index 000000000..2e63cb660 +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 -index 000000000..674018a58 +index 000000000..7edd4632d --- /dev/null +++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,2505 @@ +@@ -0,0 +1,2452 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -57389,7 +58109,7 @@ index 000000000..674018a58 + } + + /* -+ * The check_dirents pass has already run, dangling dirents ++ * The bch2_check_dirents pass has already run, dangling dirents + * shouldn't exist here: + */ + return __lookup_inode(trans, inum, lostfound, &snapshot); @@ -57933,7 +58653,7 @@ index 000000000..674018a58 + * particular is not atomic, so on the internal snapshot nodes + * we can see inodes marked for deletion after a clean shutdown + */ -+ if (bch2_snapshot_internal_node(c, k.k->p.snapshot)) ++ if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot)) + return 0; + + if (!bkey_is_inode(k.k)) @@ -58047,8 +58767,9 @@ index 000000000..674018a58 +} + +noinline_for_stack -+static int check_inodes(struct bch_fs *c, bool full) ++int bch2_check_inodes(struct bch_fs *c) +{ ++ bool full = c->opts.fsck; + struct btree_trans trans; + struct btree_iter iter; + struct bch_inode_unpacked prev = { 0 }; @@ -58443,8 +59164,7 @@ index 000000000..674018a58 + * Walk extents: verify that extents have a corresponding S_ISREG inode, and + * that i_size an i_sectors are consistent + */ -+noinline_for_stack -+static int check_extents(struct bch_fs *c) ++int bch2_check_extents(struct bch_fs *c) +{ + struct inode_walker w = inode_walker_init(); + struct snapshots_seen s; @@ -58458,8 +59178,6 @@ index 000000000..674018a58 + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + -+ bch_verbose(c, "checking extents"); -+ + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, + 
POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, @@ -58735,8 +59453,8 @@ index 000000000..674018a58 + goto err; + + if (fsck_err_on(ret, c, -+ "dirent points to missing subvolume %llu", -+ le64_to_cpu(d.v->d_child_subvol))) { ++ "dirent points to missing subvolume %u", ++ le32_to_cpu(d.v->d_child_subvol))) { + ret = __remove_dirent(trans, d.k->p); + goto err; + } @@ -58811,8 +59529,7 @@ index 000000000..674018a58 + * Walk dirents: verify that they all have a corresponding S_ISDIR inode, + * validate d_type + */ -+noinline_for_stack -+static int check_dirents(struct bch_fs *c) ++int bch2_check_dirents(struct bch_fs *c) +{ + struct inode_walker dir = inode_walker_init(); + struct inode_walker target = inode_walker_init(); @@ -58823,8 +59540,6 @@ index 000000000..674018a58 + struct bkey_s_c k; + int ret = 0; + -+ bch_verbose(c, "checking dirents"); -+ + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + @@ -58886,8 +59601,7 @@ index 000000000..674018a58 +/* + * Walk xattrs: verify that they all have a corresponding inode + */ -+noinline_for_stack -+static int check_xattrs(struct bch_fs *c) ++int bch2_check_xattrs(struct bch_fs *c) +{ + struct inode_walker inode = inode_walker_init(); + struct bch_hash_info hash_info; @@ -58896,8 +59610,6 @@ index 000000000..674018a58 + struct bkey_s_c k; + int ret = 0; + -+ bch_verbose(c, "checking xattrs"); -+ + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, @@ -58971,13 +59683,10 @@ index 000000000..674018a58 +} + +/* Get root directory, create if it doesn't exist: */ -+noinline_for_stack -+static int check_root(struct bch_fs *c) ++int bch2_check_root(struct bch_fs *c) +{ + int ret; + -+ bch_verbose(c, "checking root directory"); -+ + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, @@ -59128,11 +59837,10 @@ index 000000000..674018a58 + +/* + * Check for unreachable inodes, as well as loops in the directory structure: -+ * After check_dirents(), if an inode backpointer doesn't exist that means it's ++ * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's + * unreachable: + */ -+noinline_for_stack -+static int check_directory_structure(struct bch_fs *c) ++int bch2_check_directory_structure(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; @@ -59173,6 +59881,8 @@ index 000000000..674018a58 + return ret; +} + ++/* check_nlink pass: */ ++ +struct nlink_table { + size_t nr; + size_t size; @@ -59277,7 +59987,7 @@ index 000000000..674018a58 + * Backpointer and directory structure checks are sufficient for + * directories, since they can't have hardlinks: + */ -+ if (S_ISDIR(le16_to_cpu(u.bi_mode))) ++ if (S_ISDIR(u.bi_mode)) + continue; + + if (!u.bi_nlink) @@ -59363,7 +60073,7 @@ index 000000000..674018a58 + + BUG_ON(bch2_inode_unpack(k, &u)); + -+ if (S_ISDIR(le16_to_cpu(u.bi_mode))) ++ if (S_ISDIR(u.bi_mode)) + return 0; + + if (!u.bi_nlink) @@ -59415,15 +60125,12 @@ index 000000000..674018a58 + return 0; +} + -+noinline_for_stack -+static int check_nlinks(struct bch_fs *c) ++int bch2_check_nlinks(struct bch_fs *c) +{ + struct nlink_table links = { 0 }; + u64 this_iter_range_start, next_iter_range_start = 0; + int ret = 0; + -+ bch_verbose(c, "checking inode nlinks"); -+ + do { + this_iter_range_start = next_iter_range_start; + next_iter_range_start = U64_MAX; @@ -59481,8 +60188,7 @@ index 000000000..674018a58 + return bch2_trans_update(trans, iter, 
&u->k_i, BTREE_TRIGGER_NORUN); +} + -+noinline_for_stack -+static int fix_reflink_p(struct bch_fs *c) ++int bch2_fix_reflink_p(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; @@ -59491,8 +60197,6 @@ index 000000000..674018a58 + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) + return 0; + -+ bch_verbose(c, "fixing reflink_p keys"); -+ + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_extents, POS_MIN, @@ -59505,63 +60209,32 @@ index 000000000..674018a58 + bch_err_fn(c, ret); + return ret; +} -+ -+/* -+ * Checks for inconsistencies that shouldn't happen, unless we have a bug. -+ * Doesn't fix them yet, mainly because they haven't yet been observed: -+ */ -+int bch2_fsck_full(struct bch_fs *c) -+{ -+ int ret; -+again: -+ ret = bch2_fs_check_snapshot_trees(c); -+ bch2_fs_check_snapshots(c) ?: -+ bch2_fs_check_subvols(c) ?: -+ bch2_delete_dead_snapshots(c) ?: -+ check_inodes(c, true) ?: -+ check_extents(c) ?: -+ check_dirents(c) ?: -+ check_xattrs(c) ?: -+ check_root(c) ?: -+ check_directory_structure(c) ?: -+ check_nlinks(c) ?: -+ fix_reflink_p(c); -+ -+ if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { -+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); -+ goto again; -+ } -+ -+ return ret; -+} -+ -+int bch2_fsck_walk_inodes_only(struct bch_fs *c) -+{ -+ return bch2_fs_check_snapshots(c) ?: -+ bch2_fs_check_subvols(c) ?: -+ bch2_delete_dead_snapshots(c) ?: -+ check_inodes(c, false); -+} diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h new file mode 100644 -index 000000000..264f2706b +index 000000000..90c87b508 --- /dev/null +++ b/fs/bcachefs/fsck.h -@@ -0,0 +1,8 @@ +@@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FSCK_H +#define _BCACHEFS_FSCK_H + -+int bch2_fsck_full(struct bch_fs *); -+int bch2_fsck_walk_inodes_only(struct bch_fs *); ++int bch2_check_inodes(struct bch_fs *); ++int bch2_check_extents(struct bch_fs *); ++int bch2_check_dirents(struct bch_fs *); ++int bch2_check_xattrs(struct bch_fs *); ++int bch2_check_root(struct bch_fs *); ++int bch2_check_directory_structure(struct bch_fs *); ++int bch2_check_nlinks(struct bch_fs *); ++int bch2_fix_reflink_p(struct bch_fs *); + +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 -index 000000000..64e8d1f8a +index 000000000..fa435d865 --- /dev/null +++ b/fs/bcachefs/inode.c -@@ -0,0 +1,868 @@ +@@ -0,0 +1,872 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -59996,7 +60669,8 @@ index 000000000..64e8d1f8a +} + +int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + @@ -60010,7 +60684,8 @@ index 000000000..64e8d1f8a +} + +int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + @@ -60024,7 +60699,8 @@ index 000000000..64e8d1f8a +} + +int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + @@ -60081,7 +60757,8 @@ index 000000000..64e8d1f8a +} + +int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct 
printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + if (k.k->p.inode) { + prt_printf(err, "nonzero k.p.inode"); @@ -60432,10 +61109,10 @@ index 000000000..64e8d1f8a +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new file mode 100644 -index 000000000..0c3022d3f +index 000000000..8f9be5e58 --- /dev/null +++ b/fs/bcachefs/inode.h -@@ -0,0 +1,192 @@ +@@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H @@ -60443,11 +61120,15 @@ index 000000000..0c3022d3f +#include "bkey.h" +#include "opts.h" + ++enum bkey_invalid_flags; +extern const char * const bch2_inode_opts[]; + -+int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -+int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); -+int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); ++int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); ++int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); ++int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_inode ((struct bkey_ops) { \ @@ -60482,7 +61163,7 @@ index 000000000..0c3022d3f +} + +int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, -+ unsigned, struct printbuf *); ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ @@ -60630,7 +61311,7 @@ index 000000000..0c3022d3f +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 -index 000000000..f9c948b7b +index 000000000..8604df80a --- /dev/null +++ b/fs/bcachefs/io.c @@ -0,0 +1,3056 @@ @@ -61714,7 +62395,7 @@ index 000000000..f9c948b7b + /* Can we just write the entire extent as is? */ + if (op->crc.uncompressed_size == op->crc.live_size && + op->crc.compressed_size <= wp->sectors_free && -+ (op->crc.compression_type == op->compression_type || ++ (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || + op->incompressible)) { + if (!crc_is_compressed(op->crc) && + op->csum_type != op->crc.csum_type && @@ -61762,7 +62443,7 @@ index 000000000..f9c948b7b + /* + * If we want to compress the data, it has to be decrypted: + */ -+ if ((op->compression_type || ++ if ((op->compression_opt || + bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(op->csum_type)) && + bch2_write_decrypt(op)) @@ -61809,7 +62490,7 @@ index 000000000..f9c948b7b + } + + if (ec_buf || -+ op->compression_type || ++ op->compression_opt || + (op->csum_type && + !(op->flags & BCH_WRITE_PAGES_STABLE)) || + (bch2_csum_type_is_encryption(op->csum_type) && @@ -61832,16 +62513,16 @@ index 000000000..f9c948b7b + dst->bi_iter.bi_size < c->opts.encoded_extent_max) + break; + -+ BUG_ON(op->compression_type && ++ BUG_ON(op->compression_opt && + (op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_csum_type_is_encryption(op->crc.csum_type)); -+ BUG_ON(op->compression_type && !bounce); ++ BUG_ON(op->compression_opt && !bounce); + + crc.compression_type = op->incompressible + ? 
BCH_COMPRESSION_TYPE_incompressible -+ : op->compression_type ++ : op->compression_opt + ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, -+ op->compression_type) ++ op->compression_opt) + : 0; + if (!crc_is_compressed(crc)) { + dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); @@ -62281,7 +62962,7 @@ index 000000000..f9c948b7b + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); + + /* We can retry this: */ -+ ret = BCH_ERR_transaction_restart; ++ ret = -BCH_ERR_transaction_restart; + goto out; +} + @@ -62514,7 +63195,7 @@ index 000000000..f9c948b7b + op->end_io(op); +} + -+const char * const bch2_write_flags[] = { ++static const char * const bch2_write_flags[] = { +#define x(f) #f, + BCH_WRITE_FLAGS() +#undef x @@ -63692,7 +64373,7 @@ index 000000000..f9c948b7b +} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h new file mode 100644 -index 000000000..7a243a5f3 +index 000000000..1476380d5 --- /dev/null +++ b/fs/bcachefs/io.h @@ -0,0 +1,202 @@ @@ -63784,7 +64465,7 @@ index 000000000..7a243a5f3 + op->written = 0; + op->error = 0; + op->csum_type = bch2_data_checksum_type(c, opts); -+ op->compression_type = bch2_compression_opt_to_type[opts.compression]; ++ op->compression_opt = opts.compression; + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; + op->watermark = BCH_WATERMARK_normal; @@ -63900,7 +64581,7 @@ index 000000000..7a243a5f3 +#endif /* _BCACHEFS_IO_H */ diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h new file mode 100644 -index 000000000..0fbdfbf90 +index 000000000..737f16d78 --- /dev/null +++ b/fs/bcachefs/io_types.h @@ -0,0 +1,165 @@ @@ -64021,8 +64702,8 @@ index 000000000..0fbdfbf90 + u16 flags; + s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ + ++ unsigned compression_opt:8; + unsigned csum_type:4; -+ unsigned compression_type:4; + unsigned nr_replicas:4; + unsigned nr_replicas_required:4; + unsigned watermark:3; @@ -64071,10 +64752,10 @@ index 000000000..0fbdfbf90 +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 -index 000000000..f33ab45b7 +index 000000000..80a612c05 --- /dev/null +++ b/fs/bcachefs/journal.c -@@ -0,0 +1,1448 @@ +@@ -0,0 +1,1438 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs journalling code, for btree insertions @@ -64571,7 +65252,7 @@ index 000000000..f33ab45b7 + } + + return ret == JOURNAL_ERR_insufficient_devices -+ ? -EROFS ++ ? 
-BCH_ERR_erofs_journal_err + : -BCH_ERR_journal_res_get_blocked; +} + @@ -65297,12 +65978,8 @@ index 000000000..f33ab45b7 + +int bch2_fs_journal_init(struct journal *j) +{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); + static struct lock_class_key res_key; + unsigned i; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); + + spin_lock_init(&j->lock); + spin_lock_init(&j->err_lock); @@ -65319,24 +65996,18 @@ index 000000000..f33ab45b7 + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); + -+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) { -+ ret = -BCH_ERR_ENOMEM_journal_pin_fifo; -+ goto out; -+ } ++ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) ++ return -BCH_ERR_ENOMEM_journal_pin_fifo; + + for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); -+ if (!j->buf[i].data) { -+ ret = -BCH_ERR_ENOMEM_journal_buf; -+ goto out; -+ } ++ if (!j->buf[i].data) ++ return -BCH_ERR_ENOMEM_journal_buf; + } + + j->pin.front = j->pin.back = 1; -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; ++ return 0; +} + +/* debug: */ @@ -66057,10 +66728,10 @@ index 000000000..008a2e25a +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 -index 000000000..a084c6d0f +index 000000000..f861ae2f1 --- /dev/null +++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1858 @@ +@@ -0,0 +1,1863 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" @@ -66403,7 +67074,8 @@ index 000000000..a084c6d0f + int ret = journal_validate_key(c, jset, entry, + entry->level, + entry->btree_id, -+ k, version, big_endian, write|BKEY_INVALID_FROM_JOURNAL); ++ k, version, big_endian, ++ write|BKEY_INVALID_JOURNAL); + if (ret == FSCK_DELETED_KEY) + continue; + @@ -66809,9 +67481,11 @@ index 000000000..a084c6d0f + + version = le32_to_cpu(jset->version); + if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, -+ "%s sector %llu seq %llu: incompatible journal entry version %u", ++ "%s sector %llu seq %llu: incompatible journal entry version %u.%u", + ca ? ca->name : c->name, -+ sector, le64_to_cpu(jset->seq), version)) { ++ sector, le64_to_cpu(jset->seq), ++ BCH_VERSION_MAJOR(version), ++ BCH_VERSION_MINOR(version))) { + /* don't try to continue: */ + return -EINVAL; + } @@ -66856,9 +67530,11 @@ index 000000000..a084c6d0f + + version = le32_to_cpu(jset->version); + if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, -+ "%s sector %llu seq %llu: unknown journal entry version %u", ++ "%s sector %llu seq %llu: unknown journal entry version %u.%u", + ca ? 
ca->name : c->name, -+ sector, le64_to_cpu(jset->seq), version)) { ++ sector, le64_to_cpu(jset->seq), ++ BCH_VERSION_MAJOR(version), ++ BCH_VERSION_MINOR(version))) { + /* don't try to continue: */ + return -EINVAL; + } @@ -67991,10 +68667,10 @@ index 000000000..8801e9810 +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 -index 000000000..2c7f8aca9 +index 000000000..8de83e103 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,863 @@ +@@ -0,0 +1,873 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -68358,7 +69034,7 @@ index 000000000..2c7f8aca9 + spin_unlock(&j->lock); +} + -+enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) ++static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +{ + if (fn == bch2_btree_node_flush0 || + fn == bch2_btree_node_flush1) @@ -68834,8 +69510,18 @@ index 000000000..2c7f8aca9 + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); + -+ seq = 0; ++ /* ++ * Now that we've populated replicas_gc, write to the journal to mark ++ * active journal devices. This handles the case where the journal might ++ * be empty. Otherwise we could clear all journal replicas and ++ * temporarily put the fs into an unrecoverable state. Journal recovery ++ * expects to find devices marked for journal data on unclean mount. ++ */ ++ ret = bch2_journal_meta(&c->journal); ++ if (ret) ++ goto err; + ++ seq = 0; + spin_lock(&j->lock); + while (!ret) { + struct bch_replicas_padded replicas; @@ -68852,7 +69538,7 @@ index 000000000..2c7f8aca9 + spin_lock(&j->lock); + } + spin_unlock(&j->lock); -+ ++err: + ret = bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + @@ -68952,7 +69638,7 @@ index 000000000..0fd1af120 +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c new file mode 100644 -index 000000000..fcefbbe7e +index 000000000..cc41bff86 --- /dev/null +++ b/fs/bcachefs/journal_sb.c @@ -0,0 +1,219 @@ @@ -69159,16 +69845,16 @@ index 000000000..fcefbbe7e + + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + -+ j->d[dst].start = le64_to_cpu(buckets[0]); -+ j->d[dst].nr = le64_to_cpu(1); ++ j->d[dst].start = cpu_to_le64(buckets[0]); ++ j->d[dst].nr = cpu_to_le64(1); + + for (i = 1; i < nr; i++) { + if (buckets[i] == buckets[i - 1] + 1) { + le64_add_cpu(&j->d[dst].nr, 1); + } else { + dst++; -+ j->d[dst].start = le64_to_cpu(buckets[i]); -+ j->d[dst].nr = le64_to_cpu(1); ++ j->d[dst].start = cpu_to_le64(buckets[i]); ++ j->d[dst].nr = cpu_to_le64(1); + } + } + @@ -70074,7 +70760,7 @@ index 000000000..4b3ff7d8a +#endif /* _BCACHEFS_KEYLIST_TYPES_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c new file mode 100644 -index 000000000..d32425116 +index 000000000..07d192953 --- /dev/null +++ b/fs/bcachefs/lru.c @@ -0,0 +1,178 @@ @@ -70091,7 +70777,8 @@ index 000000000..d32425116 + +/* KEY_TYPE_lru is obsolete: */ +int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + if (!lru_pos_time(k.k->p)) { + prt_printf(err, "lru entry at time=0"); @@ -70141,8 +70828,7 @@ index 000000000..d32425116 + EBUG_ON(lru_pos_time(k->k.p) != time); + EBUG_ON(k->k.p.offset != dev_bucket); + -+ return bch2_trans_update_buffered(trans, BTREE_ID_lru, k, -+ key_type == KEY_TYPE_deleted); ++ return bch2_trans_update_buffered(trans, BTREE_ID_lru, k); +} + +int 
bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) @@ -70258,10 +70944,10 @@ index 000000000..d32425116 +} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h new file mode 100644 -index 000000000..adb984292 +index 000000000..7a3be20a8 --- /dev/null +++ b/fs/bcachefs/lru.h -@@ -0,0 +1,63 @@ +@@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LRU_H +#define _BCACHEFS_LRU_H @@ -70307,7 +70993,8 @@ index 000000000..adb984292 + return BCH_LRU_read; +} + -+int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); ++int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +void bch2_lru_pos_to_text(struct printbuf *, struct bpos); @@ -70528,10 +71215,10 @@ index 000000000..027efaa0d +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 -index 000000000..37fb3784a +index 000000000..052726739 --- /dev/null +++ b/fs/bcachefs/move.c -@@ -0,0 +1,1162 @@ +@@ -0,0 +1,1168 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -71166,7 +71853,7 @@ index 000000000..37fb3784a + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + + for (id = start_btree_id; -+ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); ++ id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); + id++) { + stats->btree_id = id; + @@ -71174,6 +71861,9 @@ index 000000000..37fb3784a + id != BTREE_ID_reflink) + continue; + ++ if (!bch2_btree_id_root(c, id)->b) ++ continue; ++ + ret = __bch2_move_data(&ctxt, + id == start_btree_id ? start_pos : POS_MIN, + id == end_btree_id ? end_pos : POS_MAX, @@ -71395,10 +72085,13 @@ index 000000000..37fb3784a + stats->data_type = BCH_DATA_btree; + + for (id = start_btree_id; -+ id <= min_t(unsigned, end_btree_id, BTREE_ID_NR - 1); ++ id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); + id++) { + stats->btree_id = id; + ++ if (!bch2_btree_id_root(c, id)->b) ++ continue; ++ + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); +retry: @@ -72495,10 +73188,10 @@ index 000000000..bd12bf677 + diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c new file mode 100644 -index 000000000..a05c38983 +index 000000000..9dcc61ee5 --- /dev/null +++ b/fs/bcachefs/opts.c -@@ -0,0 +1,550 @@ +@@ -0,0 +1,592 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -72506,6 +73199,7 @@ index 000000000..a05c38983 +#include "bcachefs.h" +#include "compress.h" +#include "disk_groups.h" ++#include "error.h" +#include "opts.h" +#include "super-io.h" +#include "util.h" @@ -72517,6 +73211,16 @@ index 000000000..a05c38983 + NULL +}; + ++const char * const bch2_fsck_fix_opts[] = { ++ BCH_FIX_ERRORS_OPTS() ++ NULL ++}; ++ ++const char * const bch2_version_upgrade_opts[] = { ++ BCH_VERSION_UPGRADE_OPTS() ++ NULL ++}; ++ +const char * const bch2_sb_features[] = { + BCH_SB_FEATURES() + NULL @@ -72585,6 +73289,37 @@ index 000000000..a05c38983 + +#undef x + ++int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, ++ struct printbuf *err) ++{ ++ if (!val) { ++ *res = FSCK_FIX_yes; ++ } else { ++ int ret = match_string(bch2_fsck_fix_opts, -1, val); ++ ++ if (ret < 0 && err) ++ prt_str(err, "fix_errors: invalid selection"); ++ if (ret < 0) ++ return ret; ++ *res = ret; ++ } ++ ++ return 0; ++} ++ ++void bch2_opt_fix_errors_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ 
struct bch_sb *sb, ++ u64 v) ++{ ++ prt_str(out, bch2_fsck_fix_opts[v]); ++} ++ ++static const struct bch_opt_fn bch2_opt_fix_errors = { ++ .parse = bch2_opt_fix_errors_parse, ++ .to_text = bch2_opt_fix_errors_to_text, ++}; ++ +const char * const bch2_d_types[BCH_DT_MAX] = { + [DT_UNKNOWN] = "unknown", + [DT_FIFO] = "fifo", @@ -72663,11 +73398,9 @@ index 000000000..a05c38983 +#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ + .min = _min, .max = _max +#define OPT_STR(_choices) .type = BCH_OPT_STR, \ -+ .min = 0, .max = ARRAY_SIZE(_choices),\ ++ .min = 0, .max = ARRAY_SIZE(_choices), \ + .choices = _choices -+#define OPT_FN(_fn) .type = BCH_OPT_FN, \ -+ .parse = _fn##_parse, \ -+ .to_text = _fn##_to_text ++#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn + +#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ + [Opt_##_name] = { \ @@ -72763,15 +73496,26 @@ index 000000000..a05c38983 + + switch (opt->type) { + case BCH_OPT_BOOL: -+ ret = kstrtou64(val, 10, res); ++ if (val) { ++ ret = kstrtou64(val, 10, res); ++ } else { ++ ret = 0; ++ *res = 1; ++ } ++ + if (ret < 0 || (*res != 0 && *res != 1)) { + if (err) -+ prt_printf(err, "%s: must be bool", -+ opt->attr.name); ++ prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; + } + break; + case BCH_OPT_UINT: ++ if (!val) { ++ prt_printf(err, "%s: required value", ++ opt->attr.name); ++ return -EINVAL; ++ } ++ + ret = opt->flags & OPT_HUMAN_READABLE + ? bch2_strtou64_h(val, res) + : kstrtou64(val, 10, res); @@ -72783,6 +73527,12 @@ index 000000000..a05c38983 + } + break; + case BCH_OPT_STR: ++ if (!val) { ++ prt_printf(err, "%s: required value", ++ opt->attr.name); ++ return -EINVAL; ++ } ++ + ret = match_string(opt->choices, -1, val); + if (ret < 0) { + if (err) @@ -72794,10 +73544,7 @@ index 000000000..a05c38983 + *res = ret; + break; + case BCH_OPT_FN: -+ if (!c) -+ return 0; -+ -+ ret = opt->parse(c, val, res); ++ ret = opt->fn.parse(c, val, res, err); + if (ret < 0) { + if (err) + prt_printf(err, "%s: parse error", @@ -72837,10 +73584,10 @@ index 000000000..a05c38983 + if (flags & OPT_SHOW_FULL_LIST) + prt_string_option(out, opt->choices, v); + else -+ prt_printf(out, "%s", opt->choices[v]); ++ prt_str(out, opt->choices[v]); + break; + case BCH_OPT_FN: -+ opt->to_text(out, c, sb, v); ++ opt->fn.to_text(out, c, sb, v); + break; + default: + BUG(); @@ -72901,31 +73648,19 @@ index 000000000..a05c38983 + name = strsep(&opt, "="); + val = opt; + -+ if (val) { -+ id = bch2_mount_opt_lookup(name); -+ if (id < 0) -+ goto bad_opt; ++ id = bch2_mount_opt_lookup(name); + -+ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); -+ if (ret < 0) -+ goto bad_val; -+ } else { -+ id = bch2_mount_opt_lookup(name); -+ v = 1; -+ -+ if (id < 0 && -+ !strncmp("no", name, 2)) { -+ id = bch2_mount_opt_lookup(name + 2); -+ v = 0; -+ } -+ -+ if (id < 0) -+ goto bad_opt; -+ -+ if (bch2_opt_table[id].type != BCH_OPT_BOOL) -+ goto no_val; ++ /* Check for the form "noopt", negation of a boolean opt: */ ++ if (id < 0 && ++ !val && ++ !strncmp("no", name, 2)) { ++ id = bch2_mount_opt_lookup(name + 2); ++ val = "0"; + } + ++ if (id < 0) ++ goto bad_opt; ++ + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) + goto bad_opt; + @@ -72938,6 +73673,10 @@ index 000000000..a05c38983 + !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) + goto bad_opt; + ++ ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); ++ if (ret < 0) ++ goto bad_val; ++ + bch2_opt_set_by_id(opts, id, v); + } + @@ -72952,10 +73691,6 @@ index 000000000..a05c38983 + 
pr_err("Invalid mount option %s", err.buf); + ret = -1; + goto out; -+no_val: -+ pr_err("Mount option %s requires a value", name); -+ ret = -1; -+ goto out; +out: + kfree(copied_opts_start); + printbuf_exit(&err); @@ -73051,10 +73786,10 @@ index 000000000..a05c38983 +} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h new file mode 100644 -index 000000000..e7cf7e92f +index 000000000..8a9db110d --- /dev/null +++ b/fs/bcachefs/opts.h -@@ -0,0 +1,542 @@ +@@ -0,0 +1,563 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_OPTS_H +#define _BCACHEFS_OPTS_H @@ -73065,7 +73800,11 @@ index 000000000..e7cf7e92f +#include +#include "bcachefs_format.h" + ++struct bch_fs; ++ +extern const char * const bch2_error_actions[]; ++extern const char * const bch2_fsck_fix_opts[]; ++extern const char * const bch2_version_upgrade_opts[]; +extern const char * const bch2_sb_features[]; +extern const char * const bch2_sb_compat[]; +extern const char * const bch2_btree_ids[]; @@ -73123,6 +73862,11 @@ index 000000000..e7cf7e92f + BCH_OPT_FN, +}; + ++struct bch_opt_fn { ++ int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); ++ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); ++}; ++ +/** + * x(name, shortopt, type, in mem type, mode, sb_opt) + * @@ -73154,6 +73898,18 @@ index 000000000..e7cf7e92f +#define BCACHEFS_VERBOSE_DEFAULT false +#endif + ++#define BCH_FIX_ERRORS_OPTS() \ ++ x(exit, 0) \ ++ x(yes, 1) \ ++ x(no, 2) \ ++ x(ask, 3) ++ ++enum fsck_err_opts { ++#define x(t, n) FSCK_FIX_##t, ++ BCH_FIX_ERRORS_OPTS() ++#undef x ++}; ++ +#define BCH_OPTS() \ + x(block_size, u16, \ + OPT_FS|OPT_FORMAT| \ @@ -73210,12 +73966,12 @@ index 000000000..e7cf7e92f + NULL, NULL) \ + x(compression, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_compression_opts), \ ++ OPT_FN(bch2_opt_compression), \ + BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(background_compression, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_compression_opts), \ ++ OPT_FN(bch2_opt_compression), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(str_hash, u8, \ @@ -73374,8 +74130,8 @@ index 000000000..e7cf7e92f + NULL, "Run fsck on mount") \ + x(fix_errors, u8, \ + OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ ++ OPT_FN(bch2_opt_fix_errors), \ ++ BCH2_NO_SB_OPT, FSCK_FIX_exit, \ + NULL, "Fix errors during fsck without asking") \ + x(ratelimit_errors, u8, \ + OPT_FS|OPT_MOUNT, \ @@ -73445,8 +74201,8 @@ index 000000000..e7cf7e92f + NULL, "Reconstruct alloc btree") \ + x(version_upgrade, u8, \ + OPT_FS|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH2_NO_SB_OPT, false, \ ++ OPT_STR(bch2_version_upgrade_opts), \ ++ BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ + NULL, "Set superblock to latest version,\n" \ + "allowing any new features to be used") \ + x(buckets_nouse, u8, \ @@ -73551,8 +74307,8 @@ index 000000000..e7cf7e92f + u64 min, max; + + const char * const *choices; -+ int (*parse)(struct bch_fs *, const char *, u64 *); -+ void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); ++ ++ struct bch_opt_fn fn; + + const char *hint; + const char *help; @@ -74310,10 +75066,10 @@ index 000000000..2191423d9 +#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 -index 000000000..7e1f1828a +index 000000000..4f0654ff8 --- /dev/null +++ b/fs/bcachefs/quota.c -@@ -0,0 +1,980 @@ +@@ -0,0 +1,981 @@ 
+// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" @@ -74376,7 +75132,8 @@ index 000000000..7e1f1828a +}; + +int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + if (k.k->p.inode >= QTYP_NR) { + prt_printf(err, "invalid quota type (%llu >= %u)", @@ -74796,13 +75553,13 @@ index 000000000..7e1f1828a + } + + if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) -+ mq->c[Q_SPC].timer = cpu_to_le64(qdq->d_spc_timer); ++ mq->c[Q_SPC].timer = qdq->d_spc_timer; + if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) -+ mq->c[Q_SPC].warns = cpu_to_le64(qdq->d_spc_warns); ++ mq->c[Q_SPC].warns = qdq->d_spc_warns; + if (qdq && qdq->d_fieldmask & QC_INO_TIMER) -+ mq->c[Q_INO].timer = cpu_to_le64(qdq->d_ino_timer); ++ mq->c[Q_INO].timer = qdq->d_ino_timer; + if (qdq && qdq->d_fieldmask & QC_INO_WARNS) -+ mq->c[Q_INO].warns = cpu_to_le64(qdq->d_ino_warns); ++ mq->c[Q_INO].warns = qdq->d_ino_warns; + + mutex_unlock(&q->lock); + } @@ -74877,7 +75634,7 @@ index 000000000..7e1f1828a + int ret; + + ret = bch2_snapshot_tree_lookup(trans, -+ snapshot_t(c, k.k->p.snapshot)->tree, &s_t); ++ bch2_snapshot_tree(c, k.k->p.snapshot), &s_t); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "%s: snapshot tree %u not found", __func__, + snapshot_t(c, k.k->p.snapshot)->tree); @@ -75296,10 +76053,10 @@ index 000000000..7e1f1828a +#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h new file mode 100644 -index 000000000..b0f7d4ee7 +index 000000000..2f463874a --- /dev/null +++ b/fs/bcachefs/quota.h -@@ -0,0 +1,72 @@ +@@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_H +#define _BCACHEFS_QUOTA_H @@ -75307,9 +76064,11 @@ index 000000000..b0f7d4ee7 +#include "inode.h" +#include "quota_types.h" + ++enum bkey_invalid_flags; +extern const struct bch_sb_field_ops bch_sb_field_ops_quota; + -+int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); ++int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_quota ((struct bkey_ops) { \ @@ -75423,10 +76182,10 @@ index 000000000..6a136083d +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 -index 000000000..989f37a3b +index 000000000..c3d577236 --- /dev/null +++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,363 @@ +@@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -75434,6 +76193,7 @@ index 000000000..989f37a3b +#include "btree_iter.h" +#include "buckets.h" +#include "clock.h" ++#include "compress.h" +#include "disk_groups.h" +#include "errcode.h" +#include "extents.h" @@ -75474,7 +76234,7 @@ index 000000000..989f37a3b + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (!p.ptr.cached && + p.crc.compression_type != -+ bch2_compression_opt_to_type[io_opts->background_compression]) ++ bch2_compression_opt_to_type(io_opts->background_compression)) + data_opts->rewrite_ptrs |= 1U << i; + i++; + } @@ -75858,10 +76618,10 @@ index 000000000..7462a92e9 +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 -index 000000000..b86442c7c +index 000000000..63b385d88 --- /dev/null +++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1653 @@ 
+@@ -0,0 +1,1669 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -76172,7 +76932,7 @@ index 000000000..b86442c7c + } +} + -+struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) ++static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +{ + struct journal_key *k = iter->keys->d + iter->idx; + @@ -76458,10 +77218,21 @@ index 000000000..b86442c7c + unsigned iter_flags = + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS; ++ unsigned update_flags = BTREE_TRIGGER_NORUN; + int ret; + ++ /* ++ * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to ++ * keep the key cache coherent with the underlying btree. Nothing ++ * besides the allocator is doing updates yet so we don't need key cache ++ * coherency for non-alloc btrees, and key cache fills for snapshots ++ * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until ++ * the snapshots recovery pass runs. ++ */ + if (!k->level && k->btree_id == BTREE_ID_alloc) + iter_flags |= BTREE_ITER_CACHED; ++ else ++ update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM; + + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, k->level, @@ -76474,7 +77245,7 @@ index 000000000..b86442c7c + if (k->overwritten) + goto out; + -+ ret = bch2_trans_update(trans, &iter, k->k, BTREE_TRIGGER_NORUN); ++ ret = bch2_trans_update(trans, &iter, k->k, update_flags); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; @@ -76488,11 +77259,13 @@ index 000000000..b86442c7c + return cmp_int(l->journal_seq, r->journal_seq); +} + -+static int bch2_journal_replay(struct bch_fs *c, u64 start_seq, u64 end_seq) ++static int bch2_journal_replay(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_key **keys_sorted, *k; + struct journal *j = &c->journal; ++ u64 start_seq = c->journal_replay_seq_start; ++ u64 end_seq = c->journal_replay_seq_start; + size_t i; + int ret; + @@ -76566,13 +77339,13 @@ index 000000000..b86442c7c + case BCH_JSET_ENTRY_btree_root: { + struct btree_root *r; + -+ if (entry->btree_id >= BTREE_ID_NR) { -+ bch_err(c, "filesystem has unknown btree type %u", -+ entry->btree_id); -+ return -EINVAL; ++ while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { ++ ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); ++ if (ret) ++ return ret; + } + -+ r = &c->btree_roots[entry->btree_id]; ++ r = bch2_btree_id_root(c, entry->btree_id); + + if (entry->u64s) { + r->level = entry->level; @@ -76844,8 +77617,8 @@ index 000000000..b86442c7c + unsigned i; + int ret = 0; + -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct btree_root *r = &c->btree_roots[i]; ++ for (i = 0; i < btree_id_nr_alive(c); i++) { ++ struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->alive) + continue; @@ -76878,7 +77651,7 @@ index 000000000..b86442c7c + } + + for (i = 0; i < BTREE_ID_NR; i++) { -+ struct btree_root *r = &c->btree_roots[i]; ++ struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->b) { + r->alive = false; @@ -76890,7 +77663,7 @@ index 000000000..b86442c7c + return ret; +} + -+static int bch2_fs_initialize_subvolumes(struct bch_fs *c) ++static int bch2_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot_tree root_tree; + struct bkey_i_snapshot root_snapshot; @@ -76906,7 +77679,7 @@ index 000000000..b86442c7c + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; -+ root_snapshot.v.subvol = BCACHEFS_ROOT_SUBVOL; ++ root_snapshot.v.subvol = 
cpu_to_le32(BCACHEFS_ROOT_SUBVOL); + root_snapshot.v.tree = cpu_to_le32(1); + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + @@ -76971,6 +77744,165 @@ index 000000000..b86442c7c + return ret; +} + ++static void check_version_upgrade(struct bch_fs *c) ++{ ++ unsigned latest_compatible = bch2_version_compatible(c->sb.version); ++ unsigned latest_version = bcachefs_metadata_version_current; ++ unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; ++ unsigned new_version = 0; ++ u64 recovery_passes; ++ ++ if (old_version < bcachefs_metadata_required_upgrade_below) { ++ if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || ++ latest_compatible < bcachefs_metadata_required_upgrade_below) ++ new_version = latest_version; ++ else ++ new_version = latest_compatible; ++ } else { ++ switch (c->opts.version_upgrade) { ++ case BCH_VERSION_UPGRADE_compatible: ++ new_version = latest_compatible; ++ break; ++ case BCH_VERSION_UPGRADE_incompatible: ++ new_version = latest_version; ++ break; ++ case BCH_VERSION_UPGRADE_none: ++ new_version = old_version; ++ break; ++ } ++ } ++ ++ if (new_version > old_version) { ++ struct printbuf buf = PRINTBUF; ++ ++ if (old_version < bcachefs_metadata_required_upgrade_below) ++ prt_str(&buf, "Version upgrade required:\n"); ++ ++ if (old_version != c->sb.version) { ++ prt_str(&buf, "Version upgrade from "); ++ bch2_version_to_text(&buf, c->sb.version_upgrade_complete); ++ prt_str(&buf, " to "); ++ bch2_version_to_text(&buf, c->sb.version); ++ prt_str(&buf, " incomplete\n"); ++ } ++ ++ prt_printf(&buf, "Doing %s version upgrade from ", ++ BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) ++ ? "incompatible" : "compatible"); ++ bch2_version_to_text(&buf, old_version); ++ prt_str(&buf, " to "); ++ bch2_version_to_text(&buf, new_version); ++ prt_newline(&buf); ++ ++ recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); ++ if (recovery_passes) { ++ prt_str(&buf, "fsck required"); ++ ++ c->recovery_passes_explicit |= recovery_passes; ++ c->opts.fix_errors = FSCK_FIX_yes; ++ } ++ ++ bch_info(c, "%s", buf.buf); ++ ++ mutex_lock(&c->sb_lock); ++ bch2_sb_upgrade(c, new_version); ++ mutex_unlock(&c->sb_lock); ++ ++ printbuf_exit(&buf); ++ } ++} ++ ++static int bch2_check_allocations(struct bch_fs *c) ++{ ++ return bch2_gc(c, true, c->opts.norecovery); ++} ++ ++static int bch2_set_may_go_rw(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_MAY_GO_RW, &c->flags); ++ return 0; ++} ++ ++struct recovery_pass_fn { ++ int (*fn)(struct bch_fs *); ++ const char *name; ++ unsigned when; ++}; ++ ++static struct recovery_pass_fn recovery_passes[] = { ++#define x(_fn, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when }, ++ BCH_RECOVERY_PASSES() ++#undef x ++}; ++ ++u64 bch2_fsck_recovery_passes(void) ++{ ++ u64 ret = 0; ++ ++ for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++) ++ if (recovery_passes[i].when & PASS_FSCK) ++ ret |= BIT_ULL(i); ++ return ret; ++} ++ ++static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) ++{ ++ struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass; ++ ++ if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) ++ return false; ++ if (c->recovery_passes_explicit & BIT_ULL(pass)) ++ return true; ++ if ((p->when & PASS_FSCK) && c->opts.fsck) ++ return true; ++ if ((p->when & PASS_UNCLEAN) && !c->sb.clean) ++ return true; ++ if (p->when & PASS_ALWAYS) ++ return true; ++ return false; ++} ++ ++static int bch2_run_recovery_pass(struct bch_fs 
*c, enum bch_recovery_pass pass) ++{ ++ int ret; ++ ++ c->curr_recovery_pass = pass; ++ ++ if (should_run_recovery_pass(c, pass)) { ++ struct recovery_pass_fn *p = recovery_passes + pass; ++ ++ if (!(p->when & PASS_SILENT)) ++ printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name); ++ ret = p->fn(c); ++ if (ret) ++ return ret; ++ if (!(p->when & PASS_SILENT)) ++ printk(KERN_CONT " done\n"); ++ } ++ ++ return 0; ++} ++ ++static int bch2_run_recovery_passes(struct bch_fs *c) ++{ ++ int ret = 0; ++again: ++ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) { ++ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); ++ if (ret) ++ break; ++ c->curr_recovery_pass++; ++ } ++ ++ if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ c->curr_recovery_pass = BCH_RECOVERY_PASS_delete_dead_snapshots; ++ goto again; ++ } ++ ++ return ret; ++} ++ +int bch2_fs_recovery(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean = NULL; @@ -77010,23 +77942,8 @@ index 000000000..b86442c7c + goto err; + } + -+ if (!c->opts.nochanges && -+ c->sb.version < bcachefs_metadata_required_upgrade_below) { -+ struct printbuf buf = PRINTBUF; -+ -+ prt_str(&buf, "version "); -+ bch2_version_to_text(&buf, c->sb.version); -+ prt_str(&buf, " prior to "); -+ bch2_version_to_text(&buf, bcachefs_metadata_required_upgrade_below); -+ prt_str(&buf, ", upgrade and fsck required"); -+ -+ bch_info(c, "%s", buf.buf); -+ printbuf_exit(&buf); -+ -+ c->opts.version_upgrade = true; -+ c->opts.fsck = true; -+ c->opts.fix_errors = FSCK_OPT_YES; -+ } ++ if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) ++ check_version_upgrade(c); + + if (c->opts.fsck && c->opts.norecovery) { + bch_err(c, "cannot select both norecovery and fsck"); @@ -77105,6 +78022,9 @@ index 000000000..b86442c7c + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + ++ c->journal_replay_seq_start = last_seq; ++ c->journal_replay_seq_end = blacklist_seq - 1;; ++ + if (c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + drop_alloc_keys(&c->journal_keys); @@ -77157,146 +78077,9 @@ index 000000000..b86442c7c + if (ret) + goto err; + -+ bch_verbose(c, "starting alloc read"); -+ down_read(&c->gc_lock); -+ ret = c->sb.version < bcachefs_metadata_version_bucket_gens -+ ? 
bch2_alloc_read(c) -+ : bch2_bucket_gens_read(c); -+ up_read(&c->gc_lock); ++ ret = bch2_run_recovery_passes(c); + if (ret) + goto err; -+ bch_verbose(c, "alloc read done"); -+ -+ bch_verbose(c, "starting stripes_read"); -+ ret = bch2_stripes_read(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "stripes_read done"); -+ -+ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { -+ ret = bch2_fs_initialize_subvolumes(c); -+ if (ret) -+ goto err; -+ } -+ -+ bch_verbose(c, "reading snapshots table"); -+ ret = bch2_fs_snapshots_start(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "reading snapshots done"); -+ -+ if (c->opts.fsck) { -+ bool metadata_only = c->opts.norecovery; -+ -+ bch_info(c, "checking allocations"); -+ ret = bch2_gc(c, true, metadata_only); -+ if (ret) -+ goto err; -+ bch_verbose(c, "done checking allocations"); -+ -+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -+ -+ set_bit(BCH_FS_MAY_GO_RW, &c->flags); -+ -+ bch_info(c, "starting journal replay, %zu keys", c->journal_keys.nr); -+ ret = bch2_journal_replay(c, last_seq, blacklist_seq - 1); -+ if (ret) -+ goto err; -+ if (c->opts.verbose || !c->sb.clean) -+ bch_info(c, "journal replay done"); -+ -+ bch_info(c, "checking need_discard and freespace btrees"); -+ ret = bch2_check_alloc_info(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "done checking need_discard and freespace btrees"); -+ -+ set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags); -+ -+ bch_info(c, "checking lrus"); -+ ret = bch2_check_lrus(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "done checking lrus"); -+ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); -+ -+ bch_info(c, "checking backpointers to alloc keys"); -+ ret = bch2_check_btree_backpointers(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "done checking backpointers to alloc keys"); -+ -+ bch_info(c, "checking backpointers to extents"); -+ ret = bch2_check_backpointers_to_extents(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "done checking backpointers to extents"); -+ -+ bch_info(c, "checking extents to backpointers"); -+ ret = bch2_check_extents_to_backpointers(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "done checking extents to backpointers"); -+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); -+ -+ bch_info(c, "checking alloc to lru refs"); -+ ret = bch2_check_alloc_to_lru_refs(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "done checking alloc to lru refs"); -+ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); -+ } else { -+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -+ set_bit(BCH_FS_CHECK_ALLOC_DONE, &c->flags); -+ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); -+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); -+ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); -+ set_bit(BCH_FS_FSCK_DONE, &c->flags); -+ -+ if (c->opts.norecovery) -+ goto out; -+ -+ set_bit(BCH_FS_MAY_GO_RW, &c->flags); -+ -+ bch_verbose(c, "starting journal replay, %zu keys", c->journal_keys.nr); -+ ret = bch2_journal_replay(c, last_seq, blacklist_seq - 1); -+ if (ret) -+ goto err; -+ if (c->opts.verbose || !c->sb.clean) -+ bch_info(c, "journal replay done"); -+ } -+ -+ ret = bch2_fs_freespace_init(c); -+ if (ret) -+ goto err; -+ -+ if (c->sb.version < bcachefs_metadata_version_bucket_gens && -+ c->opts.version_upgrade) { -+ bch_info(c, "initializing bucket_gens"); -+ ret = bch2_bucket_gens_init(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "bucket_gens init done"); -+ } -+ -+ if (c->sb.version < bcachefs_metadata_version_snapshot_2) { -+ ret = bch2_fs_upgrade_for_subvolumes(c); -+ if (ret) -+ goto err; -+ } -+ 
-+ if (c->opts.fsck) { -+ ret = bch2_fsck_full(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "fsck done"); -+ } else if (!c->sb.clean) { -+ bch_verbose(c, "checking for deleted inodes"); -+ ret = bch2_fsck_walk_inodes_only(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "check inodes done"); -+ } + + if (enabled_qtypes(c)) { + bch_verbose(c, "reading quotas"); @@ -77307,9 +78090,8 @@ index 000000000..b86442c7c + } + + mutex_lock(&c->sb_lock); -+ if (c->opts.version_upgrade) { -+ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); -+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); ++ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != c->sb.version) { ++ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, c->sb.version); + write_sb = true; + } + @@ -77332,7 +78114,7 @@ index 000000000..b86442c7c + + if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) || + !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) || -+ le16_to_cpu(c->sb.version_min) < bcachefs_metadata_version_btree_ptr_sectors_written) { ++ c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) { + struct bch_move_stats stats; + + bch2_move_stats_init(&stats, "recovery"); @@ -77368,8 +78150,6 @@ index 000000000..b86442c7c + + if (ret) + bch_err_fn(c, ret); -+ else -+ bch_verbose(c, "ret %s", bch2_err_str(ret)); + return ret; +err: +fsck_err: @@ -77392,20 +78172,16 @@ index 000000000..b86442c7c + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + -+ if (c->sb.version < bcachefs_metadata_version_inode_v3) -+ c->opts.version_upgrade = true; ++ bch2_sb_maybe_downgrade(c); + -+ if (c->opts.version_upgrade) { -+ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); -+ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); ++ if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { ++ bch2_sb_upgrade(c, bcachefs_metadata_version_current); ++ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + -+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -+ set_bit(BCH_FS_CHECK_LRUS_DONE, &c->flags); -+ set_bit(BCH_FS_CHECK_BACKPOINTERS_DONE, &c->flags); -+ set_bit(BCH_FS_CHECK_ALLOC_TO_LRU_REFS_DONE, &c->flags); ++ c->curr_recovery_pass = ARRAY_SIZE(recovery_passes); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + @@ -77453,12 +78229,12 @@ index 000000000..b86442c7c + if (ret) + goto err; + -+ ret = bch2_fs_initialize_subvolumes(c); ++ ret = bch2_initialize_subvolumes(c); + if (ret) + goto err; + + bch_verbose(c, "reading snapshots table"); -+ ret = bch2_fs_snapshots_start(c); ++ ret = bch2_snapshots_read(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); @@ -77517,10 +78293,10 @@ index 000000000..b86442c7c +} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h new file mode 100644 -index 000000000..8c0348e8b +index 000000000..f8e796c0f --- /dev/null +++ b/fs/bcachefs/recovery.h -@@ -0,0 +1,58 @@ +@@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_H +#define _BCACHEFS_RECOVERY_H @@ -77575,16 +78351,18 @@ index 000000000..8c0348e8b +void bch2_journal_keys_free(struct journal_keys *); +void bch2_journal_entries_free(struct bch_fs *); + ++u64 bch2_fsck_recovery_passes(void); ++ +int bch2_fs_recovery(struct bch_fs *); +int 
bch2_fs_initialize(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 -index 000000000..537d84b61 +index 000000000..39f711d50 --- /dev/null +++ b/fs/bcachefs/reflink.c -@@ -0,0 +1,388 @@ +@@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" @@ -77613,7 +78391,8 @@ index 000000000..537d84b61 +/* reflink pointers */ + +int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + @@ -77659,7 +78438,8 @@ index 000000000..537d84b61 +/* indirect extents */ + +int bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + return bch2_bkey_ptrs_invalid(c, k, flags, err); +} @@ -77682,22 +78462,21 @@ index 000000000..537d84b61 + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} + -+static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags) -+{ -+ if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) { -+ new->k.type = KEY_TYPE_deleted; -+ new->k.size = 0; -+ set_bkey_val_u64s(&new->k, 0);; -+ *flags &= ~BTREE_TRIGGER_INSERT; -+ } -+} -+ +int bch2_trans_mark_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ -+ check_indirect_extent_deleting(new, &flags); ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ return 0; ++ } ++ } + + return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); +} @@ -77705,13 +78484,14 @@ index 000000000..537d84b61 +/* indirect inline data */ + +int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + return 0; +} + +void bch2_indirect_inline_data_to_text(struct printbuf *out, -+ struct bch_fs *c, struct bkey_s_c k) ++ struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); @@ -77726,7 +78506,16 @@ index 000000000..537d84b61 + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ -+ check_indirect_extent_deleting(new, &flags); ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ struct bkey_i_indirect_inline_data *r = ++ bkey_i_to_indirect_inline_data(new); ++ ++ if (!r->v.refcount) { ++ r->k.type = KEY_TYPE_deleted; ++ r->k.size = 0; ++ set_bkey_val_u64s(&r->k, 0); ++ } ++ } + + return 0; +} @@ -77975,16 +78764,18 @@ index 000000000..537d84b61 +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 -index 000000000..ba400188f +index 000000000..fe52538ef --- /dev/null +++ b/fs/bcachefs/reflink.h -@@ -0,0 +1,79 @@ +@@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_H +#define _BCACHEFS_REFLINK_H + ++enum bkey_invalid_flags; ++ +int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, -+ unsigned, struct printbuf *); ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, + 
struct bkey_s_c); +bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); @@ -77999,7 +78790,7 @@ index 000000000..ba400188f +}) + +int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, -+ unsigned, struct printbuf *); ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, @@ -78015,7 +78806,7 @@ index 000000000..ba400188f +}) + +int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, -+ unsigned, struct printbuf *); ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); +int bch2_trans_mark_indirect_inline_data(struct btree_trans *, @@ -78060,10 +78851,10 @@ index 000000000..ba400188f +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c new file mode 100644 -index 000000000..76efbfce7 +index 000000000..5b591c59b --- /dev/null +++ b/fs/bcachefs/replicas.c -@@ -0,0 +1,1056 @@ +@@ -0,0 +1,1059 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -78102,8 +78893,8 @@ index 000000000..76efbfce7 + eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); +} + -+void bch2_replicas_entry_v0_to_text(struct printbuf *out, -+ struct bch_replicas_entry_v0 *e) ++static void bch2_replicas_entry_v0_to_text(struct printbuf *out, ++ struct bch_replicas_entry_v0 *e) +{ + unsigned i; + @@ -78338,7 +79129,7 @@ index 000000000..76efbfce7 +{ + unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; + struct bch_fs_usage *dst, *src = (void *) -+ bch2_acc_percpu_u64s((void *) src_p, src_nr); ++ bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr); + + preempt_disable(); + dst = this_cpu_ptr(dst_p); @@ -78528,6 +79319,9 @@ index 000000000..76efbfce7 +{ + lockdep_assert_held(&c->replicas_gc_lock); + ++ if (ret) ++ goto err; ++ + mutex_lock(&c->sb_lock); + percpu_down_write(&c->mark_lock); + @@ -79954,10 +80748,10 @@ index 000000000..ae21a8cca +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 -index 000000000..89c7c83c5 +index 000000000..7e6b416d3 --- /dev/null +++ b/fs/bcachefs/subvolume.c -@@ -0,0 +1,1505 @@ +@@ -0,0 +1,1734 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" @@ -79968,8 +80762,107 @@ index 000000000..89c7c83c5 +#include "fs.h" +#include "subvolume.h" + ++#include ++ +static int bch2_subvolume_delete(struct btree_trans *, u32); + ++static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) ++{ ++ const struct snapshot_t *s = __snapshot_t(t, id); ++ ++ if (s->skip[2] <= ancestor) ++ return s->skip[2]; ++ if (s->skip[1] <= ancestor) ++ return s->skip[1]; ++ if (s->skip[0] <= ancestor) ++ return s->skip[0]; ++ return s->parent; ++} ++ ++bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) ++{ ++ struct snapshot_table *t; ++ bool ret; ++ ++ EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); ++ ++ rcu_read_lock(); ++ t = rcu_dereference(c->snapshots); ++ ++ while (id && id < ancestor - IS_ANCESTOR_BITMAP) ++ id = get_ancestor_below(t, id, ancestor); ++ ++ ret = id && id < ancestor ++ ? 
test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor) ++ : id == ancestor; ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) ++{ ++ struct snapshot_table *t; ++ ++ rcu_read_lock(); ++ t = rcu_dereference(c->snapshots); ++ ++ while (id && id < ancestor) ++ id = __snapshot_t(t, id)->parent; ++ rcu_read_unlock(); ++ ++ return id == ancestor; ++} ++ ++static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) ++{ ++ u32 depth; ++ ++ rcu_read_lock(); ++ depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; ++ rcu_read_unlock(); ++ ++ return depth; ++} ++ ++static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) ++{ ++ size_t idx = U32_MAX - id; ++ size_t new_size; ++ struct snapshot_table *new, *old; ++ ++ new_size = max(16UL, roundup_pow_of_two(idx + 1)); ++ ++ new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); ++ if (!new) ++ return NULL; ++ ++ old = c->snapshots; ++ if (old) ++ memcpy(new->s, ++ rcu_dereference_protected(c->snapshots, true)->s, ++ sizeof(new->s[0]) * c->snapshot_table_size); ++ ++ rcu_assign_pointer(c->snapshots, new); ++ c->snapshot_table_size = new_size; ++ if (old) ++ kvfree_rcu(old); ++ ++ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; ++} ++ ++static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) ++{ ++ size_t idx = U32_MAX - id; ++ ++ lockdep_assert_held(&c->snapshot_table_lock); ++ ++ if (likely(idx < c->snapshot_table_size)) ++ return &rcu_dereference_protected(c->snapshots, true)->s[idx]; ++ ++ return __snapshot_t_mut(c, id); ++} ++ +/* Snapshot tree: */ + +void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, @@ -79983,7 +80876,8 @@ index 000000000..89c7c83c5 +} + +int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + if (bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1))) { @@ -80057,7 +80951,8 @@ index 000000000..89c7c83c5 +} + +int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + struct bkey_s_c_snapshot s; + u32 i, id; @@ -80098,6 +80993,25 @@ index 000000000..89c7c83c5 + } + } + ++ if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { ++ if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || ++ le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { ++ prt_printf(err, "skiplist not normalized"); ++ return -BCH_ERR_invalid_bkey; ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { ++ id = le32_to_cpu(s.v->skip[i]); ++ ++ if (!id != !s.v->parent || ++ (s.v->parent && ++ id <= k.k->p.offset)) { ++ prt_printf(err, "bad skiplist node %u)", id); ++ return -BCH_ERR_invalid_bkey; ++ } ++ } ++ } ++ + return 0; +} + @@ -80108,30 +81022,43 @@ index 000000000..89c7c83c5 +{ + struct bch_fs *c = trans->c; + struct snapshot_t *t; ++ u32 id = new.k->p.offset; ++ int ret = 0; + -+ t = genradix_ptr_alloc(&c->snapshots, -+ U32_MAX - new.k->p.offset, -+ GFP_KERNEL); -+ if (!t) -+ return -BCH_ERR_ENOMEM_mark_snapshot; ++ mutex_lock(&c->snapshot_table_lock); ++ ++ t = snapshot_t_mut(c, id); ++ if (!t) { ++ ret = -BCH_ERR_ENOMEM_mark_snapshot; ++ goto err; ++ } + + if (new.k->type == KEY_TYPE_snapshot) { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); ++ u32 parent = id; + + t->parent = le32_to_cpu(s.v->parent); ++ 
t->skip[0] = le32_to_cpu(s.v->skip[0]); ++ t->skip[1] = le32_to_cpu(s.v->skip[1]); ++ t->skip[2] = le32_to_cpu(s.v->skip[2]); ++ t->depth = le32_to_cpu(s.v->depth); + t->children[0] = le32_to_cpu(s.v->children[0]); + t->children[1] = le32_to_cpu(s.v->children[1]); + t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0; + t->tree = le32_to_cpu(s.v->tree); -+ } else { -+ t->parent = 0; -+ t->children[0] = 0; -+ t->children[1] = 0; -+ t->subvol = 0; -+ t->tree = 0; -+ } + -+ return 0; ++ while ((parent = bch2_snapshot_parent_early(c, parent)) && ++ parent - id - 1 < IS_ANCESTOR_BITMAP) ++ __set_bit(parent - id - 1, t->is_ancestor); ++ ++ if (BCH_SNAPSHOT_DELETED(s.v)) ++ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ } else { ++ memset(t, 0, sizeof(*t)); ++ } ++err: ++ mutex_unlock(&c->snapshot_table_lock); ++ return ret; +} + +static int snapshot_lookup(struct btree_trans *trans, u32 id, @@ -80184,9 +81111,14 @@ index 000000000..89c7c83c5 + nr_live += ret; + } + -+ snapshot_t(c, id)->equiv = nr_live == 1 -+ ? snapshot_t(c, child[live_idx])->equiv ++ mutex_lock(&c->snapshot_table_lock); ++ ++ snapshot_t_mut(c, id)->equiv = nr_live == 1 ++ ? snapshot_t_mut(c, child[live_idx])->equiv + : id; ++ ++ mutex_unlock(&c->snapshot_table_lock); ++ + return 0; +} + @@ -80328,9 +81260,9 @@ index 000000000..89c7c83c5 + "snapshot tree points to missing subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || -+ fsck_err_on(!bch2_snapshot_is_ancestor(c, -+ le32_to_cpu(subvol.snapshot), -+ root_id), c, ++ fsck_err_on(!bch2_snapshot_is_ancestor_early(c, ++ le32_to_cpu(subvol.snapshot), ++ root_id), c, + "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || @@ -80366,7 +81298,7 @@ index 000000000..89c7c83c5 + * And, make sure it points to a subvolume within that snapshot tree, or correct + * it to point to the oldest subvolume within that snapshot tree. 
+ */ -+int bch2_fs_check_snapshot_trees(struct bch_fs *c) ++int bch2_check_snapshot_trees(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; @@ -80399,7 +81331,49 @@ index 000000000..89c7c83c5 + if (ret) + return ret; + -+ return bch2_snapshot_is_ancestor(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); ++ return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); ++} ++ ++static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *s; ++ ++ if (!id) ++ return 0; ++ ++ rcu_read_lock(); ++ s = snapshot_t(c, id); ++ if (s->parent) ++ id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) ++{ ++ struct bch_snapshot a; ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < 3; i++) { ++ if (!s.parent != !s.skip[i]) ++ return false; ++ ++ if (!s.parent) ++ continue; ++ ++ ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); ++ if (bch2_err_matches(ret, ENOENT)) ++ return false; ++ if (ret) ++ return ret; ++ ++ if (a.tree != s.tree) ++ return false; ++ } ++ ++ return true; +} + +/* @@ -80409,14 +81383,15 @@ index 000000000..89c7c83c5 + */ +static int snapshot_tree_ptr_repair(struct btree_trans *trans, + struct btree_iter *iter, -+ struct bkey_s_c_snapshot *s) ++ struct bkey_s_c k, ++ struct bch_snapshot *s) +{ + struct bch_fs *c = trans->c; + struct btree_iter root_iter; + struct bch_snapshot_tree s_t; + struct bkey_s_c_snapshot root; + struct bkey_i_snapshot *u; -+ u32 root_id = bch2_snapshot_root(c, s->k->p.offset), tree_id; ++ u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; + int ret; + + root = bch2_bkey_get_iter_typed(trans, &root_iter, @@ -80442,18 +81417,18 @@ index 000000000..89c7c83c5 + goto err; + + u->v.tree = cpu_to_le32(tree_id); -+ if (s->k->p.snapshot == root_id) -+ *s = snapshot_i_to_s_c(u); ++ if (k.k->p.offset == root_id) ++ *s = u->v; + } + -+ if (s->k->p.snapshot != root_id) { -+ u = bch2_bkey_make_mut_typed(trans, iter, &s->s_c, 0, snapshot); ++ if (k.k->p.offset != root_id) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); -+ *s = snapshot_i_to_s_c(u); ++ *s = u->v; + } +err: + bch2_trans_iter_exit(trans, &root_iter); @@ -80465,9 +81440,12 @@ index 000000000..89c7c83c5 + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; -+ struct bkey_s_c_snapshot s; ++ struct bch_snapshot s; + struct bch_subvolume subvol; + struct bch_snapshot v; ++ struct bkey_i_snapshot *u; ++ u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); ++ u32 real_depth; + struct printbuf buf = PRINTBUF; + bool should_have_subvol; + u32 i, id; @@ -80476,109 +81454,142 @@ index 000000000..89c7c83c5 + if (k.k->type != KEY_TYPE_snapshot) + return 0; + -+ s = bkey_s_c_to_snapshot(k); -+ id = le32_to_cpu(s.v->parent); ++ memset(&s, 0, sizeof(s)); ++ memcpy(&s, k.v, bkey_val_bytes(k.k)); ++ ++ id = le32_to_cpu(s.parent); + if (id) { + ret = snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot with nonexistent parent:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + -+ if (le32_to_cpu(v.children[0]) != s.k->p.offset && -+ le32_to_cpu(v.children[1]) != s.k->p.offset) { ++ if (le32_to_cpu(v.children[0]) != k.k->p.offset && ++ 
le32_to_cpu(v.children[1]) != k.k->p.offset) { + bch_err(c, "snapshot parent %u missing pointer to child %llu", -+ id, s.k->p.offset); ++ id, k.k->p.offset); + ret = -EINVAL; + goto err; + } + } + -+ for (i = 0; i < 2 && s.v->children[i]; i++) { -+ id = le32_to_cpu(s.v->children[i]); ++ for (i = 0; i < 2 && s.children[i]; i++) { ++ id = le32_to_cpu(s.children[i]); + + ret = snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot node %llu has nonexistent child %u", -+ s.k->p.offset, id); ++ k.k->p.offset, id); + if (ret) + goto err; + -+ if (le32_to_cpu(v.parent) != s.k->p.offset) { ++ if (le32_to_cpu(v.parent) != k.k->p.offset) { + bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", -+ id, le32_to_cpu(v.parent), s.k->p.offset); ++ id, le32_to_cpu(v.parent), k.k->p.offset); + ret = -EINVAL; + goto err; + } + } + -+ should_have_subvol = BCH_SNAPSHOT_SUBVOL(s.v) && -+ !BCH_SNAPSHOT_DELETED(s.v); ++ should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && ++ !BCH_SNAPSHOT_DELETED(&s); + + if (should_have_subvol) { -+ id = le32_to_cpu(s.v->subvol); ++ id = le32_to_cpu(s.subvol); + ret = bch2_subvolume_get(trans, id, 0, false, &subvol); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot points to nonexistent subvolume:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf)); ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + -+ if (BCH_SNAPSHOT_SUBVOL(s.v) != (le32_to_cpu(subvol.snapshot) == s.k->p.offset)) { ++ if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { + bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", -+ s.k->p.offset); ++ k.k->p.offset); + ret = -EINVAL; + goto err; + } + } else { -+ if (fsck_err_on(s.v->subvol, c, "snapshot should not point to subvol:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { -+ struct bkey_i_snapshot *u = bch2_trans_kmalloc(trans, sizeof(*u)); -+ ++ if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + -+ bkey_reassemble(&u->k_i, s.s_c); + u->v.subvol = 0; -+ ret = bch2_trans_update(trans, iter, &u->k_i, 0); -+ if (ret) -+ goto err; -+ -+ s = snapshot_i_to_s_c(u); ++ s = u->v; + } + } + -+ ret = snapshot_tree_ptr_good(trans, s.k->p.offset, le32_to_cpu(s.v->tree)); ++ ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", -+ (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { -+ ret = snapshot_tree_ptr_repair(trans, iter, &s); ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ ret = snapshot_tree_ptr_repair(trans, iter, k, &s); + if (ret) + goto err; + } + ret = 0; + -+ if (BCH_SNAPSHOT_DELETED(s.v)) -+ set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); ++ real_depth = bch2_snapshot_depth(c, parent_id); ++ ++ if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, c, ++ "snapshot with incorrect depth fields, should be %u:\n %s", ++ real_depth, ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ u->v.depth = cpu_to_le32(real_depth); ++ s = u->v; ++ } ++ ++ ret = snapshot_skiplist_good(trans, s); ++ if (ret < 0) ++ goto err; ++ ++ if (!ret && ++ (c->sb.version_upgrade_complete < 
bcachefs_metadata_version_snapshot_skiplists || ++ fsck_err(c, "snapshot with bad skiplist field:\n %s", ++ (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { ++ u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ goto err; ++ ++ for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) ++ u->v.skip[i] = cpu_to_le32(snapshot_skiplist_get(c, parent_id)); ++ ++ bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_int); ++ s = u->v; ++ } ++ ret = 0; +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + -+int bch2_fs_check_snapshots(struct bch_fs *c) ++int bch2_check_snapshots(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + ++ /* ++ * We iterate backwards as checking/fixing the depth field requires that ++ * the parent's depth already be correct: ++ */ + ret = bch2_trans_run(c, -+ for_each_btree_key_commit(&trans, iter, -+ BTREE_ID_snapshots, POS_MIN, ++ for_each_btree_key_reverse_commit(&trans, iter, ++ BTREE_ID_snapshots, POS_MAX, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_snapshot(&trans, &iter, k))); @@ -80622,9 +81633,13 @@ index 000000000..89c7c83c5 + + if (!BCH_SUBVOLUME_SNAP(subvol.v)) { + u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); -+ u32 snapshot_tree = snapshot_t(c, snapshot_root)->tree; ++ u32 snapshot_tree; + struct bch_snapshot_tree st; + ++ rcu_read_lock(); ++ snapshot_tree = snapshot_t(c, snapshot_root)->tree; ++ rcu_read_unlock(); ++ + ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, @@ -80650,7 +81665,7 @@ index 000000000..89c7c83c5 + return ret; +} + -+int bch2_fs_check_subvols(struct bch_fs *c) ++int bch2_check_subvols(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; @@ -80668,10 +81683,10 @@ index 000000000..89c7c83c5 + +void bch2_fs_snapshots_exit(struct bch_fs *c) +{ -+ genradix_free(&c->snapshots); ++ kfree(c->snapshots); +} + -+int bch2_fs_snapshots_start(struct bch_fs *c) ++int bch2_snapshots_read(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; @@ -80785,7 +81800,7 @@ index 000000000..89c7c83c5 + goto err; + + if (s.v->children[0]) { -+ s_t->v.root_snapshot = cpu_to_le32(s.v->children[0]); ++ s_t->v.root_snapshot = s.v->children[0]; + } else { + s_t->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s_t->k, 0); @@ -80805,10 +81820,12 @@ index 000000000..89c7c83c5 + u32 *snapshot_subvols, + unsigned nr_snapids) +{ ++ struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_snapshot *n; + struct bkey_s_c k; -+ unsigned i; ++ unsigned i, j; ++ u32 depth = bch2_snapshot_depth(c, parent); + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, @@ -80838,6 +81855,12 @@ index 000000000..89c7c83c5 + n->v.parent = cpu_to_le32(parent); + n->v.subvol = cpu_to_le32(snapshot_subvols[i]); + n->v.tree = cpu_to_le32(tree); ++ n->v.depth = cpu_to_le32(depth); ++ ++ for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) ++ n->v.skip[j] = cpu_to_le32(snapshot_skiplist_get(c, parent)); ++ ++ bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_int); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + + ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, @@ -80941,7 +81964,7 @@ index 000000000..89c7c83c5 + struct bpos *last_pos) +{ + struct bch_fs *c = trans->c; -+ u32 equiv = snapshot_t(c, k.k->p.snapshot)->equiv; ++ u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + if (!bkey_eq(k.k->p, *last_pos)) + equiv_seen->nr = 
0; @@ -81109,7 +82132,7 @@ index 000000000..89c7c83c5 + + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + -+ if (!test_bit(BCH_FS_FSCK_DONE, &c->flags)) ++ if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) + return 0; + + bch2_delete_dead_snapshots_async(c); @@ -81288,7 +82311,7 @@ index 000000000..89c7c83c5 + __bch2_subvolume_delete(trans, subvolid)); +} + -+void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) ++static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + snapshot_wait_for_pagecache_and_delete_work); @@ -81326,7 +82349,7 @@ index 000000000..89c7c83c5 + u32 subvol; +}; + -+int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, ++static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *_h) +{ + struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); @@ -81465,10 +82488,10 @@ index 000000000..89c7c83c5 +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 -index 000000000..1a39f713d +index 000000000..12a08a34e --- /dev/null +++ b/fs/bcachefs/subvolume.h -@@ -0,0 +1,167 @@ +@@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H @@ -81476,9 +82499,11 @@ index 000000000..1a39f713d +#include "darray.h" +#include "subvolume_types.h" + ++enum bkey_invalid_flags; ++ +void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, -+ unsigned, struct printbuf *); ++ enum bkey_invalid_flags, struct printbuf *); + +#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_tree_invalid, \ @@ -81490,7 +82515,7 @@ index 000000000..1a39f713d + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, -+ unsigned, struct printbuf *); ++ enum bkey_invalid_flags, struct printbuf *); +int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + @@ -81501,51 +82526,133 @@ index 000000000..1a39f713d + .min_val_size = 24, \ +}) + -+static inline struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) ++static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) +{ -+ return genradix_ptr(&c->snapshots, U32_MAX - id); ++ return &t->s[U32_MAX - id]; ++} ++ ++static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) ++{ ++ return __snapshot_t(rcu_dereference(c->snapshots), id); ++} ++ ++static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) ++{ ++ rcu_read_lock(); ++ id = snapshot_t(c, id)->tree; ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) ++{ ++ return snapshot_t(c, id)->parent; ++} ++ ++static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) ++{ ++ rcu_read_lock(); ++ id = __bch2_snapshot_parent_early(c, id); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ u32 parent = snapshot_t(c, id)->parent; ++ ++ if (parent && ++ snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) ++ panic("id %u depth=%u parent %u depth=%u\n", ++ id, snapshot_t(c, 
id)->depth, ++ parent, snapshot_t(c, parent)->depth); ++ ++ return parent; ++#else ++ return snapshot_t(c, id)->parent; ++#endif +} + +static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ -+ return snapshot_t(c, id)->parent; ++ rcu_read_lock(); ++ id = __bch2_snapshot_parent(c, id); ++ rcu_read_unlock(); ++ ++ return id; ++} ++ ++static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) ++{ ++ rcu_read_lock(); ++ while (n--) ++ id = __bch2_snapshot_parent(c, id); ++ rcu_read_unlock(); ++ ++ return id; +} + +static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) +{ + u32 parent; + -+ while ((parent = bch2_snapshot_parent(c, id))) ++ rcu_read_lock(); ++ while ((parent = __bch2_snapshot_parent(c, id))) + id = parent; ++ rcu_read_unlock(); ++ + return id; +} + -+static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) ++static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->equiv; +} + -+static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) ++static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ -+ return id == snapshot_t(c, id)->equiv; ++ rcu_read_lock(); ++ id = __bch2_snapshot_equiv(c, id); ++ rcu_read_unlock(); ++ ++ return id; +} + -+static inline u32 bch2_snapshot_internal_node(struct bch_fs *c, u32 id) ++static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) +{ -+ struct snapshot_t *s = snapshot_t(c, id); ++ return id == bch2_snapshot_equiv(c, id); ++} + -+ return s->children[0] || s->children[1]; ++static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) ++{ ++ const struct snapshot_t *s; ++ bool ret; ++ ++ rcu_read_lock(); ++ s = snapshot_t(c, id); ++ ret = s->children[0]; ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) ++{ ++ return !bch2_snapshot_is_internal_node(c, id); +} + +static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) +{ -+ struct snapshot_t *s; -+ u32 parent = bch2_snapshot_parent(c, id); ++ const struct snapshot_t *s; ++ u32 parent = __bch2_snapshot_parent(c, id); + + if (!parent) + return 0; + -+ s = snapshot_t(c, bch2_snapshot_parent(c, id)); ++ s = snapshot_t(c, __bch2_snapshot_parent(c, id)); + if (id == s->children[0]) + return s->children[1]; + if (id == s->children[1]) @@ -81553,19 +82660,19 @@ index 000000000..1a39f713d + return 0; +} + -+static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) -+{ -+ while (id && id < ancestor) -+ id = bch2_snapshot_parent(c, id); -+ -+ return id == ancestor; -+} ++bool bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); + +static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) +{ -+ struct snapshot_t *t = snapshot_t(c, id); ++ const struct snapshot_t *t; ++ bool ret; + -+ return (t->children[0]|t->children[1]) != 0; ++ rcu_read_lock(); ++ t = snapshot_t(c, id); ++ ret = (t->children[0]|t->children[1]) != 0; ++ rcu_read_unlock(); ++ ++ return ret; +} + +static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) @@ -81599,12 +82706,12 @@ index 000000000..1a39f713d + return ret; +} + -+int bch2_fs_check_snapshot_trees(struct bch_fs *); -+int bch2_fs_check_snapshots(struct bch_fs *); -+int bch2_fs_check_subvols(struct bch_fs *); ++int bch2_check_snapshot_trees(struct bch_fs *); ++int bch2_check_snapshots(struct bch_fs *); ++int bch2_check_subvols(struct bch_fs *); + +void bch2_fs_snapshots_exit(struct bch_fs *); -+int bch2_fs_snapshots_start(struct 
bch_fs *); ++int bch2_snapshots_read(struct bch_fs *); + +int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, + unsigned, struct printbuf *); @@ -81638,10 +82745,10 @@ index 000000000..1a39f713d +#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h new file mode 100644 -index 000000000..c6c1cbad9 +index 000000000..86833445a --- /dev/null +++ b/fs/bcachefs/subvolume_types.h -@@ -0,0 +1,22 @@ +@@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_TYPES_H +#define _BCACHEFS_SUBVOLUME_TYPES_H @@ -81650,12 +82757,21 @@ index 000000000..c6c1cbad9 + +typedef DARRAY(u32) snapshot_id_list; + ++#define IS_ANCESTOR_BITMAP 128 ++ +struct snapshot_t { + u32 parent; ++ u32 skip[3]; ++ u32 depth; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; + u32 equiv; ++ unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; ++}; ++ ++struct snapshot_table { ++ struct snapshot_t s[0]; +}; + +typedef struct { @@ -81666,16 +82782,17 @@ index 000000000..c6c1cbad9 +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 -index 000000000..481f9de6e +index 000000000..e9ce3f332 --- /dev/null +++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1616 @@ +@@ -0,0 +1,1711 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "checksum.h" ++#include "counters.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" @@ -81684,30 +82801,76 @@ index 000000000..481f9de6e +#include "journal_io.h" +#include "journal_sb.h" +#include "journal_seq_blacklist.h" ++#include "recovery.h" +#include "replicas.h" +#include "quota.h" +#include "super-io.h" +#include "super.h" +#include "trace.h" +#include "vstructs.h" -+#include "counters.h" + +#include +#include + -+static const char * const bch2_metadata_versions[] = { -+#define x(t, n) [n] = #t, ++struct bch2_metadata_version { ++ u16 version; ++ const char *name; ++ u64 recovery_passes; ++}; ++ ++static const struct bch2_metadata_version bch2_metadata_versions[] = { ++#define x(n, v, _recovery_passes) { \ ++ .version = v, \ ++ .name = #n, \ ++ .recovery_passes = _recovery_passes, \ ++}, + BCH_METADATA_VERSIONS() +#undef x +}; + +void bch2_version_to_text(struct printbuf *out, unsigned v) +{ -+ const char *str = v < ARRAY_SIZE(bch2_metadata_versions) -+ ? 
bch2_metadata_versions[v] -+ : "(unknown version)"; ++ const char *str = "(unknown version)"; + -+ prt_printf(out, "%u: %s", v, str); ++ for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) ++ if (bch2_metadata_versions[i].version == v) { ++ str = bch2_metadata_versions[i].name; ++ break; ++ } ++ ++ prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); ++} ++ ++unsigned bch2_latest_compatible_version(unsigned v) ++{ ++ if (!BCH_VERSION_MAJOR(v)) ++ return v; ++ ++ for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) ++ if (bch2_metadata_versions[i].version > v && ++ BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == ++ BCH_VERSION_MAJOR(v)) ++ v = bch2_metadata_versions[i].version; ++ ++ return v; ++} ++ ++u64 bch2_upgrade_recovery_passes(struct bch_fs *c, ++ unsigned old_version, ++ unsigned new_version) ++{ ++ u64 ret = 0; ++ ++ for (const struct bch2_metadata_version *i = bch2_metadata_versions; ++ i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); ++ i++) ++ if (i->version > old_version && i->version <= new_version) { ++ if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) ++ ret |= bch2_fsck_recovery_passes(); ++ ret |= i->recovery_passes; ++ } ++ ++ return ret &= ~RECOVERY_PASS_ALL_FSCK; +} + +const char * const bch2_sb_fields[] = { @@ -82117,6 +83280,7 @@ index 000000000..481f9de6e + c->sb.user_uuid = src->user_uuid; + c->sb.version = le16_to_cpu(src->version); + c->sb.version_min = le16_to_cpu(src->version_min); ++ c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src) ?: c->sb.version; + c->sb.nr_devices = src->nr_devices; + c->sb.clean = BCH_SB_CLEAN(src); + c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); @@ -82282,13 +83446,18 @@ index 000000000..481f9de6e + struct printbuf err = PRINTBUF; + __le64 *i; + int ret; -+ -+ pr_verbose_init(*opts, ""); -+ ++#ifndef __KERNEL__ ++retry: ++#endif + memset(sb, 0, sizeof(*sb)); + sb->mode = FMODE_READ; + sb->have_bio = true; + ++#ifndef __KERNEL__ ++ if (opt_get(*opts, direct_io) == false) ++ sb->mode |= FMODE_BUFFERED; ++#endif ++ + if (!opt_get(*opts, noexcl)) + sb->mode |= FMODE_EXCL; + @@ -82373,7 +83542,13 @@ index 000000000..481f9de6e + +got_super: + if (le16_to_cpu(sb->sb->block_size) << 9 < -+ bdev_logical_block_size(sb->bdev)) { ++ bdev_logical_block_size(sb->bdev) && ++ opt_get(*opts, direct_io)) { ++#ifndef __KERNEL__ ++ opt_set(*opts, direct_io, false); ++ bch2_free_super(sb); ++ goto retry; ++#endif + prt_printf(&err, "block size (%u) smaller than device block size (%u)", + le16_to_cpu(sb->sb->block_size) << 9, + bdev_logical_block_size(sb->bdev)); @@ -82391,7 +83566,6 @@ index 000000000..481f9de6e + goto err_no_print; + } +out: -+ pr_verbose_init(*opts, "ret %i", ret); + printbuf_exit(&err); + return ret; +err: @@ -82483,10 +83657,9 @@ index 000000000..481f9de6e + closure_init_stack(cl); + memset(&sb_written, 0, sizeof(sb_written)); + -+ if (c->opts.version_upgrade) { -+ c->disk_sb.sb->magic = BCHFS_MAGIC; -+ c->disk_sb.sb->layout.magic = BCHFS_MAGIC; -+ } ++ /* Make sure we're using the new magic numbers: */ ++ c->disk_sb.sb->magic = BCHFS_MAGIC; ++ c->disk_sb.sb->layout.magic = BCHFS_MAGIC; + + le64_add_cpu(&c->disk_sb.sb->seq, 1); + @@ -82849,6 +84022,32 @@ index 000000000..481f9de6e + return 0; +} + ++/* Downgrade if superblock is at a higher version than currently supported: */ ++void bch2_sb_maybe_downgrade(struct bch_fs *c) ++{ ++ lockdep_assert_held(&c->sb_lock); ++ ++ /* ++ * Downgrade, if superblock is at a higher version than 
currently ++ * supported: ++ */ ++ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) ++ SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); ++ if (c->sb.version > bcachefs_metadata_version_current) ++ c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); ++ if (c->sb.version_min > bcachefs_metadata_version_current) ++ c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); ++ c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); ++} ++ ++void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) ++{ ++ lockdep_assert_held(&c->sb_lock); ++ ++ c->disk_sb.sb->version = cpu_to_le16(new_version); ++ c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); ++} ++ +int bch2_fs_mark_dirty(struct bch_fs *c) +{ + int ret; @@ -82860,8 +84059,10 @@ index 000000000..481f9de6e + + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ ++ bch2_sb_maybe_downgrade(c); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); -+ c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); ++ + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + @@ -83080,21 +84281,27 @@ index 000000000..481f9de6e +#undef x +}; + ++static const struct bch_sb_field_ops bch2_sb_field_null_ops; ++ ++static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) ++{ ++ return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) ++ ? bch2_sb_field_ops[type] ++ : &bch2_sb_field_null_ops; ++} ++ +static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + unsigned type = le32_to_cpu(f->type); + struct printbuf field_err = PRINTBUF; ++ const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); + int ret; + -+ if (type >= BCH_SB_FIELD_NR) -+ return 0; -+ -+ ret = bch2_sb_field_ops[type]->validate(sb, f, &field_err); ++ ret = ops->validate ? ops->validate(sb, f, &field_err) : 0; + if (ret) { + prt_printf(err, "Invalid superblock section %s: %s", -+ bch2_sb_fields[type], -+ field_err.buf); ++ bch2_sb_fields[type], field_err.buf); + prt_newline(err); + bch2_sb_field_to_text(err, sb, f); + } @@ -83107,13 +84314,12 @@ index 000000000..481f9de6e + struct bch_sb_field *f) +{ + unsigned type = le32_to_cpu(f->type); -+ const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR -+ ? 
bch2_sb_field_ops[type] : NULL; ++ const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + -+ if (ops) ++ if (type < BCH_SB_FIELD_NR) + prt_printf(out, "%s", bch2_sb_fields[type]); + else + prt_printf(out, "(unknown field %u)", type); @@ -83121,9 +84327,9 @@ index 000000000..481f9de6e + prt_printf(out, " (size %zu):", vstruct_bytes(f)); + prt_newline(out); + -+ if (ops && ops->to_text) { ++ if (ops->to_text) { + printbuf_indent_add(out, 2); -+ bch2_sb_field_ops[type]->to_text(out, sb, f); ++ ops->to_text(out, sb, f); + printbuf_indent_sub(out, 2); + } +} @@ -83197,6 +84403,11 @@ index 000000000..481f9de6e + bch2_version_to_text(out, le16_to_cpu(sb->version)); + prt_newline(out); + ++ prt_str(out, "Version upgrade complete:"); ++ prt_tab(out); ++ bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); ++ prt_newline(out); ++ + prt_printf(out, "Oldest version on disk:"); + prt_tab(out); + bch2_version_to_text(out, le16_to_cpu(sb->version_min)); @@ -83288,10 +84499,10 @@ index 000000000..481f9de6e +} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h new file mode 100644 -index 000000000..cda71ec84 +index 000000000..904adea6a --- /dev/null +++ b/fs/bcachefs/super-io.h -@@ -0,0 +1,134 @@ +@@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_IO_H +#define _BCACHEFS_SUPER_IO_H @@ -83305,11 +84516,16 @@ index 000000000..cda71ec84 + +static inline bool bch2_version_compatible(u16 version) +{ -+ return version <= bcachefs_metadata_version_current && ++ return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && + version >= bcachefs_metadata_version_min; +} + +void bch2_version_to_text(struct printbuf *, unsigned); ++unsigned bch2_latest_compatible_version(unsigned); ++ ++u64 bch2_upgrade_recovery_passes(struct bch_fs *c, ++ unsigned, ++ unsigned); + +struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); +struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, @@ -83417,6 +84633,9 @@ index 000000000..cda71ec84 + +int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); + ++void bch2_sb_maybe_downgrade(struct bch_fs *); ++void bch2_sb_upgrade(struct bch_fs *, unsigned); ++ +int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_clean(struct bch_fs *); + @@ -83428,10 +84647,10 @@ index 000000000..cda71ec84 +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 -index 000000000..bcd28a2d3 +index 000000000..9f1047a76 --- /dev/null +++ b/fs/bcachefs/super.c -@@ -0,0 +1,1995 @@ +@@ -0,0 +1,2006 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and @@ -83778,6 +84997,19 @@ index 000000000..bcd28a2d3 +{ + int ret; + ++ /* ++ * Data move operations can't run until after check_snapshots has ++ * completed, and bch2_snapshot_is_ancestor() is available. 
++ * ++ * Ideally we'd start copygc/rebalance earlier instead of waiting for ++ * all of recovery/fsck to complete: ++ */ ++ ret = bch2_copygc_start(c); ++ if (ret) { ++ bch_err(c, "error starting copygc thread"); ++ return ret; ++ } ++ + ret = bch2_rebalance_start(c); + if (ret) { + bch_err(c, "error starting rebalance thread"); @@ -83795,20 +85027,21 @@ index 000000000..bcd28a2d3 + + if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { + bch_err(c, "cannot go rw, unfixed btree errors"); -+ return -EROFS; ++ return -BCH_ERR_erofs_unfixed_errors; + } + + if (test_bit(BCH_FS_RW, &c->flags)) + return 0; + ++ if (c->opts.norecovery) ++ return -BCH_ERR_erofs_norecovery; ++ + /* + * nochanges is used for fsck -n mode - we have to allow going rw + * during recovery for that to work: + */ -+ if (c->opts.norecovery || -+ (c->opts.nochanges && -+ (!early || c->opts.read_only))) -+ return -EROFS; ++ if (c->opts.nochanges && (!early || c->opts.read_only)) ++ return -BCH_ERR_erofs_nochanges; + + bch_info(c, "going read-write"); + @@ -83836,12 +85069,6 @@ index 000000000..bcd28a2d3 + return ret; + } + -+ ret = bch2_copygc_start(c); -+ if (ret) { -+ bch_err(c, "error starting copygc thread"); -+ return ret; -+ } -+ + if (!early) { + ret = bch2_fs_read_write_late(c); + if (ret) @@ -83919,6 +85146,7 @@ index 000000000..bcd28a2d3 + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); + ++ darray_exit(&c->btree_roots_extra); + free_percpu(c->btree_paths_bufs); + free_percpu(c->pcpu); + mempool_exit(&c->large_bkey_pool); @@ -83994,9 +85222,12 @@ index 000000000..bcd28a2d3 + + cancel_work_sync(&c->read_only_work); + -+ for (i = 0; i < c->sb.nr_devices; i++) -+ if (c->devs[i]) -+ bch2_free_super(&c->devs[i]->disk_sb); ++ for (i = 0; i < c->sb.nr_devices; i++) { ++ struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); ++ ++ if (ca) ++ bch2_free_super(&ca->disk_sb); ++ } +} + +void bch2_fs_free(struct bch_fs *c) @@ -84083,8 +85314,6 @@ index 000000000..bcd28a2d3 + unsigned i, iter_size; + int ret = 0; + -+ pr_verbose_init(opts, ""); -+ + c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); + if (!c) { + c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); @@ -84186,11 +85415,11 @@ index 000000000..bcd28a2d3 + goto err; + + /* Compat: */ -+ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && + !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) + SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); + -+ if (sb->version <= bcachefs_metadata_version_inode_v2 && ++ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && + !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) + SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); + @@ -84295,7 +85524,6 @@ index 000000000..bcd28a2d3 + if (ret) + goto err; +out: -+ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); + return c; +err: + bch2_fs_free(c); @@ -84611,8 +85839,6 @@ index 000000000..bcd28a2d3 + struct bch_dev *ca = NULL; + int ret = 0; + -+ pr_verbose_init(c->opts, ""); -+ + if (bch2_fs_init_fault("dev_alloc")) + goto err; + @@ -84623,14 +85849,11 @@ index 000000000..bcd28a2d3 + ca->fs = c; + + bch2_dev_attach(c, ca, dev_idx); -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); + return ret; +err: + if (ca) + bch2_dev_free(ca); -+ ret = -BCH_ERR_ENOMEM_dev_alloc; -+ goto out; ++ return -BCH_ERR_ENOMEM_dev_alloc; +} + +static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) @@ -84957,6 +86180,17 @@ index 000000000..bcd28a2d3 + bch2_dev_free(ca); + + /* ++ * At this 
point the device object has been removed in-core, but the ++ * on-disk journal might still refer to the device index via sb device ++ * usage entries. Recovery fails if it sees usage information for an ++ * invalid device. Flush journal pins to push the back of the journal ++ * past now invalid device index references before we update the ++ * superblock, but after the device object has been removed so any ++ * further journal writes elide usage info for the device. ++ */ ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ /* + * Free this device's slot in the bch_member array - all pointers to + * this device must be gone: + */ @@ -85297,8 +86531,6 @@ index 000000000..bcd28a2d3 + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-ENODEV); + -+ pr_verbose_init(opts, ""); -+ + if (!nr_devices) { + ret = -EINVAL; + goto err; @@ -85370,8 +86602,6 @@ index 000000000..bcd28a2d3 + kfree(sb); + printbuf_exit(&errbuf); + module_put(THIS_MODULE); -+ pr_verbose_init(opts, "ret %s (%i)", bch2_err_str(PTR_ERR_OR_ZERO(c)), -+ PTR_ERR_OR_ZERO(c)); + return c; +err_print: + pr_err("bch_fs_open err opening %s: %s", @@ -85422,7 +86652,7 @@ index 000000000..bcd28a2d3 +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + -+unsigned bch2_metadata_version = bcachefs_metadata_version_current; ++static unsigned bch2_metadata_version = bcachefs_metadata_version_current; +module_param_named(version, bch2_metadata_version, uint, 0400); + +module_exit(bcachefs_exit); @@ -85758,7 +86988,7 @@ index 000000000..89419fc79 +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 -index 000000000..3145a0ad6 +index 000000000..740305e67 --- /dev/null +++ b/fs/bcachefs/sysfs.c @@ -0,0 +1,1064 @@ @@ -85966,7 +87196,7 @@ index 000000000..3145a0ad6 +#ifdef BCH_WRITE_REF_DEBUG +read_attribute(write_refs); + -+const char * const bch2_write_refs[] = { ++static const char * const bch2_write_refs[] = { +#define x(n) #n, + BCH_WRITE_REFS() +#undef x @@ -86882,7 +88112,7 @@ index 000000000..222cd5062 +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 -index 000000000..50d69a563 +index 000000000..cef23d2cc --- /dev/null +++ b/fs/bcachefs/tests.c @@ -0,0 +1,939 @@ @@ -87332,7 +88562,7 @@ index 000000000..50d69a563 + +/* extent unit tests */ + -+u64 test_version; ++static u64 test_version; + +static int insert_test_extent(struct bch_fs *c, + u64 start, u64 end) @@ -89202,7 +90432,7 @@ index 000000000..905801772 +#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 -index 000000000..38886bf62 +index 000000000..ae4f6de3c --- /dev/null +++ b/fs/bcachefs/util.c @@ -0,0 +1,1137 @@ @@ -89964,10 +91194,10 @@ index 000000000..38886bf62 + } +} + -+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) ++int bch2_bio_alloc_pages_noprof(struct bio *bio, size_t size, gfp_t gfp_mask) +{ + while (size) { -+ struct page *page = alloc_pages(gfp_mask, 0); ++ struct page *page = alloc_pages_noprof(gfp_mask, 0); + unsigned len = min_t(size_t, PAGE_SIZE, size); + + if (!page) @@ -90345,10 +91575,10 @@ index 000000000..38886bf62 +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 -index 000000000..70bde2e44 +index 000000000..5fa29dab3 --- /dev/null +++ b/fs/bcachefs/util.h -@@ -0,0 +1,842 @@ +@@ -0,0 +1,846 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H @@ -90411,12 +91641,13 @@ index 000000000..70bde2e44 + 
free_pages((unsigned long) p, get_order(size)); +} + -+static inline void *vpmalloc(size_t size, gfp_t gfp_mask) ++static inline void *vpmalloc_noprof(size_t size, gfp_t gfp_mask) +{ -+ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, -+ get_order(size)) ?: -+ __vmalloc(size, gfp_mask); ++ return (void *) get_free_pages_noprof(gfp_mask|__GFP_NOWARN, ++ get_order(size)) ?: ++ __vmalloc_noprof(size, gfp_mask); +} ++#define vpmalloc(_size, _gfp) alloc_hooks(vpmalloc_noprof(_size, _gfp)) + +static inline void kvpfree(void *p, size_t size) +{ @@ -90426,12 +91657,13 @@ index 000000000..70bde2e44 + vpfree(p, size); +} + -+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) ++static inline void *kvpmalloc_noprof(size_t size, gfp_t gfp_mask) +{ + return size < PAGE_SIZE -+ ? kmalloc(size, gfp_mask) -+ : vpmalloc(size, gfp_mask); ++ ? kmalloc_noprof(size, gfp_mask) ++ : vpmalloc_noprof(size, gfp_mask); +} ++#define kvpmalloc(_size, _gfp) alloc_hooks(kvpmalloc_noprof(_size, _gfp)) + +int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); + @@ -90881,7 +92113,9 @@ index 000000000..70bde2e44 +} + +void bch2_bio_map(struct bio *bio, void *base, size_t); -+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); ++int bch2_bio_alloc_pages_noprof(struct bio *, size_t, gfp_t); ++#define bch2_bio_alloc_pages(_bio, _size, _gfp) \ ++ alloc_hooks(bch2_bio_alloc_pages_noprof(_bio, _size, _gfp)) + +static inline sector_t bdev_sectors(struct block_device *bdev) +{ @@ -91193,10 +92427,10 @@ index 000000000..70bde2e44 +#endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c new file mode 100644 -index 000000000..5143b603b +index 000000000..ef030fc02 --- /dev/null +++ b/fs/bcachefs/varint.c -@@ -0,0 +1,121 @@ +@@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include @@ -91221,12 +92455,13 @@ index 000000000..5143b603b +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); ++ __le64 v_le; + + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); -+ v = cpu_to_le64(v); -+ memcpy(out, &v, bytes); ++ v_le = cpu_to_le64(v); ++ memcpy(out, &v_le, bytes); + } else { + *out++ = 255; + bytes = 9; @@ -91256,9 +92491,9 @@ index 000000000..5143b603b + return -1; + + if (likely(bytes < 9)) { -+ v = 0; -+ memcpy(&v, in, bytes); -+ v = le64_to_cpu(v); ++ __le64 v_le = 0; ++ memcpy(&v_le, in, bytes); ++ v = le64_to_cpu(v_le); + v >>= bytes; + } else { + v = get_unaligned_le64(++in); @@ -91406,7 +92641,7 @@ index 000000000..53a694d71 +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 -index 000000000..05c65d94c +index 000000000..70f78006d --- /dev/null +++ b/fs/bcachefs/xattr.c @@ -0,0 +1,648 @@ @@ -91482,7 +92717,8 @@ index 000000000..05c65d94c +}; + +int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, -+ unsigned flags, struct printbuf *err) ++ enum bkey_invalid_flags flags, ++ struct printbuf *err) +{ + const struct xattr_handler *handler; + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); @@ -91547,15 +92783,14 @@ index 000000000..05c65d94c + const char *name, void *buffer, size_t size, int type) +{ + struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); ++ struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); + struct btree_iter iter; + struct bkey_s_c_xattr xattr; + struct bkey_s_c k; + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, -+ inode_inum(inode), -+ &X_SEARCH(type, name, strlen(name)), -+ 0); 
++ inode_inum(inode), &search, 0); + if (ret) + goto err1; + @@ -91578,31 +92813,23 @@ index 000000000..05c65d94c + return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret; +} + -+int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, -+ const char *name, void *buffer, size_t size, int type) -+{ -+ return bch2_trans_do(c, NULL, NULL, 0, -+ bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); -+} -+ +int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, ++ struct bch_inode_unpacked *inode_u, + const struct bch_hash_info *hash_info, + const char *name, const void *value, size_t size, + int type, int flags) +{ ++ struct bch_fs *c = trans->c; + struct btree_iter inode_iter = { NULL }; -+ struct bch_inode_unpacked inode_u; + int ret; + -+ /* -+ * We need to do an inode update so that bi_journal_sync gets updated -+ * and fsync works: -+ * -+ * Perhaps we should be updating bi_mtime too? -+ */ ++ ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); ++ if (ret) ++ return ret; + -+ ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?: -+ bch2_inode_write(trans, &inode_iter, &inode_u); ++ inode_u->bi_ctime = bch2_current_time(c); ++ ++ ret = bch2_inode_write(trans, &inode_iter, inode_u); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret) @@ -91777,9 +93004,9 @@ index 000000000..05c65d94c +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ int ret; ++ int ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags)); + -+ ret = bch2_xattr_get(c, inode, name, buffer, size, handler->flags); + return bch2_err_class(ret); +} + @@ -91792,12 +93019,20 @@ index 000000000..05c65d94c + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); ++ struct bch_inode_unpacked inode_u; ++ struct btree_trans trans; + int ret; + -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ bch2_xattr_set(&trans, inode_inum(inode), &hash, -+ name, value, size, ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = commit_do(&trans, NULL, NULL, 0, ++ bch2_xattr_set(&trans, inode_inum(inode), &inode_u, ++ &hash, name, value, size, + handler->flags, flags)); ++ if (!ret) ++ bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); ++ bch2_trans_exit(&trans); ++ + return bch2_err_class(ret); +} + @@ -92060,10 +93295,10 @@ index 000000000..05c65d94c +} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h new file mode 100644 -index 000000000..b3e16729b +index 000000000..f5a52e3a6 --- /dev/null +++ b/fs/bcachefs/xattr.h -@@ -0,0 +1,51 @@ +@@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_XATTR_H +#define _BCACHEFS_XATTR_H @@ -92072,7 +93307,8 @@ index 000000000..b3e16729b + +extern const struct bch_hash_desc bch2_xattr_hash_desc; + -+int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); ++int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, ++ enum bkey_invalid_flags, struct printbuf *); +void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_xattr ((struct bkey_ops) { \ @@ -92103,11 +93339,9 @@ index 000000000..b3e16729b +struct bch_hash_info; +struct bch_inode_info; + -+int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, -+ const char *, void *, size_t, int); -+ ++/* Exported for cmd_migrate.c 
in tools: */ +int bch2_xattr_set(struct btree_trans *, subvol_inum, -+ const struct bch_hash_info *, ++ struct bch_inode_unpacked *, const struct bch_hash_info *, + const char *, const void *, size_t, int, int); + +ssize_t bch2_xattr_list(struct dentry *, char *, size_t); @@ -92877,6 +94111,214 @@ index 4120bd1cb..83a0a043b 100644 default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; +diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h +new file mode 100644 +index 000000000..16fbf74ed +--- /dev/null ++++ b/include/asm-generic/codetag.lds.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef __ASM_GENERIC_CODETAG_LDS_H ++#define __ASM_GENERIC_CODETAG_LDS_H ++ ++#define SECTION_WITH_BOUNDARIES(_name) \ ++ . = ALIGN(8); \ ++ __start_##_name = .; \ ++ KEEP(*(_name)) \ ++ __stop_##_name = .; ++ ++#define CODETAG_SECTIONS() \ ++ SECTION_WITH_BOUNDARIES(alloc_tags) \ ++ SECTION_WITH_BOUNDARIES(dynamic_fault_tags) ++ ++#endif /* __ASM_GENERIC_CODETAG_LDS_H */ +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index da9e5629e..47dd57ca7 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -50,6 +50,8 @@ + * [__nosave_begin, __nosave_end] for the nosave data + */ + ++#include ++ + #ifndef LOAD_OFFSET + #define LOAD_OFFSET 0 + #endif +@@ -374,6 +376,7 @@ + . = ALIGN(8); \ + BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes) \ + BOUNDED_SECTION_BY(__dyndbg, ___dyndbg) \ ++ CODETAG_SECTIONS() \ + LIKELY_PROFILE() \ + BRANCH_PROFILE() \ + TRACE_PRINTKS() \ +diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h +new file mode 100644 +index 000000000..6c1b7e1dc +--- /dev/null ++++ b/include/linux/alloc_tag.h +@@ -0,0 +1,160 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * allocation tagging ++ */ ++#ifndef _LINUX_ALLOC_TAG_H ++#define _LINUX_ALLOC_TAG_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * An instance of this structure is created in a special ELF section at every ++ * allocation callsite. At runtime, the special section is treated as ++ * an array of these. Embedded codetag utilizes codetag framework. 
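
The comment above, together with codetag.lds.h and the CODETAG_SECTIONS() hook added to vmlinux.lds.h earlier in this patch, describes the core trick: every tagged callsite drops one struct into a named ELF section, and linker boundary symbols turn that section into a walkable array. A self-contained userspace analogue of the mechanism (struct tag, DEFINE_TAG, and the "tags" section name are invented for illustration):

    #include <stdio.h>

    struct tag { const char *file; int line; };

    /* One instance per callsite, emitted into the "tags" section. The 8-byte
     * alignment mirrors the __aligned(8) on struct codetag/alloc_tag so that
     * pointer arithmetic over the section stays in step. */
    #define DEFINE_TAG(name) \
            static struct tag name __attribute__((used, section("tags"), aligned(8))) = \
                    { __FILE__, __LINE__ }

    /* For any section whose name is a valid C identifier, an ELF linker
     * synthesizes these boundary symbols -- the kernel makes the same pair
     * explicit via SECTION_WITH_BOUNDARIES() in codetag.lds.h. */
    extern struct tag __start_tags[], __stop_tags[];

    DEFINE_TAG(site_a);
    DEFINE_TAG(site_b);

    int main(void)
    {
            /* The section is just an array: this is what codetag_next_ct()
             * iterates in the kernel. */
            for (struct tag *t = __start_tags; t < __stop_tags; t++)
                    printf("%s:%d\n", t->file, t->line);
            return 0;
    }
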
++ */ ++struct alloc_tag { ++ struct codetag ct; ++ u64 __percpu *bytes_allocated; ++} __aligned(8); ++ ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++ ++void alloc_tags_show_mem_report(struct seq_buf *s); ++ ++static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) ++{ ++ return container_of(ct, struct alloc_tag, ct); ++} ++ ++#define DEFINE_ALLOC_TAG(_alloc_tag, _old) \ ++ static struct alloc_tag _alloc_tag __used __aligned(8) \ ++ __section("alloc_tags") = { .ct = CODE_TAG_INIT }; \ ++ struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag) ++ ++DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, ++ mem_alloc_profiling_key); ++ ++static inline bool mem_alloc_profiling_enabled(void) ++{ ++ return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, ++ &mem_alloc_profiling_key); ++} ++ ++static inline u64 alloc_tag_read(struct alloc_tag *tag) ++{ ++ u64 v = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ v += *per_cpu_ptr(tag->bytes_allocated, cpu); ++ ++ return v; ++} ++ ++#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG ++ ++#define CODETAG_EMPTY (void *)1 ++ ++static inline bool is_codetag_empty(union codetag_ref *ref) ++{ ++ return ref->ct == CODETAG_EMPTY; ++} ++ ++static inline void set_codetag_empty(union codetag_ref *ref) ++{ ++ if (ref) ++ ref->ct = CODETAG_EMPTY; ++} ++ ++#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ ++ ++static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } ++static inline void set_codetag_empty(union codetag_ref *ref) {} ++ ++#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ ++ ++static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes) ++{ ++ struct alloc_tag *tag; ++ ++#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG ++ WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); ++#endif ++ if (!ref || !ref->ct) ++ return; ++ ++ if (is_codetag_empty(ref)) { ++ ref->ct = NULL; ++ return; ++ } ++ ++ tag = ct_to_alloc_tag(ref->ct); ++ ++ this_cpu_add(*tag->bytes_allocated, -bytes); ++ ref->ct = NULL; ++} ++ ++static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) ++{ ++ __alloc_tag_sub(ref, bytes); ++} ++ ++static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) ++{ ++ __alloc_tag_sub(ref, bytes); ++} ++ ++static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) ++{ ++#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG ++ WARN_ONCE(ref && ref->ct, ++ "alloc_tag was not cleared (got tag for %s:%u)\n",\ ++ ref->ct->filename, ref->ct->lineno); ++ ++ WARN_ONCE(!tag, "current->alloc_tag not set"); ++#endif ++ if (!ref || !tag) ++ return; ++ ++ ref->ct = &tag->ct; ++ this_cpu_add(*tag->bytes_allocated, bytes); ++} ++ ++#else ++ ++#define DEFINE_ALLOC_TAG(_alloc_tag, _old) ++static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} ++static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {} ++static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, ++ size_t bytes) {} ++static inline void set_codetag_empty(union codetag_ref *ref) {} ++ ++#endif ++ ++typedef struct mempool_s mempool_t; ++ ++#define res_type_to_err(_res) _Generic((_res), \ ++ struct folio *: NULL, \ ++ struct page *: NULL, \ ++ mempool_t *: NULL, \ ++ void *: NULL, \ ++ unsigned long: 0, \ ++ int: -ENOMEM) ++ ++#define alloc_hooks(_do_alloc) \ ++({ \ ++ typeof(_do_alloc) _res; \ ++ DEFINE_ALLOC_TAG(_alloc_tag, _old); \ ++ \ ++ _res = !memory_fault() ? 
_do_alloc : res_type_to_err(_res); \ ++ alloc_tag_restore(&_alloc_tag, _old); \ ++ _res; \ ++}) ++ ++#endif /* _LINUX_ALLOC_TAG_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h index b3e7529ff..f2620f8d1 100644 --- a/include/linux/bio.h @@ -93035,20 +94477,122 @@ index c88cdc4ae..722a586bb 100644 +} while (0) + #endif /* _LINUX_CLOSURE_H */ -diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h -index e659cb6fd..e56793bc0 100644 ---- a/include/linux/compiler_attributes.h -+++ b/include/linux/compiler_attributes.h -@@ -366,4 +366,9 @@ - */ - #define __fix_address noinline __noclone - +diff --git a/include/linux/codetag.h b/include/linux/codetag.h +new file mode 100644 +index 000000000..87207f199 +--- /dev/null ++++ b/include/linux/codetag.h +@@ -0,0 +1,110 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ +/* -+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute ++ * code tagging framework + */ -+#define __flatten __attribute__((flatten)) ++#ifndef _LINUX_CODETAG_H ++#define _LINUX_CODETAG_H + - #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ ++#include ++ ++struct codetag_iterator; ++struct codetag_type; ++struct seq_buf; ++struct module; ++ ++/* ++ * An instance of this structure is created in a special ELF section at every ++ * code location being tagged. At runtime, the special section is treated as ++ * an array of these. ++ */ ++struct codetag { ++ unsigned int flags; /* used in later patches */ ++ unsigned int lineno; ++ const char *modname; ++ const char *function; ++ const char *filename; ++} __aligned(8); ++ ++union codetag_ref { ++ struct codetag *ct; ++}; ++ ++struct codetag_range { ++ struct codetag *start; ++ struct codetag *stop; ++}; ++ ++struct codetag_module { ++ struct module *mod; ++ struct codetag_range range; ++}; ++ ++struct codetag_type_desc { ++ const char *section; ++ size_t tag_size; ++ void (*module_load)(struct codetag_type *cttype, ++ struct codetag_module *cmod); ++ bool (*module_unload)(struct codetag_type *cttype, ++ struct codetag_module *cmod); ++}; ++ ++struct codetag_iterator { ++ struct codetag_type *cttype; ++ struct codetag_module *cmod; ++ unsigned long mod_id; ++ struct codetag *ct; ++}; ++ ++#define CODE_TAG_INIT { \ ++ .modname = KBUILD_MODNAME, \ ++ .function = __func__, \ ++ .filename = __FILE__, \ ++ .lineno = __LINE__, \ ++ .flags = 0, \ ++} ++ ++void codetag_lock_module_list(struct codetag_type *cttype, bool lock); ++struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype); ++struct codetag *codetag_next_ct(struct codetag_iterator *iter); ++ ++void codetag_to_text(struct seq_buf *out, struct codetag *ct); ++ ++struct codetag_type * ++codetag_register_type(const struct codetag_type_desc *desc); ++ ++#ifdef CONFIG_CODE_TAGGING ++void codetag_load_module(struct module *mod); ++bool codetag_unload_module(struct module *mod); ++#else ++static inline void codetag_load_module(struct module *mod) {} ++static inline bool codetag_unload_module(struct module *mod) { return true; } ++#endif ++ ++/* Codetag query parsing */ ++ ++struct codetag_query { ++ const char *filename; ++ const char *module; ++ const char *function; ++ const char *class; ++ unsigned int first_line, last_line; ++ unsigned int first_index, last_index; ++ unsigned int cur_index; ++ ++ bool match_line:1; ++ bool match_index:1; ++ ++ unsigned int set_enabled:1; ++ unsigned int enabled:2; ++ ++ unsigned int set_frequency:1; ++ unsigned int frequency; ++}; ++ ++char 
*codetag_query_parse(struct codetag_query *q, char *buf); ++bool codetag_matches_query(struct codetag_query *q, ++ const struct codetag *ct, ++ const struct codetag_module *mod, ++ const char *class); ++ ++#endif /* _LINUX_CODETAG_H */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6b351e009..3da2f0545 100644 --- a/include/linux/dcache.h @@ -93061,6 +94605,104 @@ index 6b351e009..3da2f0545 100644 extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); +diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h +index 31f114f48..d741940dc 100644 +--- a/include/linux/dma-map-ops.h ++++ b/include/linux/dma-map-ops.h +@@ -27,7 +27,7 @@ struct dma_map_ops { + unsigned long attrs); + void (*free)(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_handle, unsigned long attrs); +- struct page *(*alloc_pages)(struct device *dev, size_t size, ++ struct page *(*alloc_pages_op)(struct device *dev, size_t size, + dma_addr_t *dma_handle, enum dma_data_direction dir, + gfp_t gfp); + void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, +diff --git a/include/linux/dynamic_fault.h b/include/linux/dynamic_fault.h +new file mode 100644 +index 000000000..526a33209 +--- /dev/null ++++ b/include/linux/dynamic_fault.h +@@ -0,0 +1,79 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _LINUX_DYNAMIC_FAULT_H ++#define _LINUX_DYNAMIC_FAULT_H ++ ++/* ++ * Dynamic/code tagging fault injection: ++ * ++ * Originally based on the dynamic debug trick of putting types in a special elf ++ * section, then rewritten using code tagging: ++ * ++ * To use, simply insert a call to dynamic_fault("fault_class"), which will ++ * return true if an error should be injected. ++ * ++ * Fault injection sites may be listed and enabled via debugfs, under ++ * /sys/kernel/debug/dynamic_faults. 
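
Given the static-key machinery defined just below, a sketch of what a tagged call site looks like; my_buf_alloc() is a made-up caller, not code from the patch. Note that the alloc_hooks() wrapper in alloc_tag.h above consumes this same mechanism through memory_fault():

    /* Made-up call site. While the site is disabled (the default) the check
     * is a patched-out static branch; enabling it via debugfs flips the key
     * and __dynamic_fault_enabled() then applies the oneshot/frequency
     * policy. */
    static void *my_buf_alloc(size_t size)
    {
            if (dynamic_fault("memory"))
                    return NULL;            /* injected failure path */

            return kmalloc(size, GFP_KERNEL);
    }
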
++ */ ++ ++#ifdef CONFIG_CODETAG_FAULT_INJECTION ++ ++#include ++#include ++ ++#define DFAULT_STATES() \ ++ x(disabled) \ ++ x(enabled) \ ++ x(oneshot) ++ ++enum dfault_enabled { ++#define x(n) DFAULT_##n, ++ DFAULT_STATES() ++#undef x ++}; ++ ++union dfault_state { ++ struct { ++ unsigned int enabled:2; ++ unsigned int count:30; ++ }; ++ ++ struct { ++ unsigned int v; ++ }; ++}; ++ ++struct dfault { ++ struct codetag tag; ++ const char *class; ++ unsigned int frequency; ++ union dfault_state state; ++ struct static_key_false enabled; ++}; ++ ++bool __dynamic_fault_enabled(struct dfault *df); ++ ++#define dynamic_fault(_class) \ ++({ \ ++ static struct dfault \ ++ __used \ ++ __section("dynamic_fault_tags") \ ++ __aligned(8) df = { \ ++ .tag = CODE_TAG_INIT, \ ++ .class = _class, \ ++ .enabled = STATIC_KEY_FALSE_INIT, \ ++ }; \ ++ \ ++ static_key_false(&df.enabled.key) && \ ++ __dynamic_fault_enabled(&df); \ ++}) ++ ++#else ++ ++#define dynamic_fault(_class) false ++ ++#endif /* CODETAG_FAULT_INJECTION */ ++ ++#define memory_fault() dynamic_fault("memory") ++ ++#endif /* _LINUX_DYNAMIC_FAULT_H */ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 9edb29101..4bf7c8466 100644 --- a/include/linux/exportfs.h @@ -93078,8 +94720,32 @@ index 9edb29101..4bf7c8466 100644 /* * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) +diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h +index c9de1f59e..6f36fff09 100644 +--- a/include/linux/fortify-string.h ++++ b/include/linux/fortify-string.h +@@ -689,9 +689,9 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) + return __real_memchr_inv(p, c, size); + } + +-extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup) ++extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof) + __realloc_size(2); +-__FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp) ++__FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp) + { + size_t p_size = __struct_size(p); + +@@ -701,6 +701,7 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp + fortify_panic(__func__); + return __real_kmemdup(p, size, gfp); + } ++#define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__)) + + /** + * strcpy - Copy a string into another string buffer diff --git a/include/linux/fs.h b/include/linux/fs.h -index 133f0640f..2681e6295 100644 +index 133f0640f..f04872975 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -664,7 +664,8 @@ struct inode { @@ -93110,7 +94776,20 @@ index 133f0640f..2681e6295 100644 } /* -@@ -2714,7 +2715,7 @@ static inline void insert_inode_hash(struct inode *inode) +@@ -2699,11 +2700,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, + * This must be used for allocating filesystems specific inodes to set + * up the inode reclaim context correctly. 
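
The hunk just below turns alloc_inode_sb() from an inline function into a macro, and that is not a cosmetic cleanup: kmem_cache_alloc_lru() becomes an alloc_hooks() wrapper later in this patch (see the slab.h hunk), and alloc_hooks() plants a per-callsite tag via DEFINE_ALLOC_TAG(). If the call stayed inside a shared inline in fs.h, every filesystem's inode allocations would be charged to that one fs.h line; as a macro, each caller expands its own tag. A sketch with two hypothetical filesystems (foo/bar names are invented):

    /* Under CONFIG_MEM_ALLOC_PROFILING each call below expands its own
     * DEFINE_ALLOC_TAG(), so the two allocations are accounted to their
     * respective file:line rather than to a common helper in fs.h. */
    static struct inode *foo_alloc_inode(struct super_block *sb)
    {
            return alloc_inode_sb(sb, foo_inode_cachep, GFP_KERNEL);
    }

    static struct inode *bar_alloc_inode(struct super_block *sb)
    {
            return alloc_inode_sb(sb, bar_inode_cachep, GFP_KERNEL);
    }
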
+ */ +-static inline void * +-alloc_inode_sb(struct super_block *sb, struct kmem_cache *cache, gfp_t gfp) +-{ +- return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp); +-} ++#define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) + + extern void __insert_inode_hash(struct inode *, unsigned long hashval); + static inline void insert_inode_hash(struct inode *inode) +@@ -2714,7 +2711,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { @@ -93119,7 +94798,7 @@ index 133f0640f..2681e6295 100644 __remove_inode_hash(inode); } -@@ -2897,6 +2898,7 @@ extern struct file_system_type *get_filesystem(struct file_system_type *fs); +@@ -2897,6 +2894,7 @@ extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern struct super_block *get_super(struct block_device *); @@ -93238,6 +94917,371 @@ index 107613f7d..c74b73769 100644 int __genradix_prealloc(struct __genradix *, size_t, gfp_t); /** +diff --git a/include/linux/gfp.h b/include/linux/gfp.h +index ed8cb537c..495745c99 100644 +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -6,6 +6,8 @@ + + #include + #include ++#include ++#include + + struct vm_area_struct; + +@@ -174,42 +176,43 @@ static inline void arch_free_page(struct page *page, int order) { } + static inline void arch_alloc_page(struct page *page, int order) { } + #endif + +-struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, ++struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask); +-struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, ++#define __alloc_pages(...) alloc_hooks(__alloc_pages_noprof(__VA_ARGS__)) ++ ++struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask); ++#define __folio_alloc(...) alloc_hooks(__folio_alloc_noprof(__VA_ARGS__)) + +-unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, ++unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *page_list, + struct page **page_array); ++#define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) + +-unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, ++unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, + unsigned long nr_pages, + struct page **page_array); ++#define alloc_pages_bulk_array_mempolicy(...) 
alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) + + /* Bulk allocate order-0 pages */ +-static inline unsigned long +-alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) +-{ +- return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL); +-} ++#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \ ++ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL) + +-static inline unsigned long +-alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array) +-{ +- return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array); +-} ++#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ ++ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array) + + static inline unsigned long +-alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) ++alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) + { + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + +- return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array); ++ return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array); + } + ++#define alloc_pages_bulk_array_node(...) alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__)) ++ + static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) + { + gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); +@@ -229,21 +232,23 @@ static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) + * online. For more general interface, see alloc_pages_node(). + */ + static inline struct page * +-__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) ++__alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) + { + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + warn_if_node_offline(nid, gfp_mask); + +- return __alloc_pages(gfp_mask, order, nid, NULL); ++ return __alloc_pages_noprof(gfp_mask, order, nid, NULL); + } + ++#define __alloc_pages_node(...) alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__)) ++ + static inline + struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) + { + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + warn_if_node_offline(nid, gfp); + +- return __folio_alloc(gfp, order, nid, NULL); ++ return __folio_alloc_noprof(gfp, order, nid, NULL); + } + + /* +@@ -251,53 +256,69 @@ struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) + * prefer the current CPU's closest node. Otherwise node must be valid and + * online. + */ +-static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, +- unsigned int order) ++static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, ++ unsigned int order) + { + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + +- return __alloc_pages_node(nid, gfp_mask, order); ++ return __alloc_pages_node_noprof(nid, gfp_mask, order); + } + ++#define alloc_pages_node(...) 
alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__)) ++ + #ifdef CONFIG_NUMA +-struct page *alloc_pages(gfp_t gfp, unsigned int order); +-struct folio *folio_alloc(gfp_t gfp, unsigned order); +-struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, ++struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); ++struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); ++struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr, bool hugepage); + #else +-static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) ++static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) + { +- return alloc_pages_node(numa_node_id(), gfp_mask, order); ++ return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); + } +-static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) ++static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) + { + return __folio_alloc_node(gfp, order, numa_node_id()); + } +-#define vma_alloc_folio(gfp, order, vma, addr, hugepage) \ +- folio_alloc(gfp, order) ++#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \ ++ folio_alloc_noprof(gfp, order) + #endif ++ ++#define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) ++#define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) ++#define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) ++ + #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) +-static inline struct page *alloc_page_vma(gfp_t gfp, ++ ++static inline struct page *alloc_page_vma_noprof(gfp_t gfp, + struct vm_area_struct *vma, unsigned long addr) + { +- struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr, false); ++ struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false); + + return &folio->page; + } ++#define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) ++ ++extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); ++#define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) + +-extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); +-extern unsigned long get_zeroed_page(gfp_t gfp_mask); ++extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask); ++#define get_zeroed_page(...) alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__)) ++ ++void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1); ++#define alloc_pages_exact(...) alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__)) + +-void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); + void free_pages_exact(void *virt, size_t size); +-__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); + +-#define __get_free_page(gfp_mask) \ +- __get_free_pages((gfp_mask), 0) ++__meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); ++#define alloc_pages_exact_nid(...) 
alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__)) ++ ++#define __get_free_page(gfp_mask) \ ++ __get_free_pages((gfp_mask), 0) + +-#define __get_dma_pages(gfp_mask, order) \ +- __get_free_pages((gfp_mask) | GFP_DMA, (order)) ++#define __get_dma_pages(gfp_mask, order) \ ++ __get_free_pages((gfp_mask) | GFP_DMA, (order)) + + extern void __free_pages(struct page *page, unsigned int order); + extern void free_pages(unsigned long addr, unsigned int order); +@@ -354,10 +375,14 @@ static inline bool pm_suspended_storage(void) + + #ifdef CONFIG_CONTIG_ALLOC + /* The below functions must be run on a range from a single zone. */ +-extern int alloc_contig_range(unsigned long start, unsigned long end, ++extern int alloc_contig_range_noprof(unsigned long start, unsigned long end, + unsigned migratetype, gfp_t gfp_mask); +-extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, +- int nid, nodemask_t *nodemask); ++#define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) ++ ++extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, ++ int nid, nodemask_t *nodemask); ++#define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) ++ + #endif + void free_contig_range(unsigned long pfn, unsigned long nr_pages); + +diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h +index 6583a5867..1c6573d69 100644 +--- a/include/linux/gfp_types.h ++++ b/include/linux/gfp_types.h +@@ -21,44 +21,86 @@ typedef unsigned int __bitwise gfp_t; + * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c + */ + ++enum { ++ ___GFP_DMA_BIT, ++ ___GFP_HIGHMEM_BIT, ++ ___GFP_DMA32_BIT, ++ ___GFP_MOVABLE_BIT, ++ ___GFP_RECLAIMABLE_BIT, ++ ___GFP_HIGH_BIT, ++ ___GFP_IO_BIT, ++ ___GFP_FS_BIT, ++ ___GFP_ZERO_BIT, ++ ___GFP_UNUSED_BIT, /* 0x200u unused */ ++ ___GFP_DIRECT_RECLAIM_BIT, ++ ___GFP_KSWAPD_RECLAIM_BIT, ++ ___GFP_WRITE_BIT, ++ ___GFP_NOWARN_BIT, ++ ___GFP_RETRY_MAYFAIL_BIT, ++ ___GFP_NOFAIL_BIT, ++ ___GFP_NORETRY_BIT, ++ ___GFP_MEMALLOC_BIT, ++ ___GFP_COMP_BIT, ++ ___GFP_NOMEMALLOC_BIT, ++ ___GFP_HARDWALL_BIT, ++ ___GFP_THISNODE_BIT, ++ ___GFP_ACCOUNT_BIT, ++ ___GFP_ZEROTAGS_BIT, ++#ifdef CONFIG_KASAN_HW_TAGS ++ ___GFP_SKIP_ZERO_BIT, ++ ___GFP_SKIP_KASAN_BIT, ++#endif ++#ifdef CONFIG_LOCKDEP ++ ___GFP_NOLOCKDEP_BIT, ++#endif ++#ifdef CONFIG_SLAB_OBJ_EXT ++ ___GFP_NO_OBJ_EXT_BIT, ++#endif ++ ___GFP_LAST_BIT ++}; ++ + /* Plain integer GFP bitmasks. Do not use this directly. 
*/ +-#define ___GFP_DMA 0x01u +-#define ___GFP_HIGHMEM 0x02u +-#define ___GFP_DMA32 0x04u +-#define ___GFP_MOVABLE 0x08u +-#define ___GFP_RECLAIMABLE 0x10u +-#define ___GFP_HIGH 0x20u +-#define ___GFP_IO 0x40u +-#define ___GFP_FS 0x80u +-#define ___GFP_ZERO 0x100u ++#define ___GFP_DMA BIT(___GFP_DMA_BIT) ++#define ___GFP_HIGHMEM BIT(___GFP_HIGHMEM_BIT) ++#define ___GFP_DMA32 BIT(___GFP_DMA32_BIT) ++#define ___GFP_MOVABLE BIT(___GFP_MOVABLE_BIT) ++#define ___GFP_RECLAIMABLE BIT(___GFP_RECLAIMABLE_BIT) ++#define ___GFP_HIGH BIT(___GFP_HIGH_BIT) ++#define ___GFP_IO BIT(___GFP_IO_BIT) ++#define ___GFP_FS BIT(___GFP_FS_BIT) ++#define ___GFP_ZERO BIT(___GFP_ZERO_BIT) + /* 0x200u unused */ +-#define ___GFP_DIRECT_RECLAIM 0x400u +-#define ___GFP_KSWAPD_RECLAIM 0x800u +-#define ___GFP_WRITE 0x1000u +-#define ___GFP_NOWARN 0x2000u +-#define ___GFP_RETRY_MAYFAIL 0x4000u +-#define ___GFP_NOFAIL 0x8000u +-#define ___GFP_NORETRY 0x10000u +-#define ___GFP_MEMALLOC 0x20000u +-#define ___GFP_COMP 0x40000u +-#define ___GFP_NOMEMALLOC 0x80000u +-#define ___GFP_HARDWALL 0x100000u +-#define ___GFP_THISNODE 0x200000u +-#define ___GFP_ACCOUNT 0x400000u +-#define ___GFP_ZEROTAGS 0x800000u ++#define ___GFP_DIRECT_RECLAIM BIT(___GFP_DIRECT_RECLAIM_BIT) ++#define ___GFP_KSWAPD_RECLAIM BIT(___GFP_KSWAPD_RECLAIM_BIT) ++#define ___GFP_WRITE BIT(___GFP_WRITE_BIT) ++#define ___GFP_NOWARN BIT(___GFP_NOWARN_BIT) ++#define ___GFP_RETRY_MAYFAIL BIT(___GFP_RETRY_MAYFAIL_BIT) ++#define ___GFP_NOFAIL BIT(___GFP_NOFAIL_BIT) ++#define ___GFP_NORETRY BIT(___GFP_NORETRY_BIT) ++#define ___GFP_MEMALLOC BIT(___GFP_MEMALLOC_BIT) ++#define ___GFP_COMP BIT(___GFP_COMP_BIT) ++#define ___GFP_NOMEMALLOC BIT(___GFP_NOMEMALLOC_BIT) ++#define ___GFP_HARDWALL BIT(___GFP_HARDWALL_BIT) ++#define ___GFP_THISNODE BIT(___GFP_THISNODE_BIT) ++#define ___GFP_ACCOUNT BIT(___GFP_ACCOUNT_BIT) ++#define ___GFP_ZEROTAGS BIT(___GFP_ZEROTAGS_BIT) + #ifdef CONFIG_KASAN_HW_TAGS +-#define ___GFP_SKIP_ZERO 0x1000000u +-#define ___GFP_SKIP_KASAN 0x2000000u ++#define ___GFP_SKIP_ZERO BIT(___GFP_SKIP_ZERO_BIT) ++#define ___GFP_SKIP_KASAN BIT(___GFP_SKIP_KASAN_BIT) + #else + #define ___GFP_SKIP_ZERO 0 + #define ___GFP_SKIP_KASAN 0 + #endif + #ifdef CONFIG_LOCKDEP +-#define ___GFP_NOLOCKDEP 0x4000000u ++#define ___GFP_NOLOCKDEP BIT(___GFP_NOLOCKDEP_BIT) + #else + #define ___GFP_NOLOCKDEP 0 + #endif +-/* If the above are modified, __GFP_BITS_SHIFT may need updating */ ++#ifdef CONFIG_SLAB_OBJ_EXT ++#define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT) ++#else ++#define ___GFP_NO_OBJ_EXT 0 ++#endif + + /* + * Physical address zone modifiers (see linux/mmzone.h - low four bits) +@@ -99,12 +141,15 @@ typedef unsigned int __bitwise gfp_t; + * node with no fallbacks or placement policy enforcements. + * + * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. ++ * ++ * %__GFP_NO_OBJ_EXT causes slab allocation to have no object extension. 
+ */ + #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) + #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) + #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) + #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) + #define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT) ++#define __GFP_NO_OBJ_EXT ((__force gfp_t)___GFP_NO_OBJ_EXT) + + /** + * DOC: Watermark modifiers +@@ -249,7 +294,7 @@ typedef unsigned int __bitwise gfp_t; + #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) + + /* Room for N __GFP_FOO bits */ +-#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP)) ++#define __GFP_BITS_SHIFT ___GFP_LAST_BIT + #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) + + /** +diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h +index 0ee140176..e67349e84 100644 +--- a/include/linux/hrtimer.h ++++ b/include/linux/hrtimer.h +@@ -16,7 +16,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + #include diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e2b836c2e..a774d074b 100644 --- a/include/linux/iomap.h @@ -93533,11 +95577,620 @@ index 000000000..647505010 +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); + +#endif // MEAN_AND_VAIRANCE_H_ +diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h +index 222d73701..3eb8975c1 100644 +--- a/include/linux/memcontrol.h ++++ b/include/linux/memcontrol.h +@@ -339,15 +339,32 @@ struct mem_cgroup { + extern struct mem_cgroup *root_mem_cgroup; + + enum page_memcg_data_flags { +- /* page->memcg_data is a pointer to an objcgs vector */ +- MEMCG_DATA_OBJCGS = (1UL << 0), ++ /* page->memcg_data is a pointer to an slabobj_ext vector */ ++ MEMCG_DATA_OBJEXTS = (1UL << 0), + /* page has been accounted as a non-slab kernel page */ + MEMCG_DATA_KMEM = (1UL << 1), + /* the next bit after the last actual flag */ + __NR_MEMCG_DATA_FLAGS = (1UL << 2), + }; + +-#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) ++#define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS ++ ++#else /* CONFIG_MEMCG */ ++ ++#define __FIRST_OBJEXT_FLAG (1UL << 0) ++ ++#endif /* CONFIG_MEMCG */ ++ ++enum objext_flags { ++ /* slabobj_ext vector failed to allocate */ ++ OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG, ++ /* the next bit after the last actual flag */ ++ __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), ++}; ++ ++#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) ++ ++#ifdef CONFIG_MEMCG + + static inline bool folio_memcg_kmem(struct folio *folio); + +@@ -378,10 +395,10 @@ static inline struct mem_cgroup *__folio_memcg(struct folio *folio) + unsigned long memcg_data = folio->memcg_data; + + VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); +- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); ++ VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); + +- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); ++ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); + } + + /* +@@ -399,10 +416,10 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) + unsigned long memcg_data = folio->memcg_data; + + VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); +- VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); ++ VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); + VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); + +- return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); ++ return (struct obj_cgroup *)(memcg_data & 
~OBJEXTS_FLAGS_MASK); + } + + /* +@@ -459,11 +476,11 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) + if (memcg_data & MEMCG_DATA_KMEM) { + struct obj_cgroup *objcg; + +- objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); ++ objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); + return obj_cgroup_memcg(objcg); + } + +- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); ++ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); + } + + /* +@@ -496,17 +513,17 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) + */ + unsigned long memcg_data = READ_ONCE(folio->memcg_data); + +- if (memcg_data & MEMCG_DATA_OBJCGS) ++ if (memcg_data & MEMCG_DATA_OBJEXTS) + return NULL; + + if (memcg_data & MEMCG_DATA_KMEM) { + struct obj_cgroup *objcg; + +- objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); ++ objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); + return obj_cgroup_memcg(objcg); + } + +- return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); ++ return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); + } + + static inline struct mem_cgroup *page_memcg_check(struct page *page) +@@ -542,7 +559,7 @@ static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *ob + static inline bool folio_memcg_kmem(struct folio *folio) + { + VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); +- VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio); ++ VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio); + return folio->memcg_data & MEMCG_DATA_KMEM; + } + +@@ -1606,6 +1623,19 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, + } + #endif /* CONFIG_MEMCG */ + ++/* ++ * Extended information for slab objects stored as an array in page->memcg_data ++ * if MEMCG_DATA_OBJEXTS is set. ++ */ ++struct slabobj_ext { ++#ifdef CONFIG_MEMCG_KMEM ++ struct obj_cgroup *objcg; ++#endif ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++ union codetag_ref ref; ++#endif ++} __aligned(8); ++ + static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) + { + __mod_lruvec_kmem_state(p, idx, 1); +diff --git a/include/linux/mempool.h b/include/linux/mempool.h +index 4aae6c06c..9fa126aa1 100644 +--- a/include/linux/mempool.h ++++ b/include/linux/mempool.h +@@ -5,6 +5,8 @@ + #ifndef _LINUX_MEMPOOL_H + #define _LINUX_MEMPOOL_H + ++#include ++#include + #include + #include + +@@ -39,18 +41,32 @@ void mempool_exit(mempool_t *pool); + int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id); +-int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, ++ ++int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); ++#define mempool_init(...) \ ++ alloc_hooks(mempool_init_noprof(__VA_ARGS__)) + + extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data); +-extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, ++ ++extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int nid); ++#define mempool_create_node(...) 
\ ++ alloc_hooks(mempool_create_node_noprof(__VA_ARGS__)) ++ ++#define mempool_create(_min_nr, _alloc_fn, _free_fn, _pool_data) \ ++ mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \ ++ GFP_KERNEL, NUMA_NO_NODE) + + extern int mempool_resize(mempool_t *pool, int new_min_nr); + extern void mempool_destroy(mempool_t *pool); +-extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; ++ ++extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; ++#define mempool_alloc(...) \ ++ alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) ++ + extern void mempool_free(void *element, mempool_t *pool); + + /* +@@ -61,19 +77,10 @@ extern void mempool_free(void *element, mempool_t *pool); + void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data); + void mempool_free_slab(void *element, void *pool_data); + +-static inline int +-mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc) +-{ +- return mempool_init(pool, min_nr, mempool_alloc_slab, +- mempool_free_slab, (void *) kc); +-} +- +-static inline mempool_t * +-mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) +-{ +- return mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab, +- (void *) kc); +-} ++#define mempool_init_slab_pool(_pool, _min_nr, _kc) \ ++ mempool_init(_pool, (_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc)) ++#define mempool_create_slab_pool(_min_nr, _kc) \ ++ mempool_create((_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc)) + + /* + * a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the +@@ -82,17 +89,12 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) + void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data); + void mempool_kfree(void *element, void *pool_data); + +-static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size) +-{ +- return mempool_init(pool, min_nr, mempool_kmalloc, +- mempool_kfree, (void *) size); +-} +- +-static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) +-{ +- return mempool_create(min_nr, mempool_kmalloc, mempool_kfree, +- (void *) size); +-} ++#define mempool_init_kmalloc_pool(_pool, _min_nr, _size) \ ++ mempool_init(_pool, (_min_nr), mempool_kmalloc, mempool_kfree, \ ++ (void *)(unsigned long)(_size)) ++#define mempool_create_kmalloc_pool(_min_nr, _size) \ ++ mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \ ++ (void *)(unsigned long)(_size)) + + /* + * A mempool_alloc_t and mempool_free_t for a simple page allocator that +@@ -101,16 +103,11 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) + void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data); + void mempool_free_pages(void *element, void *pool_data); + +-static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order) +-{ +- return mempool_init(pool, min_nr, mempool_alloc_pages, +- mempool_free_pages, (void *)(long)order); +-} +- +-static inline mempool_t *mempool_create_page_pool(int min_nr, int order) +-{ +- return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages, +- (void *)(long)order); +-} ++#define mempool_init_page_pool(_pool, _min_nr, _order) \ ++ mempool_init(_pool, (_min_nr), mempool_alloc_pages, \ ++ mempool_free_pages, (void *)(long)(_order)) ++#define mempool_create_page_pool(_min_nr, _order) \ ++ mempool_create((_min_nr), mempool_alloc_pages, \ ++ mempool_free_pages, (void *)(long)(_order)) + + #endif /* _LINUX_MEMPOOL_H */ +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 
3c6c4c836..88b45fb4f 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -5,6 +5,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -2925,6 +2926,13 @@ extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); + /* Free the reserved page into the buddy system, so it gets managed. */ + static inline void free_reserved_page(struct page *page) + { ++ union codetag_ref *ref; ++ ++ ref = get_page_tag_ref(page); ++ if (ref) { ++ set_codetag_empty(ref); ++ put_page_tag_ref(ref); ++ } + ClearPageReserved(page); + init_page_count(page); + __free_page(page); +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 306a3d1a0..e79303e1e 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -194,7 +194,7 @@ struct page { + /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ + atomic_t _refcount; + +-#ifdef CONFIG_MEMCG ++#ifdef CONFIG_SLAB_OBJ_EXT + unsigned long memcg_data; + #endif + +@@ -320,7 +320,7 @@ struct folio { + void *private; + atomic_t _mapcount; + atomic_t _refcount; +-#ifdef CONFIG_MEMCG ++#ifdef CONFIG_SLAB_OBJ_EXT + unsigned long memcg_data; + #endif + /* private: the union with struct page is transitional */ +diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h +index bb0ee8052..fda37b6df 100644 +--- a/include/linux/nodemask.h ++++ b/include/linux/nodemask.h +@@ -93,10 +93,10 @@ + #include + #include + #include ++#include + #include + #include + +-typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; + extern nodemask_t _unused_nodemask_arg_; + + /** +diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h +new file mode 100644 +index 000000000..84c2f47c4 +--- /dev/null ++++ b/include/linux/nodemask_types.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef __LINUX_NODEMASK_TYPES_H ++#define __LINUX_NODEMASK_TYPES_H ++ ++#include ++ ++typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; ++ ++#endif /* __LINUX_NODEMASK_TYPES_H */ +diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h +index 67314f648..cff15ee54 100644 +--- a/include/linux/page_ext.h ++++ b/include/linux/page_ext.h +@@ -4,7 +4,6 @@ + + #include + #include +-#include + + struct pglist_data; + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 08328b579..347ba7f86 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -467,14 +467,17 @@ static inline void *detach_page_private(struct page *page) + } + + #ifdef CONFIG_NUMA +-struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order); ++struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); + #else +-static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) ++static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) + { +- return folio_alloc(gfp, order); ++ return folio_alloc_noprof(gfp, order); + } + #endif + ++#define filemap_alloc_folio(...) 
\ ++ alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__)) ++ + static inline struct page *__page_cache_alloc(gfp_t gfp) + { + return &filemap_alloc_folio(gfp, 0)->page; +diff --git a/include/linux/percpu.h b/include/linux/percpu.h +index 1338ea2aa..dc50dedb0 100644 +--- a/include/linux/percpu.h ++++ b/include/linux/percpu.h +@@ -2,12 +2,14 @@ + #ifndef __LINUX_PERCPU_H + #define __LINUX_PERCPU_H + ++#include + #include + #include + #include + #include + #include + #include ++#include + + #include + +@@ -116,7 +118,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); + #endif + +-extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1); + extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); + extern bool is_kernel_percpu_address(unsigned long addr); + +@@ -124,10 +125,15 @@ extern bool is_kernel_percpu_address(unsigned long addr); + extern void __init setup_per_cpu_areas(void); + #endif + +-extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1); +-extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1); +-extern void free_percpu(void __percpu *__pdata); +-extern phys_addr_t per_cpu_ptr_to_phys(void *addr); ++extern void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, ++ gfp_t gfp) __alloc_size(1); ++ ++#define __alloc_percpu_gfp(_size, _align, _gfp) \ ++ alloc_hooks(pcpu_alloc_noprof(_size, _align, false, _gfp)) ++#define __alloc_percpu(_size, _align) \ ++ alloc_hooks(pcpu_alloc_noprof(_size, _align, false, GFP_KERNEL)) ++#define __alloc_reserved_percpu(_size, _align) \ ++ alloc_hooks(pcpu_alloc_noprof(_size, _align, true, GFP_KERNEL)) + + #define alloc_percpu_gfp(type, gfp) \ + (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ +@@ -136,6 +142,9 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); + (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ + __alignof__(type)) + ++extern void free_percpu(void __percpu *__pdata); ++extern phys_addr_t per_cpu_ptr_to_phys(void *addr); ++ + extern unsigned long pcpu_nr_pages(void); + + #endif /* __LINUX_PERCPU_H */ +diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h +new file mode 100644 +index 000000000..ae9b0f359 +--- /dev/null ++++ b/include/linux/pgalloc_tag.h +@@ -0,0 +1,105 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * page allocation tagging ++ */ ++#ifndef _LINUX_PGALLOC_TAG_H ++#define _LINUX_PGALLOC_TAG_H ++ ++#include ++ ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++ ++#include ++ ++extern struct page_ext_operations page_alloc_tagging_ops; ++extern struct page_ext *page_ext_get(struct page *page); ++extern void page_ext_put(struct page_ext *page_ext); ++ ++static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext) ++{ ++ return (void *)page_ext + page_alloc_tagging_ops.offset; ++} ++ ++static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref) ++{ ++ return (void *)ref - page_alloc_tagging_ops.offset; ++} ++ ++static inline union codetag_ref *get_page_tag_ref(struct page *page) ++{ ++ if (page && mem_alloc_profiling_enabled()) { ++ struct page_ext *page_ext = page_ext_get(page); ++ ++ if (page_ext) ++ return codetag_ref_from_page_ext(page_ext); ++ } ++ return NULL; ++} ++ ++static inline void put_page_tag_ref(union codetag_ref *ref) ++{ ++ page_ext_put(page_ext_from_codetag_ref(ref)); ++} ++ ++static inline void pgalloc_tag_add(struct page *page, struct 
task_struct *task, ++ unsigned int order) ++{ ++ union codetag_ref *ref = get_page_tag_ref(page); ++ ++ if (ref) { ++ alloc_tag_add(ref, task->alloc_tag, PAGE_SIZE << order); ++ put_page_tag_ref(ref); ++ } ++} ++ ++static inline void pgalloc_tag_sub(struct page *page, unsigned int order) ++{ ++ union codetag_ref *ref = get_page_tag_ref(page); ++ ++ if (ref) { ++ alloc_tag_sub(ref, PAGE_SIZE << order); ++ put_page_tag_ref(ref); ++ } ++} ++ ++static inline void pgalloc_tag_split(struct page *page, unsigned int nr) ++{ ++ int i; ++ struct page_ext *page_ext; ++ union codetag_ref *ref; ++ struct alloc_tag *tag; ++ ++ if (!mem_alloc_profiling_enabled()) ++ return; ++ ++ page_ext = page_ext_get(page); ++ if (unlikely(!page_ext)) ++ return; ++ ++ ref = codetag_ref_from_page_ext(page_ext); ++ if (!ref->ct) ++ goto out; ++ ++ tag = ct_to_alloc_tag(ref->ct); ++ page_ext = page_ext_next(page_ext); ++ for (i = 1; i < nr; i++) { ++ /* New reference with 0 bytes accounted */ ++ alloc_tag_add(codetag_ref_from_page_ext(page_ext), tag, 0); ++ page_ext = page_ext_next(page_ext); ++ } ++out: ++ page_ext_put(page_ext); ++} ++ ++#else /* CONFIG_MEM_ALLOC_PROFILING */ ++ ++static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } ++static inline void put_page_tag_ref(union codetag_ref *ref) {} ++static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, ++ unsigned int order) {} ++static inline void pgalloc_tag_sub(struct page *page, unsigned int order) {} ++static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {} ++ ++#endif /* CONFIG_MEM_ALLOC_PROFILING */ ++ ++#endif /* _LINUX_PGALLOC_TAG_H */ +diff --git a/include/linux/prandom.h b/include/linux/prandom.h +index f2ed5b72b..f7f1e5251 100644 +--- a/include/linux/prandom.h ++++ b/include/linux/prandom.h +@@ -10,7 +10,6 @@ + + #include + #include +-#include + #include + + struct rnd_state { +diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h +index 57467cbf4..92a8e670c 100644 +--- a/include/linux/rhashtable-types.h ++++ b/include/linux/rhashtable-types.h +@@ -9,6 +9,7 @@ + #ifndef _LINUX_RHASHTABLE_TYPES_H + #define _LINUX_RHASHTABLE_TYPES_H + ++#include + #include + #include + #include +@@ -88,6 +89,7 @@ struct rhashtable { + struct mutex mutex; + spinlock_t lock; + atomic_t nelems; ++ struct alloc_tag *alloc_tag; + }; + + /** +@@ -127,9 +129,12 @@ struct rhashtable_iter { + bool end_of_table; + }; + +-int rhashtable_init(struct rhashtable *ht, ++int rhashtable_init_noprof(struct rhashtable *ht, + const struct rhashtable_params *params); +-int rhltable_init(struct rhltable *hlt, ++#define rhashtable_init(...) alloc_hooks(rhashtable_init_noprof(__VA_ARGS__)) ++ ++int rhltable_init_noprof(struct rhltable *hlt, + const struct rhashtable_params *params); ++#define rhltable_init(...) 
alloc_hooks(rhltable_init_noprof(__VA_ARGS__)) + + #endif /* _LINUX_RHASHTABLE_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h -index eed5d65b8..5b1137c72 100644 +index 847332470..5c359b8b2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -871,6 +871,7 @@ struct task_struct { +@@ -20,7 +20,7 @@ + #include + #include + #include +-#include ++#include + #include + #include + #include +@@ -763,6 +763,10 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++ struct alloc_tag *alloc_tag; ++#endif ++ + #ifdef CONFIG_SMP + int on_cpu; + struct __call_single_node wake_entry; +@@ -802,6 +806,7 @@ struct task_struct { + struct task_group *sched_task_group; + #endif + ++ + #ifdef CONFIG_UCLAMP_TASK + /* + * Clamp values requested for a scheduling entity. +@@ -871,6 +876,7 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; @@ -93545,7 +96198,7 @@ index eed5d65b8..5b1137c72 100644 int exit_state; int exit_code; -@@ -1163,7 +1164,7 @@ struct task_struct { +@@ -1163,7 +1169,7 @@ struct task_struct { #endif #ifdef CONFIG_LOCKDEP @@ -93554,6 +96207,30 @@ index eed5d65b8..5b1137c72 100644 u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; +@@ -2446,4 +2452,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } + + extern void sched_set_stop_task(int cpu, struct task_struct *stop); + ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) ++{ ++ swap(current->alloc_tag, tag); ++ return tag; ++} ++ ++static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) ++{ ++#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG ++ WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); ++#endif ++ current->alloc_tag = old; ++} ++#else ++static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; } ++#define alloc_tag_restore(_tag, _old) ++#endif ++ + #endif diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 515d7fcb9..cc02410f2 100644 --- a/include/linux/seq_buf.h @@ -94008,34 +96685,540 @@ index 000000000..394da423c +void six_lock_readers_add(struct six_lock *, int); + +#endif /* _LINUX_SIX_H */ +diff --git a/include/linux/slab.h b/include/linux/slab.h +index 6b3e155b7..f7bc3ab70 100644 +--- a/include/linux/slab.h ++++ b/include/linux/slab.h +@@ -147,6 +147,13 @@ + #endif + #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ + ++#ifdef CONFIG_SLAB_OBJ_EXT ++/* Slab created using create_boot_cache */ ++#define SLAB_NO_OBJ_EXT ((slab_flags_t __force)0x20000000U) ++#else ++#define SLAB_NO_OBJ_EXT 0 ++#endif ++ + /* + * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. + * +@@ -206,7 +213,9 @@ int kmem_cache_shrink(struct kmem_cache *s); + /* + * Common kmalloc functions provided by all allocators + */ +-void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); ++void * __must_check krealloc_noprof(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); ++#define krealloc(...) 
alloc_hooks(krealloc_noprof(__VA_ARGS__)) ++ + void kfree(const void *objp); + void kfree_sensitive(const void *objp); + size_t __ksize(const void *objp); +@@ -444,7 +453,10 @@ static __always_inline unsigned int __kmalloc_index(size_t size, + static_assert(PAGE_SHIFT <= 20); + #define kmalloc_index(s) __kmalloc_index(s, true) + +-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); ++#include ++ ++void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); ++#define __kmalloc(...) alloc_hooks(__kmalloc_noprof(__VA_ARGS__)) + + /** + * kmem_cache_alloc - Allocate an object +@@ -456,9 +468,13 @@ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_siz + * + * Return: pointer to the new object or %NULL in case of error + */ +-void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; +-void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, +- gfp_t gfpflags) __assume_slab_alignment __malloc; ++void *kmem_cache_alloc_noprof(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; ++#define kmem_cache_alloc(...) alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__)) ++ ++void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, ++ gfp_t gfpflags) __assume_slab_alignment __malloc; ++#define kmem_cache_alloc_lru(...) alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__)) ++ + void kmem_cache_free(struct kmem_cache *s, void *objp); + + /* +@@ -469,29 +485,40 @@ void kmem_cache_free(struct kmem_cache *s, void *objp); + * Note that interrupts must be enabled when calling these functions. + */ + void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); +-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p); ++ ++int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p); ++#define kmem_cache_alloc_bulk(...) alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__)) + + static __always_inline void kfree_bulk(size_t size, void **p) + { + kmem_cache_free_bulk(NULL, size, p); + } + +-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment ++void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment + __alloc_size(1); +-void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment +- __malloc; ++#define __kmalloc_node(...) alloc_hooks(__kmalloc_node_noprof(__VA_ARGS__)) + +-void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) ++void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment ++ __malloc; ++#define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) ++ ++void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t flags, size_t size) + __assume_kmalloc_alignment __alloc_size(3); + +-void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, +- int node, size_t size) __assume_kmalloc_alignment ++void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, ++ int node, size_t size) __assume_kmalloc_alignment + __alloc_size(4); +-void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment ++#define kmalloc_trace(...) alloc_hooks(kmalloc_trace_noprof(__VA_ARGS__)) ++ ++#define kmalloc_node_trace(...) 
alloc_hooks(kmalloc_node_trace_noprof(__VA_ARGS__)) ++ ++void *kmalloc_large_noprof(size_t size, gfp_t flags) __assume_page_alignment + __alloc_size(1); ++#define kmalloc_large(...) alloc_hooks(kmalloc_large_noprof(__VA_ARGS__)) + +-void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_alignment ++void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) __assume_page_alignment + __alloc_size(1); ++#define kmalloc_large_node(...) alloc_hooks(kmalloc_large_node_noprof(__VA_ARGS__)) + + /** + * kmalloc - allocate kernel memory +@@ -547,37 +574,39 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align + * Try really hard to succeed the allocation but fail + * eventually. + */ +-static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) ++static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags) + { + if (__builtin_constant_p(size) && size) { + unsigned int index; + + if (size > KMALLOC_MAX_CACHE_SIZE) +- return kmalloc_large(size, flags); ++ return kmalloc_large_noprof(size, flags); + + index = kmalloc_index(size); +- return kmalloc_trace( ++ return kmalloc_trace_noprof( + kmalloc_caches[kmalloc_type(flags)][index], + flags, size); + } +- return __kmalloc(size, flags); ++ return __kmalloc_noprof(size, flags); + } ++#define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) + +-static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) ++static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node) + { + if (__builtin_constant_p(size) && size) { + unsigned int index; + + if (size > KMALLOC_MAX_CACHE_SIZE) +- return kmalloc_large_node(size, flags, node); ++ return kmalloc_large_node_noprof(size, flags, node); + + index = kmalloc_index(size); +- return kmalloc_node_trace( ++ return kmalloc_node_trace_noprof( + kmalloc_caches[kmalloc_type(flags)][index], + flags, node, size); + } +- return __kmalloc_node(size, flags, node); ++ return __kmalloc_node_noprof(size, flags, node); + } ++#define kmalloc_node(...) alloc_hooks(kmalloc_node_noprof(__VA_ARGS__)) + + /** + * kmalloc_array - allocate memory for an array. +@@ -585,16 +614,17 @@ static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t fla + * @size: element size. + * @flags: the type of memory to allocate (see kmalloc). + */ +-static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags) ++static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags) + { + size_t bytes; + + if (unlikely(check_mul_overflow(n, size, &bytes))) + return NULL; + if (__builtin_constant_p(n) && __builtin_constant_p(size)) +- return kmalloc(bytes, flags); +- return __kmalloc(bytes, flags); ++ return kmalloc_noprof(bytes, flags); ++ return kmalloc_noprof(bytes, flags); + } ++#define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__)) + + /** + * krealloc_array - reallocate memory for an array. 
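The conversions above all follow the same pattern: the familiar name survives as a macro, the implementation moves to a _noprof variant, and alloc_hooks() ties the two together. As a sketch of what that means at a call site (an editorial illustration, not a hunk from this patch; alloc_hooks() itself comes from <linux/alloc_tag.h>, added earlier in the series):

/*
 * Editorial sketch, assuming the alloc_hooks() definition from
 * <linux/alloc_tag.h>: the macro declares a static per-callsite
 * alloc_tag, makes it visible to the allocator via current->alloc_tag
 * for the duration of the call, and the allocator hooks charge the
 * bytes to that tag.
 */
#include <linux/slab.h>

static int example_alloc(void)
{
	void *p = kmalloc(128, GFP_KERNEL);
	/* now expands to alloc_hooks(kmalloc_noprof(128, GFP_KERNEL)) */

	if (!p)
		return -ENOMEM;

	kfree(p);	/* freeing is untagged; the counter is decremented
			 * through the tag reference stored with the object */
	return 0;
}

Because only the outermost macro layer is instrumented, internal helpers such as kmalloc_array_noprof() above call the _noprof spellings directly, so each allocation is charged exactly once, to the line that initiated it.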
+@@ -603,18 +633,19 @@ static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_ + * @new_size: new size of a single member of the array + * @flags: the type of memory to allocate (see kmalloc) + */ +-static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, +- size_t new_n, +- size_t new_size, +- gfp_t flags) ++static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p, ++ size_t new_n, ++ size_t new_size, ++ gfp_t flags) + { + size_t bytes; + + if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) + return NULL; + +- return krealloc(p, bytes, flags); ++ return krealloc_noprof(p, bytes, flags); + } ++#define krealloc_array(...) alloc_hooks(krealloc_array_noprof(__VA_ARGS__)) + + /** + * kcalloc - allocate memory for an array. The memory is set to zero. +@@ -622,16 +653,11 @@ static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, + * @size: element size. + * @flags: the type of memory to allocate (see kmalloc). + */ +-static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags) +-{ +- return kmalloc_array(n, size, flags | __GFP_ZERO); +-} ++#define kcalloc(_n, _size, _flags) kmalloc_array(_n, _size, (_flags) | __GFP_ZERO) + +-void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, ++void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, int node, + unsigned long caller) __alloc_size(1); +-#define kmalloc_node_track_caller(size, flags, node) \ +- __kmalloc_node_track_caller(size, flags, node, \ +- _RET_IP_) ++#define kmalloc_node_track_caller(...) alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_)) + + /* + * kmalloc_track_caller is a special version of kmalloc that records the +@@ -641,11 +667,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, + * allocator where we care about the real place the memory allocation + * request comes from. + */ +-#define kmalloc_track_caller(size, flags) \ +- __kmalloc_node_track_caller(size, flags, \ +- NUMA_NO_NODE, _RET_IP_) ++#define kmalloc_track_caller(...) kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE) + +-static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, ++static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, + int node) + { + size_t bytes; +@@ -653,75 +677,51 @@ static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, + if (unlikely(check_mul_overflow(n, size, &bytes))) + return NULL; + if (__builtin_constant_p(n) && __builtin_constant_p(size)) +- return kmalloc_node(bytes, flags, node); +- return __kmalloc_node(bytes, flags, node); ++ return kmalloc_node_noprof(bytes, flags, node); ++ return __kmalloc_node_noprof(bytes, flags, node); + } ++#define kmalloc_array_node(...) alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__)) + +-static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) +-{ +- return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); +-} ++#define kcalloc_node(_n, _size, _flags, _node) kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node) + + /* + * Shortcuts + */ +-static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags) +-{ +- return kmem_cache_alloc(k, flags | __GFP_ZERO); +-} ++#define kmem_cache_zalloc(_k, _flags) kmem_cache_alloc(_k, (_flags)|__GFP_ZERO) + + /** + * kzalloc - allocate memory. The memory is set to zero. 
+ * @size: how many bytes of memory are required. + * @flags: the type of memory to allocate (see kmalloc). + */ +-static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags) +-{ +- return kmalloc(size, flags | __GFP_ZERO); +-} +- +-/** +- * kzalloc_node - allocate zeroed memory from a particular memory node. +- * @size: how many bytes of memory are required. +- * @flags: the type of memory to allocate (see kmalloc). +- * @node: memory node from which to allocate +- */ +-static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node) ++static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags) + { +- return kmalloc_node(size, flags | __GFP_ZERO, node); ++ return kmalloc_noprof(size, flags | __GFP_ZERO); + } ++#define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__)) ++#define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node) + +-extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1); +-static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags) +-{ +- return kvmalloc_node(size, flags, NUMA_NO_NODE); +-} +-static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node) +-{ +- return kvmalloc_node(size, flags | __GFP_ZERO, node); +-} +-static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags) +-{ +- return kvmalloc(size, flags | __GFP_ZERO); +-} ++extern void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) __alloc_size(1); ++#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__)) + +-static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags) +-{ +- size_t bytes; ++#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE) ++#define kvzalloc(_size, _flags) kvmalloc(_size, _flags|__GFP_ZERO) + +- if (unlikely(check_mul_overflow(n, size, &bytes))) +- return NULL; ++#define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, _flags|__GFP_ZERO, _node) + +- return kvmalloc(bytes, flags); +-} ++#define kvmalloc_array(_n, _size, _flags) \ ++({ \ ++ size_t _bytes; \ ++ \ ++ !check_mul_overflow(_n, _size, &_bytes) ? kvmalloc(_bytes, _flags) : NULL; \ ++}) + +-static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags) +-{ +- return kvmalloc_array(n, size, flags | __GFP_ZERO); +-} ++#define kvcalloc(_n, _size, _flags) kvmalloc_array(_n, _size, _flags|__GFP_ZERO) + +-extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) ++extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) + __realloc_size(3); ++#define kvrealloc(...) 
alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) ++ + extern void kvfree(const void *addr); + extern void kvfree_sensitive(const void *addr, size_t len); + +diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h +index a61e7d55d..23f14dcb8 100644 +--- a/include/linux/slab_def.h ++++ b/include/linux/slab_def.h +@@ -107,7 +107,7 @@ static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *sla + * reciprocal_divide(offset, cache->reciprocal_buffer_size) + */ + static inline unsigned int obj_to_index(const struct kmem_cache *cache, +- const struct slab *slab, void *obj) ++ const struct slab *slab, const void *obj) + { + u32 offset = (obj - slab->s_mem); + return reciprocal_divide(offset, cache->reciprocal_buffer_size); +diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h +index f6df03f93..e8be5b368 100644 +--- a/include/linux/slub_def.h ++++ b/include/linux/slub_def.h +@@ -176,14 +176,14 @@ static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *sla + + /* Determine object index from a given position */ + static inline unsigned int __obj_to_index(const struct kmem_cache *cache, +- void *addr, void *obj) ++ void *addr, const void *obj) + { + return reciprocal_divide(kasan_reset_tag(obj) - addr, + cache->reciprocal_size); + } + + static inline unsigned int obj_to_index(const struct kmem_cache *cache, +- const struct slab *slab, void *obj) ++ const struct slab *slab, const void *obj) + { + if (is_kfence_address(obj)) + return 0; +diff --git a/include/linux/string.h b/include/linux/string.h +index c062c581a..198ca51ed 100644 +--- a/include/linux/string.h ++++ b/include/linux/string.h +@@ -96,6 +96,7 @@ extern char * strpbrk(const char *,const char *); + #ifndef __HAVE_ARCH_STRSEP + extern char * strsep(char **,const char *); + #endif ++extern char *strsep_no_empty(char **, const char *); + #ifndef __HAVE_ARCH_STRSPN + extern __kernel_size_t strspn(const char *,const char *); + #endif +@@ -176,7 +177,9 @@ extern void kfree_const(const void *x); + extern char *kstrdup(const char *s, gfp_t gfp) __malloc; + extern const char *kstrdup_const(const char *s, gfp_t gfp); + extern char *kstrndup(const char *s, size_t len, gfp_t gfp); +-extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); ++extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_size(2); ++#define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__)) ++ + extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); + extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); + diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h -index fae6beaaa..44148f8fe 100644 +index fae6beaaa..ae51580b9 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h -@@ -23,8 +23,8 @@ enum string_size_units { - STRING_UNITS_2, /* use binary powers of 2^10 */ +@@ -16,15 +16,14 @@ static inline bool string_is_terminated(const char *s, int len) + return memchr(s, '\0', len) ? 
true : false; + } + +-/* Descriptions of the types of units to +- * print in */ +-enum string_size_units { +- STRING_UNITS_10, /* use powers of 10^3 (standard SI) */ +- STRING_UNITS_2, /* use binary powers of 2^10 */ ++enum string_size_flags { ++ STRING_SIZE_BASE2 = (1 << 0), ++ STRING_SIZE_NOSPACE = (1 << 1), ++ STRING_SIZE_NOBYTES = (1 << 2), }; -void string_get_size(u64 size, u64 blk_size, enum string_size_units units, - char *buf, int len); -+int string_get_size(u64 size, u64 blk_size, enum string_size_units units, ++int string_get_size(u64 size, u64 blk_size, enum string_size_flags flags, + char *buf, int len); int parse_int_array_user(const char __user *from, size_t count, int **array); -diff --git a/include/linux/uio.h b/include/linux/uio.h -index 044c1d8c2..f4e29d949 100644 ---- a/include/linux/uio.h -+++ b/include/linux/uio.h -@@ -177,6 +177,8 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) +diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h +index bb9d3f554..d8e0cacfc 100644 +--- a/include/linux/time_namespace.h ++++ b/include/linux/time_namespace.h +@@ -11,6 +11,8 @@ + struct user_namespace; + extern struct user_namespace init_user_ns; + ++struct vm_area_struct; ++ + struct timens_offsets { + struct timespec64 monotonic; + struct timespec64 boottime; +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index c720be70c..106d78e75 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -2,6 +2,8 @@ + #ifndef _LINUX_VMALLOC_H + #define _LINUX_VMALLOC_H + ++#include ++#include + #include + #include + #include +@@ -137,26 +139,54 @@ extern unsigned long vmalloc_nr_pages(void); + static inline unsigned long vmalloc_nr_pages(void) { return 0; } + #endif + +-extern void *vmalloc(unsigned long size) __alloc_size(1); +-extern void *vzalloc(unsigned long size) __alloc_size(1); +-extern void *vmalloc_user(unsigned long size) __alloc_size(1); +-extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); +-extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); +-extern void *vmalloc_32(unsigned long size) __alloc_size(1); +-extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); +-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +-extern void *__vmalloc_node_range(unsigned long size, unsigned long align, ++extern void *vmalloc_noprof(unsigned long size) __alloc_size(1); ++#define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__)) ++ ++extern void *vzalloc_noprof(unsigned long size) __alloc_size(1); ++#define vzalloc(...) alloc_hooks(vzalloc_noprof(__VA_ARGS__)) ++ ++extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1); ++#define vmalloc_user(...) alloc_hooks(vmalloc_user_noprof(__VA_ARGS__)) ++ ++extern void *vmalloc_node_noprof(unsigned long size, int node) __alloc_size(1); ++#define vmalloc_node(...) alloc_hooks(vmalloc_node_noprof(__VA_ARGS__)) ++ ++extern void *vzalloc_node_noprof(unsigned long size, int node) __alloc_size(1); ++#define vzalloc_node(...) alloc_hooks(vzalloc_node_noprof(__VA_ARGS__)) ++ ++extern void *vmalloc_32_noprof(unsigned long size) __alloc_size(1); ++#define vmalloc_32(...) alloc_hooks(vmalloc_32_noprof(__VA_ARGS__)) ++ ++extern void *vmalloc_32_user_noprof(unsigned long size) __alloc_size(1); ++#define vmalloc_32_user(...) alloc_hooks(vmalloc_32_user_noprof(__VA_ARGS__)) ++ ++extern void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); ++#define __vmalloc(...) 
alloc_hooks(__vmalloc_noprof(__VA_ARGS__)) ++ ++extern void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) __alloc_size(1); +-void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, ++#define __vmalloc_node_range(...) alloc_hooks(__vmalloc_node_range_noprof(__VA_ARGS__)) ++ ++void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, + int node, const void *caller) __alloc_size(1); +-void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1); ++#define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) ++ ++void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); ++#define vmalloc_huge(...) alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__)) ++ ++extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); ++#define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__)) ++ ++extern void *vmalloc_array_noprof(size_t n, size_t size) __alloc_size(1, 2); ++#define vmalloc_array(...) alloc_hooks(vmalloc_array_noprof(__VA_ARGS__)) ++ ++extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); ++#define __vcalloc(...) alloc_hooks(__vcalloc_noprof(__VA_ARGS__)) + +-extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); +-extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2); +-extern void *__vcalloc(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); +-extern void *vcalloc(size_t n, size_t size) __alloc_size(1, 2); ++extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); ++#define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) + + extern void vfree(const void *addr); + extern void vfree_atomic(const void *addr); +diff --git a/init/Kconfig b/init/Kconfig +index b6d38eccc..cec6bac1a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -940,10 +940,14 @@ config CGROUP_FAVOR_DYNMODS + + Say N if unsure. + ++config SLAB_OBJ_EXT ++ bool ++ + config MEMCG + bool "Memory controller" + select PAGE_COUNTER + select EVENTFD ++ select SLAB_OBJ_EXT + help + Provides control over the memory footprint of tasks in a cgroup. 
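Not every allocation should be charged to the line that happens to execute it; a library that allocates on behalf of many callers wants the accounting to follow its caller. The alloc_tag_save()/alloc_tag_restore() helpers added to <linux/sched.h> earlier support exactly that, and lib/rhashtable.c uses them further down in this patch. A minimal sketch of the pattern (editorial; struct my_table and my_table_grow() are made-up names):

struct my_table {
	struct alloc_tag *alloc_tag;	/* captured from current->alloc_tag at init time */
	void *buckets;
};

static void *my_table_grow(struct my_table *t, size_t bytes)
{
	struct alloc_tag * __maybe_unused old = alloc_tag_save(t->alloc_tag);
	void *p;

	/* charged to the saved tag, i.e. to whoever created the table */
	p = kvmalloc_node_noprof(bytes, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE);

	alloc_tag_restore(t->alloc_tag, old);
	return p;
}

The __maybe_unused annotation matters because, with CONFIG_MEM_ALLOC_PROFILING disabled, alloc_tag_restore() compiles away and the saved value would otherwise trigger an unused-variable warning.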
- size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, - size_t bytes, struct iov_iter *i); -+size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, -+ size_t bytes, struct iov_iter *i); - void iov_iter_advance(struct iov_iter *i, size_t bytes); - void iov_iter_revert(struct iov_iter *i, size_t bytes); - size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); diff --git a/init/init_task.c b/init/init_task.c index ff6c4b9bf..f703116e0 100644 --- a/init/init_task.c @@ -94059,6 +97242,22 @@ index 4198f0273..b2abd9a5d 100644 + +config SIXLOCKS + bool +diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c +index 9a4db5cce..fc42930af 100644 +--- a/kernel/dma/mapping.c ++++ b/kernel/dma/mapping.c +@@ -570,9 +570,9 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size, + size = PAGE_ALIGN(size); + if (dma_alloc_direct(dev, ops)) + return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); +- if (!ops->alloc_pages) ++ if (!ops->alloc_pages_op) + return NULL; +- return ops->alloc_pages(dev, size, dma_handle, dir, gfp); ++ return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp); + } + + struct page *dma_alloc_pages(struct device *dev, size_t size, diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 0db4093d1..a095dbbf0 100644 --- a/kernel/locking/Makefile @@ -94143,6 +97342,23 @@ index 4dfd2f3e0..0463302e2 100644 +} +EXPORT_SYMBOL_GPL(lockdep_set_no_check_recursion); +#endif +diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c +index d5610ad52..b752ec5cc 100644 +--- a/kernel/locking/osq_lock.c ++++ b/kernel/locking/osq_lock.c +@@ -203,6 +203,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) + + return false; + } ++EXPORT_SYMBOL_GPL(osq_lock); + + void osq_unlock(struct optimistic_spin_queue *lock) + { +@@ -230,3 +231,4 @@ void osq_unlock(struct optimistic_spin_queue *lock) + if (next) + WRITE_ONCE(next->locked, 1); + } ++EXPORT_SYMBOL_GPL(osq_unlock); diff --git a/kernel/locking/six.c b/kernel/locking/six.c new file mode 100644 index 000000000..0b9c4bb7c @@ -95042,6 +98258,102 @@ index 000000000..0b9c4bb7c +#endif +} +EXPORT_SYMBOL_GPL(__six_lock_init); +diff --git a/kernel/module/main.c b/kernel/module/main.c +index 4e2cf784c..7f7b5bedf 100644 +--- a/kernel/module/main.c ++++ b/kernel/module/main.c +@@ -56,6 +56,7 @@ + #include + #include + #include ++#include + #include + #include + #include "internal.h" +@@ -1217,15 +1218,19 @@ static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) + return module_alloc(size); + } + +-static void module_memory_free(void *ptr, enum mod_mem_type type) ++static void module_memory_free(void *ptr, enum mod_mem_type type, ++ bool unload_codetags) + { ++ if (!unload_codetags && mod_mem_type_is_core_data(type)) ++ return; ++ + if (mod_mem_use_vmalloc(type)) + vfree(ptr); + else + module_memfree(ptr); + } + +-static void free_mod_mem(struct module *mod) ++static void free_mod_mem(struct module *mod, bool unload_codetags) + { + for_each_mod_mem_type(type) { + struct module_memory *mod_mem = &mod->mem[type]; +@@ -1236,19 +1241,23 @@ static void free_mod_mem(struct module *mod) + /* Free lock-classes; relies on the preceding sync_rcu(). 
*/ + lockdep_free_key_range(mod_mem->base, mod_mem->size); + if (mod_mem->size) +- module_memory_free(mod_mem->base, type); ++ module_memory_free(mod_mem->base, type, ++ unload_codetags); + } + + /* MOD_DATA hosts mod, so free it at last */ + lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); +- module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); ++ module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA, unload_codetags); + } + + /* Free a module, remove from lists, etc. */ + static void free_module(struct module *mod) + { ++ bool unload_codetags; ++ + trace_module_free(mod); + ++ unload_codetags = codetag_unload_module(mod); + mod_sysfs_teardown(mod); + + /* +@@ -1290,7 +1299,7 @@ static void free_module(struct module *mod) + kfree(mod->args); + percpu_modfree(mod); + +- free_mod_mem(mod); ++ free_mod_mem(mod, unload_codetags); + } + + void *__symbol_get(const char *symbol) +@@ -2292,7 +2301,7 @@ static int move_module(struct module *mod, struct load_info *info) + return 0; + out_enomem: + for (t--; t >= 0; t--) +- module_memory_free(mod->mem[t].base, t); ++ module_memory_free(mod->mem[t].base, t, true); + return ret; + } + +@@ -2422,7 +2431,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) + percpu_modfree(mod); + module_arch_freeing_init(mod); + +- free_mod_mem(mod); ++ free_mod_mem(mod, true); + } + + int __weak module_finalize(const Elf_Ehdr *hdr, +@@ -2974,6 +2983,8 @@ static int load_module(struct load_info *info, const char __user *uargs, + /* Get rid of temporary copy. */ + free_copy(info, flags); + ++ codetag_load_module(mod); ++ + /* Done! */ + trace_module_load(mod); + diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9ed5ce989..4f6582487 100644 --- a/kernel/stacktrace.c @@ -95077,10 +98389,47 @@ index 5c2da561c..f78bc8b42 100644 bool depends on !NO_IOMEM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index ce51d4dc6..a4e6ce3d2 100644 +index ce51d4dc6..a19ec6fd7 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug -@@ -1637,6 +1637,15 @@ config DEBUG_NOTIFIERS +@@ -957,6 +957,36 @@ config DEBUG_STACKOVERFLOW + + If in doubt, say "N". + ++config CODE_TAGGING ++ bool ++ select KALLSYMS ++ ++config MEM_ALLOC_PROFILING ++ bool "Enable memory allocation profiling" ++ default n ++ depends on PROC_FS ++ select CODE_TAGGING ++ select PAGE_EXTENSION ++ select SLAB_OBJ_EXT ++ help ++ Track allocation source code and record total allocation size ++ initiated at that code location. The mechanism can be used to track ++ memory leaks with a low performance and memory impact. ++ ++config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT ++ bool "Enable memory allocation profiling by default" ++ default y ++ depends on MEM_ALLOC_PROFILING ++ ++config MEM_ALLOC_PROFILING_DEBUG ++ bool "Memory allocation profiler debugging" ++ default n ++ depends on MEM_ALLOC_PROFILING ++ select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT ++ help ++ Adds warnings with helpful error messages for memory allocation ++ profiling. ++ + source "lib/Kconfig.kasan" + source "lib/Kconfig.kfence" + source "lib/Kconfig.kmsan" +@@ -1637,6 +1667,15 @@ config DEBUG_NOTIFIERS This is a relatively cheap check but if you care about maximum performance, say N. @@ -95089,14 +98438,27 @@ index ce51d4dc6..a4e6ce3d2 100644 + depends on CLOSURES + select DEBUG_FS + help -+ Keeps all active closures in a linked list and provides a debugfs -+ interface to list them, which makes it possible to see asynchronous -+ operations that get stuck. 
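With the MEM_ALLOC_PROFILING options above, turning the profiler on is purely a configuration decision. A sketch of the relevant .config fragment (editorial illustration; CODE_TAGGING and SLAB_OBJ_EXT are selected automatically and are shown only for completeness):

CONFIG_CODE_TAGGING=y
CONFIG_SLAB_OBJ_EXT=y
CONFIG_MEM_ALLOC_PROFILING=y
CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y
# CONFIG_MEM_ALLOC_PROFILING_DEBUG is not set

At runtime the per-callsite totals appear in /proc/allocinfo (created by lib/alloc_tag.c below), and profiling can be toggled through the vm.mem_profiling sysctl; with MEM_ALLOC_PROFILING_DEBUG set, that sysctl is registered read-only (mode 0444).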
++ Keeps all active closures in a linked list and provides a debugfs ++ interface to list them, which makes it possible to see asynchronous ++ operations that get stuck. + config BUG_ON_DATA_CORRUPTION bool "Trigger a BUG when data corruption is detected" select DEBUG_LIST -@@ -2123,6 +2132,15 @@ config CPUMASK_KUNIT_TEST +@@ -1997,6 +2036,12 @@ config FAULT_INJECTION_STACKTRACE_FILTER + help + Provide stacktrace filter for fault-injection capabilities + ++config CODETAG_FAULT_INJECTION ++ bool "Code tagging based fault injection" ++ select CODE_TAGGING ++ help ++ Dynamic fault injection based on code tagging ++ + config ARCH_HAS_KCOV + bool + help +@@ -2123,6 +2168,15 @@ config CPUMASK_KUNIT_TEST If unsure, say N. @@ -95113,7 +98475,7 @@ index ce51d4dc6..a4e6ce3d2 100644 tristate "Linked list sorting test" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile -index 876fcdeae..38fe45011 100644 +index 876fcdeae..fb1d20939 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -30,7 +30,7 @@ endif @@ -95125,7 +98487,19 @@ index 876fcdeae..38fe45011 100644 is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o win_minmax.o memcat_p.o \ -@@ -248,6 +248,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o +@@ -226,6 +226,11 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ + of-reconfig-notifier-error-inject.o + obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o + ++obj-$(CONFIG_CODE_TAGGING) += codetag.o ++obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o ++ ++obj-$(CONFIG_CODETAG_FAULT_INJECTION) += dynamic_fault.o ++ + lib-$(CONFIG_GENERIC_BUG) += bug.o + + obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o +@@ -248,6 +253,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o @@ -95134,6 +98508,237 @@ index 876fcdeae..38fe45011 100644 obj-$(CONFIG_DQL) += dynamic_queue_limits.o obj-$(CONFIG_GLOB) += glob.o +diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c +new file mode 100644 +index 000000000..1ca90cff5 +--- /dev/null ++++ b/lib/alloc_tag.c +@@ -0,0 +1,225 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct codetag_type *alloc_tag_cttype; ++ ++DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, ++ mem_alloc_profiling_key); ++ ++static void *allocinfo_start(struct seq_file *m, loff_t *pos) ++{ ++ struct codetag_iterator *iter; ++ struct codetag *ct; ++ loff_t node = *pos; ++ ++ iter = kzalloc(sizeof(*iter), GFP_KERNEL); ++ m->private = iter; ++ if (!iter) ++ return NULL; ++ ++ codetag_lock_module_list(alloc_tag_cttype, true); ++ *iter = codetag_get_ct_iter(alloc_tag_cttype); ++ while ((ct = codetag_next_ct(iter)) != NULL && node) ++ node--; ++ ++ return ct ? 
iter : NULL;
++}
++
++static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos)
++{
++	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
++	struct codetag *ct = codetag_next_ct(iter);
++
++	(*pos)++;
++	if (!ct)
++		return NULL;
++
++	return iter;
++}
++
++static void allocinfo_stop(struct seq_file *m, void *arg)
++{
++	struct codetag_iterator *iter = (struct codetag_iterator *)m->private;
++
++	if (iter) {
++		codetag_lock_module_list(alloc_tag_cttype, false);
++		kfree(iter);
++	}
++}
++
++static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct)
++{
++	struct alloc_tag *tag = ct_to_alloc_tag(ct);
++	s64 bytes = alloc_tag_read(tag);
++	char val[10], *p = val;
++
++	if (bytes < 0) {
++		*p++ = '-';
++		bytes = -bytes;
++	}
++
++	string_get_size(bytes, 1,
++			STRING_SIZE_BASE2|STRING_SIZE_NOSPACE,
++			p, val + ARRAY_SIZE(val) - p);
++
++	seq_buf_printf(out, "%8s ", val);
++	codetag_to_text(out, ct);
++	seq_buf_putc(out, ' ');
++	seq_buf_putc(out, '\n');
++}
++
++static int allocinfo_show(struct seq_file *m, void *arg)
++{
++	struct codetag_iterator *iter = (struct codetag_iterator *)arg;
++	char *bufp;
++	size_t n = seq_get_buf(m, &bufp);
++	struct seq_buf buf;
++
++	seq_buf_init(&buf, bufp, n);
++	alloc_tag_to_text(&buf, iter->ct);
++	seq_commit(m, seq_buf_used(&buf));
++	return 0;
++}
++
++static const struct seq_operations allocinfo_seq_op = {
++	.start = allocinfo_start,
++	.next = allocinfo_next,
++	.stop = allocinfo_stop,
++	.show = allocinfo_show,
++};
++
++void alloc_tags_show_mem_report(struct seq_buf *s)
++{
++	struct codetag_iterator iter;
++	struct codetag *ct;
++	struct {
++		struct codetag *tag;
++		size_t bytes;
++	} tags[10], n;
++	unsigned int i, nr = 0;
++
++	codetag_lock_module_list(alloc_tag_cttype, true);
++	iter = codetag_get_ct_iter(alloc_tag_cttype);
++	while ((ct = codetag_next_ct(&iter))) {
++		n.tag = ct;
++		n.bytes = alloc_tag_read(ct_to_alloc_tag(ct));
++
++		for (i = 0; i < nr; i++)
++			if (n.bytes > tags[i].bytes)
++				break;
++
++		if (i < ARRAY_SIZE(tags)) {
++			nr -= nr == ARRAY_SIZE(tags);
++			memmove(&tags[i + 1],
++				&tags[i],
++				sizeof(tags[0]) * (nr - i));
++			nr++;
++			tags[i] = n;
++		}
++	}
++
++	for (i = 0; i < nr; i++)
++		alloc_tag_to_text(s, tags[i].tag);
++
++	codetag_lock_module_list(alloc_tag_cttype, false);
++}
++
++static void __init procfs_init(void)
++{
++	proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op);
++}
++
++static void alloc_tag_module_load(struct codetag_type *cttype, struct codetag_module *cmod)
++{
++	struct codetag_iterator iter = codetag_get_ct_iter(cttype);
++	struct codetag *ct;
++
++	for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
++		if (iter.cmod != cmod)
++			continue;
++
++		ct_to_alloc_tag(ct)->bytes_allocated = alloc_percpu(u64);
++	}
++}
++
++static bool alloc_tag_module_unload(struct codetag_type *cttype, struct codetag_module *cmod)
++{
++	struct codetag_iterator iter = codetag_get_ct_iter(cttype);
++	bool module_unused = true;
++	struct alloc_tag *tag;
++	struct codetag *ct;
++	size_t bytes;
++
++	for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
++		if (iter.cmod != cmod)
++			continue;
++
++		tag = ct_to_alloc_tag(ct);
++		bytes = alloc_tag_read(tag);
++
++		if (!WARN(bytes, "%s:%u module %s func:%s has %zu allocated at module unload",
++			  ct->filename, ct->lineno, ct->modname, ct->function, bytes))
++			free_percpu(tag->bytes_allocated);
++		else
++			module_unused = false;
++	}
++
++	return module_unused;
++}
++
++static __init bool 
need_page_alloc_tagging(void) ++{ ++ return true; ++} ++ ++static __init void init_page_alloc_tagging(void) ++{ ++} ++ ++struct page_ext_operations page_alloc_tagging_ops = { ++ .size = sizeof(union codetag_ref), ++ .need = need_page_alloc_tagging, ++ .init = init_page_alloc_tagging, ++}; ++EXPORT_SYMBOL(page_alloc_tagging_ops); ++ ++static struct ctl_table memory_allocation_profiling_sysctls[] = { ++ { ++ .procname = "mem_profiling", ++ .data = &mem_alloc_profiling_key, ++#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG ++ .mode = 0444, ++#else ++ .mode = 0644, ++#endif ++ .proc_handler = proc_do_static_key, ++ }, ++ { } ++}; ++ ++static int __init alloc_tag_init(void) ++{ ++ const struct codetag_type_desc desc = { ++ .section = "alloc_tags", ++ .tag_size = sizeof(struct alloc_tag), ++ .module_load = alloc_tag_module_load, ++ .module_unload = alloc_tag_module_unload, ++ }; ++ ++ alloc_tag_cttype = codetag_register_type(&desc); ++ if (IS_ERR_OR_NULL(alloc_tag_cttype)) ++ return PTR_ERR(alloc_tag_cttype); ++ ++ register_sysctl_init("vm", memory_allocation_profiling_sysctls); ++ procfs_init(); ++ ++ return 0; ++} ++module_init(alloc_tag_init); diff --git a/drivers/md/bcache/closure.c b/lib/closure.c similarity index 88% rename from drivers/md/bcache/closure.c @@ -95252,6 +98857,782 @@ index d8d9394a6..0855e698c 100644 -MODULE_AUTHOR("Kent Overstreet "); -MODULE_LICENSE("GPL"); +#endif +diff --git a/lib/codetag.c b/lib/codetag.c +new file mode 100644 +index 000000000..84f90f3b9 +--- /dev/null ++++ b/lib/codetag.c +@@ -0,0 +1,393 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++struct codetag_type { ++ struct list_head link; ++ unsigned int count; ++ struct idr mod_idr; ++ struct rw_semaphore mod_lock; /* protects mod_idr */ ++ struct codetag_type_desc desc; ++}; ++ ++static DEFINE_MUTEX(codetag_lock); ++static LIST_HEAD(codetag_types); ++ ++void codetag_lock_module_list(struct codetag_type *cttype, bool lock) ++{ ++ if (lock) ++ down_read(&cttype->mod_lock); ++ else ++ up_read(&cttype->mod_lock); ++} ++ ++struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype) ++{ ++ struct codetag_iterator iter = { ++ .cttype = cttype, ++ .cmod = NULL, ++ .mod_id = 0, ++ .ct = NULL, ++ }; ++ ++ return iter; ++} ++ ++static inline struct codetag *get_first_module_ct(struct codetag_module *cmod) ++{ ++ return cmod->range.start < cmod->range.stop ? cmod->range.start : NULL; ++} ++ ++static inline ++struct codetag *get_next_module_ct(struct codetag_iterator *iter) ++{ ++ struct codetag *res = (struct codetag *) ++ ((char *)iter->ct + iter->cttype->desc.tag_size); ++ ++ return res < iter->cmod->range.stop ? 
res : NULL; ++} ++ ++struct codetag *codetag_next_ct(struct codetag_iterator *iter) ++{ ++ struct codetag_type *cttype = iter->cttype; ++ struct codetag_module *cmod; ++ struct codetag *ct; ++ ++ lockdep_assert_held(&cttype->mod_lock); ++ ++ if (unlikely(idr_is_empty(&cttype->mod_idr))) ++ return NULL; ++ ++ ct = NULL; ++ while (true) { ++ cmod = idr_find(&cttype->mod_idr, iter->mod_id); ++ ++ /* If module was removed move to the next one */ ++ if (!cmod) ++ cmod = idr_get_next_ul(&cttype->mod_idr, ++ &iter->mod_id); ++ ++ /* Exit if no more modules */ ++ if (!cmod) ++ break; ++ ++ if (cmod != iter->cmod) { ++ iter->cmod = cmod; ++ ct = get_first_module_ct(cmod); ++ } else ++ ct = get_next_module_ct(iter); ++ ++ if (ct) ++ break; ++ ++ iter->mod_id++; ++ } ++ ++ iter->ct = ct; ++ return ct; ++} ++ ++void codetag_to_text(struct seq_buf *out, struct codetag *ct) ++{ ++ seq_buf_printf(out, "%s:%u module:%s func:%s", ++ ct->filename, ct->lineno, ++ ct->modname, ct->function); ++} ++ ++static inline size_t range_size(const struct codetag_type *cttype, ++ const struct codetag_range *range) ++{ ++ return ((char *)range->stop - (char *)range->start) / ++ cttype->desc.tag_size; ++} ++ ++static void *get_symbol(struct module *mod, const char *prefix, const char *name) ++{ ++ char buf[64]; ++ void *ret; ++ int res; ++ ++ res = snprintf(buf, sizeof(buf), "%s%s", prefix, name); ++ if (WARN_ON(res < 1 || res > sizeof(buf))) ++ return NULL; ++ ++ preempt_disable(); ++ ret = mod ? ++ (void *)find_kallsyms_symbol_value(mod, buf) : ++ (void *)kallsyms_lookup_name(buf); ++ preempt_enable(); ++ ++ return ret; ++} ++ ++static struct codetag_range get_section_range(struct module *mod, ++ const char *section) ++{ ++ return (struct codetag_range) { ++ get_symbol(mod, "__start_", section), ++ get_symbol(mod, "__stop_", section), ++ }; ++} ++ ++static int codetag_module_init(struct codetag_type *cttype, struct module *mod) ++{ ++ struct codetag_range range; ++ struct codetag_module *cmod; ++ int err; ++ ++ range = get_section_range(mod, cttype->desc.section); ++ if (!range.start || !range.stop) { ++ pr_warn("Failed to load code tags of type %s from the module %s\n", ++ cttype->desc.section, ++ mod ? 
mod->name : "(built-in)"); ++ return -EINVAL; ++ } ++ ++ /* Ignore empty ranges */ ++ if (range.start == range.stop) ++ return 0; ++ ++ BUG_ON(range.start > range.stop); ++ ++ cmod = kmalloc(sizeof(*cmod), GFP_KERNEL); ++ if (unlikely(!cmod)) ++ return -ENOMEM; ++ ++ cmod->mod = mod; ++ cmod->range = range; ++ ++ down_write(&cttype->mod_lock); ++ err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL); ++ if (err >= 0) { ++ cttype->count += range_size(cttype, &range); ++ if (cttype->desc.module_load) ++ cttype->desc.module_load(cttype, cmod); ++ } ++ up_write(&cttype->mod_lock); ++ ++ if (err < 0) { ++ kfree(cmod); ++ return err; ++ } ++ ++ return 0; ++} ++ ++struct codetag_type * ++codetag_register_type(const struct codetag_type_desc *desc) ++{ ++ struct codetag_type *cttype; ++ int err; ++ ++ BUG_ON(desc->tag_size <= 0); ++ ++ cttype = kzalloc(sizeof(*cttype), GFP_KERNEL); ++ if (unlikely(!cttype)) ++ return ERR_PTR(-ENOMEM); ++ ++ cttype->desc = *desc; ++ idr_init(&cttype->mod_idr); ++ init_rwsem(&cttype->mod_lock); ++ ++ err = codetag_module_init(cttype, NULL); ++ if (unlikely(err)) { ++ kfree(cttype); ++ return ERR_PTR(err); ++ } ++ ++ mutex_lock(&codetag_lock); ++ list_add_tail(&cttype->link, &codetag_types); ++ mutex_unlock(&codetag_lock); ++ ++ return cttype; ++} ++ ++void codetag_load_module(struct module *mod) ++{ ++ struct codetag_type *cttype; ++ ++ if (!mod) ++ return; ++ ++ mutex_lock(&codetag_lock); ++ list_for_each_entry(cttype, &codetag_types, link) ++ codetag_module_init(cttype, mod); ++ mutex_unlock(&codetag_lock); ++} ++ ++bool codetag_unload_module(struct module *mod) ++{ ++ struct codetag_type *cttype; ++ bool unload_ok = true; ++ ++ if (!mod) ++ return true; ++ ++ mutex_lock(&codetag_lock); ++ list_for_each_entry(cttype, &codetag_types, link) { ++ struct codetag_module *found = NULL; ++ struct codetag_module *cmod; ++ unsigned long mod_id, tmp; ++ ++ down_write(&cttype->mod_lock); ++ idr_for_each_entry_ul(&cttype->mod_idr, cmod, tmp, mod_id) { ++ if (cmod->mod && cmod->mod == mod) { ++ found = cmod; ++ break; ++ } ++ } ++ if (found) { ++ if (cttype->desc.module_unload) ++ if (!cttype->desc.module_unload(cttype, cmod)) ++ unload_ok = false; ++ ++ cttype->count -= range_size(cttype, &cmod->range); ++ idr_remove(&cttype->mod_idr, mod_id); ++ kfree(cmod); ++ } ++ up_write(&cttype->mod_lock); ++ } ++ mutex_unlock(&codetag_lock); ++ ++ return unload_ok; ++} ++ ++/* Codetag query parsing */ ++ ++#define CODETAG_QUERY_TOKENS() \ ++ x(func) \ ++ x(file) \ ++ x(line) \ ++ x(module) \ ++ x(class) \ ++ x(index) ++ ++enum tokens { ++#define x(name) TOK_##name, ++ CODETAG_QUERY_TOKENS() ++#undef x ++}; ++ ++static const char * const token_strs[] = { ++#define x(name) #name, ++ CODETAG_QUERY_TOKENS() ++#undef x ++ NULL ++}; ++ ++static int parse_range(char *str, unsigned int *first, unsigned int *last) ++{ ++ char *first_str = str; ++ char *last_str = strchr(first_str, '-'); ++ ++ if (last_str) ++ *last_str++ = '\0'; ++ ++ if (kstrtouint(first_str, 10, first)) ++ return -EINVAL; ++ ++ if (!last_str) ++ *last = *first; ++ else if (kstrtouint(last_str, 10, last)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++char *codetag_query_parse(struct codetag_query *q, char *buf) ++{ ++ while (1) { ++ char *p = buf; ++ char *str1 = strsep_no_empty(&p, " \t\r\n"); ++ char *str2 = strsep_no_empty(&p, " \t\r\n"); ++ int ret, token; ++ ++ if (!str1 || !str2) ++ break; ++ ++ token = match_string(token_strs, ARRAY_SIZE(token_strs), str1); ++ if (token < 0) ++ break; ++ ++ switch (token) { ++ case 
TOK_func: ++ q->function = str2; ++ break; ++ case TOK_file: ++ q->filename = str2; ++ break; ++ case TOK_line: ++ ret = parse_range(str2, &q->first_line, &q->last_line); ++ if (ret) ++ return ERR_PTR(ret); ++ q->match_line = true; ++ break; ++ case TOK_module: ++ q->module = str2; ++ break; ++ case TOK_class: ++ q->class = str2; ++ break; ++ case TOK_index: ++ ret = parse_range(str2, &q->first_index, &q->last_index); ++ if (ret) ++ return ERR_PTR(ret); ++ q->match_index = true; ++ break; ++ } ++ ++ buf = p; ++ } ++ ++ return buf; ++} ++ ++bool codetag_matches_query(struct codetag_query *q, ++ const struct codetag *ct, ++ const struct codetag_module *mod, ++ const char *class) ++{ ++ size_t classlen = q->class ? strlen(q->class) : 0; ++ ++ if (q->module && ++ (!mod->mod || ++ strcmp(q->module, ct->modname))) ++ return false; ++ ++ if (q->filename && ++ strcmp(q->filename, ct->filename) && ++ strcmp(q->filename, kbasename(ct->filename))) ++ return false; ++ ++ if (q->function && ++ strcmp(q->function, ct->function)) ++ return false; ++ ++ /* match against the line number range */ ++ if (q->match_line && ++ (ct->lineno < q->first_line || ++ ct->lineno > q->last_line)) ++ return false; ++ ++ /* match against the class */ ++ if (classlen && ++ (strncmp(q->class, class, classlen) || ++ (class[classlen] && class[classlen] != ':'))) ++ return false; ++ ++ /* match against the fault index */ ++ if (q->match_index && ++ (q->cur_index < q->first_index || ++ q->cur_index > q->last_index)) { ++ q->cur_index++; ++ return false; ++ } ++ ++ q->cur_index++; ++ return true; ++} +diff --git a/lib/dynamic_fault.c b/lib/dynamic_fault.c +new file mode 100644 +index 000000000..c92374359 +--- /dev/null ++++ b/lib/dynamic_fault.c +@@ -0,0 +1,371 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct codetag_type *cttype; ++ ++bool __dynamic_fault_enabled(struct dfault *df) ++{ ++ union dfault_state old, new; ++ unsigned int v = df->state.v; ++ bool ret; ++ ++ do { ++ old.v = new.v = v; ++ ++ if (new.enabled == DFAULT_disabled) ++ return false; ++ ++ ret = df->frequency ++ ? ++new.count >= df->frequency ++ : true; ++ if (ret) ++ new.count = 0; ++ if (ret && new.enabled == DFAULT_oneshot) ++ new.enabled = DFAULT_disabled; ++ } while ((v = cmpxchg(&df->state.v, old.v, new.v)) != old.v); ++ ++ if (ret) ++ pr_debug("returned true for %s:%u", df->tag.filename, df->tag.lineno); ++ ++ return ret; ++} ++EXPORT_SYMBOL(__dynamic_fault_enabled); ++ ++static const char * const dfault_state_strs[] = { ++#define x(n) #n, ++ DFAULT_STATES() ++#undef x ++ NULL ++}; ++ ++static void dynamic_fault_to_text(struct seq_buf *out, struct dfault *df) ++{ ++ codetag_to_text(out, &df->tag); ++ seq_buf_printf(out, "class:%s %s \"", df->class, ++ dfault_state_strs[df->state.enabled]); ++} ++ ++struct dfault_query { ++ struct codetag_query q; ++ ++ bool set_enabled:1; ++ unsigned int enabled:2; ++ ++ bool set_frequency:1; ++ unsigned int frequency; ++}; ++ ++/* ++ * Search the tables for _dfault's which match the given ++ * `query' and apply the `flags' and `mask' to them. Tells ++ * the user which dfault's were changed, or whether none ++ * were matched. 
++ */
++static int dfault_change(struct dfault_query *query)
++{
++	struct codetag_iterator ct_iter = codetag_get_ct_iter(cttype);
++	struct codetag *ct;
++	unsigned int nfound = 0;
++
++	codetag_lock_module_list(cttype, true);
++
++	while ((ct = codetag_next_ct(&ct_iter))) {
++		struct dfault *df = container_of(ct, struct dfault, tag);
++
++		if (!codetag_matches_query(&query->q, ct, ct_iter.cmod, df->class))
++			continue;
++
++		if (query->set_enabled &&
++		    query->enabled != df->state.enabled) {
++			if (query->enabled != DFAULT_disabled)
++				static_key_slow_inc(&df->enabled.key);
++			else if (df->state.enabled != DFAULT_disabled)
++				static_key_slow_dec(&df->enabled.key);
++
++			df->state.enabled = query->enabled;
++		}
++
++		if (query->set_frequency)
++			df->frequency = query->frequency;
++
++		pr_debug("changed %s:%d [%s]%s #%d %s",
++			 df->tag.filename, df->tag.lineno, df->tag.modname,
++			 df->tag.function, query->q.cur_index,
++			 dfault_state_strs[df->state.enabled]);
++
++		nfound++;
++	}
++
++	pr_debug("dfault: %u matches", nfound);
++
++	codetag_lock_module_list(cttype, false);
++
++	return nfound ? 0 : -ENOENT;
++}
++
++#define DFAULT_TOKENS()		\
++	x(disable,	0)	\
++	x(enable,	0)	\
++	x(oneshot,	0)	\
++	x(frequency,	1)
++
++enum dfault_token {
++#define x(name, nr_args)	TOK_##name,
++	DFAULT_TOKENS()
++#undef x
++};
++
++static const char * const dfault_token_strs[] = {
++#define x(name, nr_args)	#name,
++	DFAULT_TOKENS()
++#undef x
++	NULL
++};
++
++static unsigned int dfault_token_nr_args[] = {
++#define x(name, nr_args)	nr_args,
++	DFAULT_TOKENS()
++#undef x
++};
++
++static enum dfault_token str_to_token(const char *word, unsigned int nr_words)
++{
++	int tok = match_string(dfault_token_strs, ARRAY_SIZE(dfault_token_strs), word);
++
++	if (tok < 0) {
++		pr_debug("unknown keyword \"%s\"", word);
++		return tok;
++	}
++
++	if (nr_words < dfault_token_nr_args[tok]) {
++		pr_debug("insufficient arguments to \"%s\"", word);
++		return -EINVAL;
++	}
++
++	return tok;
++}
++
++static int dfault_parse_command(struct dfault_query *query,
++				enum dfault_token tok,
++				char *words[], size_t nr_words)
++{
++	unsigned int i = 0;
++	int ret;
++
++	switch (tok) {
++	case TOK_disable:
++		query->set_enabled = true;
++		query->enabled = DFAULT_disabled;
++		break;
++	case TOK_enable:
++		query->set_enabled = true;
++		query->enabled = DFAULT_enabled;
++		break;
++	case TOK_oneshot:
++		query->set_enabled = true;
++		query->enabled = DFAULT_oneshot;
++		break;
++	case TOK_frequency:
++		query->set_frequency = 1;
++		ret = kstrtouint(words[i++], 10, &query->frequency);
++		if (ret)
++			return ret;
++
++		if (!query->set_enabled) {
++			query->set_enabled = 1;
++			query->enabled = DFAULT_enabled;
++		}
++		break;
++	}
++
++	return i;
++}
++
++static int dynamic_fault_store(char *buf)
++{
++	struct dfault_query query = { NULL };
++#define MAXWORDS 9
++	char *tok, *words[MAXWORDS];
++	int ret, nr_words = 0, i = 0;	/* nr_words counts parsed tokens and must start at zero */
++
++	buf = codetag_query_parse(&query.q, buf);
++	if (IS_ERR(buf))
++		return PTR_ERR(buf);
++
++	while ((tok = strsep_no_empty(&buf, " \t\r\n"))) {
++		if (nr_words == ARRAY_SIZE(words))
++			return -EINVAL;	/* ran out of words[] before bytes */
++		words[nr_words++] = tok;
++	}
++
++	while (i < nr_words) {
++		const char *tok_str = words[i++];
++		enum dfault_token tok = str_to_token(tok_str, nr_words - i);
++
++		if (tok < 0)
++			return tok;
++
++		ret = dfault_parse_command(&query, tok, words + i, nr_words - i);
++		if (ret < 0)
++			return ret;
++
++		i += ret;
++		BUG_ON(i > nr_words);
++	}
++
++	pr_debug("q->function=\"%s\" q->filename=\"%s\" "
++		 "q->module=\"%s\" q->line=%u-%u\n q->index=%u-%u",
++		 query.q.function, query.q.filename, query.q.module,
++		 query.q.first_line, query.q.last_line,
++		 query.q.first_index, query.q.last_index);
++
++	ret = dfault_change(&query);
++	if (ret < 0)
++		return ret;
++
++	return 0;
++}
++
++struct dfault_iter {
++	struct codetag_iterator ct_iter;
++
++	struct seq_buf buf;
++	char rawbuf[4096];
++};
++
++static int dfault_open(struct inode *inode, struct file *file)
++{
++	struct dfault_iter *iter;
++
++	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
++	if (!iter)
++		return -ENOMEM;
++
++	codetag_lock_module_list(cttype, true);
++	iter->ct_iter = codetag_get_ct_iter(cttype);
++	codetag_lock_module_list(cttype, false);
++
++	file->private_data = iter;
++	seq_buf_init(&iter->buf, iter->rawbuf, sizeof(iter->rawbuf));
++	return 0;
++}
++
++static int dfault_release(struct inode *inode, struct file *file)
++{
++	struct dfault_iter *iter = file->private_data;
++
++	kfree(iter);
++	return 0;
++}
++
++struct user_buf {
++	char __user *buf;	/* destination user buffer */
++	size_t size;		/* size of requested read */
++	ssize_t ret;		/* bytes read so far */
++};
++
++static int flush_ubuf(struct user_buf *dst, struct seq_buf *src)
++{
++	if (src->len) {
++		size_t bytes = min_t(size_t, src->len, dst->size);
++
++		/* copy_to_user() returns the number of bytes it could not
++		 * copy, not an errno, so turn any shortfall into -EFAULT */
++		if (copy_to_user(dst->buf, src->buffer, bytes))
++			return -EFAULT;
++
++		dst->ret += bytes;
++		dst->buf += bytes;
++		dst->size -= bytes;
++		src->len -= bytes;
++		memmove(src->buffer, src->buffer + bytes, src->len);
++	}
++
++	return 0;
++}
++
++static ssize_t dfault_read(struct file *file, char __user *ubuf,
++			   size_t size, loff_t *ppos)
++{
++	struct dfault_iter *iter = file->private_data;
++	struct user_buf buf = { .buf = ubuf, .size = size };
++	struct codetag *ct;
++	struct dfault *df;
++	int err;
++
++	codetag_lock_module_list(iter->ct_iter.cttype, true);
++	while (1) {
++		err = flush_ubuf(&buf, &iter->buf);
++		if (err || !buf.size)
++			break;
++
++		ct = codetag_next_ct(&iter->ct_iter);
++		if (!ct)
++			break;
++
++		df = container_of(ct, struct dfault, tag);
++		dynamic_fault_to_text(&iter->buf, df);
++		seq_buf_putc(&iter->buf, '\n');
++	}
++	codetag_lock_module_list(iter->ct_iter.cttype, false);
++
++	return err ?: buf.ret;
++}
++
++/*
++ * File_ops->write method for /dynamic_fault/control. Gathers the
++ * command text from userspace, parses and executes it.
++ */ ++static ssize_t dfault_write(struct file *file, const char __user *ubuf, ++ size_t len, loff_t *offp) ++{ ++ char tmpbuf[256]; ++ ++ if (len == 0) ++ return 0; ++ /* we don't check *offp -- multiple writes() are allowed */ ++ if (len > sizeof(tmpbuf)-1) ++ return -E2BIG; ++ if (copy_from_user(tmpbuf, ubuf, len)) ++ return -EFAULT; ++ tmpbuf[len] = '\0'; ++ pr_debug("read %zu bytes from userspace", len); ++ ++ dynamic_fault_store(tmpbuf); ++ ++ *offp += len; ++ return len; ++} ++ ++static const struct file_operations dfault_ops = { ++ .owner = THIS_MODULE, ++ .open = dfault_open, ++ .release = dfault_release, ++ .read = dfault_read, ++ .write = dfault_write ++}; ++ ++static int __init dynamic_fault_init(void) ++{ ++ const struct codetag_type_desc desc = { ++ .section = "dynamic_fault_tags", ++ .tag_size = sizeof(struct dfault), ++ }; ++ struct dentry *debugfs_file; ++ ++ cttype = codetag_register_type(&desc); ++ if (IS_ERR_OR_NULL(cttype)) ++ return PTR_ERR(cttype); ++ ++ debugfs_file = debugfs_create_file("dynamic_faults", 0666, NULL, NULL, &dfault_ops); ++ if (IS_ERR(debugfs_file)) ++ return PTR_ERR(debugfs_file); ++ ++ return 0; ++} ++module_init(dynamic_fault_init); diff --git a/lib/errname.c b/lib/errname.c index 67739b174..dd1b99855 100644 --- a/lib/errname.c @@ -95369,80 +99750,62 @@ index f25eb111c..41f1bcdc4 100644 { if (level) { diff --git a/lib/iov_iter.c b/lib/iov_iter.c -index 960223ed9..65913b32e 100644 +index 960223ed9..f9c4bba27 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c -@@ -857,18 +857,10 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i) +@@ -857,24 +857,37 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_zero); -size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, - struct iov_iter *i) -+static inline size_t __copy_page_from_iter_atomic(struct page *page, unsigned offset, -+ size_t bytes, struct iov_iter *i) ++size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, ++ size_t bytes, struct iov_iter *i) { - char *kaddr = kmap_atomic(page), *p = kaddr + offset; +- char *kaddr = kmap_atomic(page), *p = kaddr + offset; - if (!page_copy_sane(page, offset, bytes)) { - kunmap_atomic(kaddr); -- return 0; ++ size_t n, copied = 0; ++ ++ if (!page_copy_sane(page, offset, bytes)) + return 0; - } - if (WARN_ON_ONCE(!i->data_source)) { - kunmap_atomic(kaddr); -- return 0; -- } - iterate_and_advance(i, bytes, base, len, off, - copyin(p + off, base, len), - memcpy_from_iter(i, p + off, base, len) -@@ -876,8 +868,49 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t byt - kunmap_atomic(kaddr); - return bytes; - } -+ -+size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, -+ struct iov_iter *i) -+{ -+ if (!page_copy_sane(page, offset, bytes)) -+ return 0; + if (WARN_ON_ONCE(!i->data_source)) -+ return 0; -+ return __copy_page_from_iter_atomic(page, offset, bytes, i); -+} + return 0; +- } +- iterate_and_advance(i, bytes, base, len, off, +- copyin(p + off, base, len), +- memcpy_from_iter(i, p + off, base, len) +- ) +- kunmap_atomic(kaddr); +- return bytes; ++ ++ do { ++ char *p; ++ ++ n = bytes - copied; ++ if (PageHighMem(page)) { ++ page += offset / PAGE_SIZE; ++ offset %= PAGE_SIZE; ++ n = min_t(size_t, n, PAGE_SIZE - offset); ++ } ++ ++ p = kmap_atomic(page) + offset; ++ iterate_and_advance(i, n, base, len, off, ++ copyin(p + off, base, len), ++ memcpy_from_iter(i, p + off, base, len) ++ ) ++ kunmap_atomic(p); ++ copied += n; ++ 
offset += n; ++ } while (PageHighMem(page) && copied != bytes && n > 0); ++ ++ return copied; + } EXPORT_SYMBOL(copy_page_from_iter_atomic); -+size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, -+ size_t bytes, struct iov_iter *i) -+{ -+ size_t ret = 0; -+ -+ if (WARN_ON(offset + bytes > folio_size(folio))) -+ return 0; -+ if (WARN_ON_ONCE(!i->data_source)) -+ return 0; -+ -+#ifdef CONFIG_HIGHMEM -+ while (bytes) { -+ struct page *page = folio_page(folio, offset >> PAGE_SHIFT); -+ unsigned b = min(bytes, PAGE_SIZE - (offset & PAGE_MASK)); -+ unsigned r = __copy_page_from_iter_atomic(page, offset, b, i); -+ -+ offset += r; -+ bytes -= r; -+ ret += r; -+ -+ if (r != b) -+ break; -+ } -+#else -+ ret = __copy_page_from_iter_atomic(&folio->page, offset, bytes, i); -+#endif -+ -+ return ret; -+} -+EXPORT_SYMBOL(copy_folio_from_iter_atomic); -+ - static void pipe_advance(struct iov_iter *i, size_t size) - { - struct pipe_inode_info *pipe = i->pipe; diff --git a/lib/math/Kconfig b/lib/math/Kconfig index 0634b428d..7530ae9a3 100644 --- a/lib/math/Kconfig @@ -95877,10 +100240,49 @@ index 000000000..f45591a16 +MODULE_AUTHOR("Daniel B. Hill"); +MODULE_LICENSE("GPL"); diff --git a/lib/rhashtable.c b/lib/rhashtable.c -index 6ae2ba8e0..d3fce9c89 100644 +index 6ae2ba8e0..76e5bf9be 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c -@@ -360,9 +360,14 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht, +@@ -130,7 +130,7 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht, + if (ntbl) + return ntbl; + +- ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); ++ ntbl = kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO); + + if (ntbl && leaf) { + for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) +@@ -157,7 +157,7 @@ static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, + + size = sizeof(*tbl) + sizeof(tbl->buckets[0]); + +- tbl = kzalloc(size, gfp); ++ tbl = kmalloc_noprof(size, gfp|__GFP_ZERO); + if (!tbl) + return NULL; + +@@ -180,8 +180,10 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, + size_t size; + int i; + static struct lock_class_key __key; ++ struct alloc_tag * __maybe_unused old = alloc_tag_save(ht->alloc_tag); + +- tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp); ++ tbl = kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets), ++ gfp|__GFP_ZERO, NUMA_NO_NODE); + + size = nbuckets; + +@@ -190,6 +192,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, + nbuckets = 0; + } + ++ alloc_tag_restore(ht->alloc_tag, old); ++ + if (tbl == NULL) + return NULL; + +@@ -360,9 +364,14 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht, ASSERT_RHT_MUTEX(ht); @@ -95897,6 +100299,85 @@ index 6ae2ba8e0..d3fce9c89 100644 err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); if (err) +@@ -975,7 +984,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) + } + + /** +- * rhashtable_init - initialize a new hash table ++ * rhashtable_init_noprof - initialize a new hash table + * @ht: hash table to be initialized + * @params: configuration parameters + * +@@ -1016,7 +1025,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) + * .obj_hashfn = my_hash_fn, + * }; + */ +-int rhashtable_init(struct rhashtable *ht, ++int rhashtable_init_noprof(struct rhashtable *ht, + const struct rhashtable_params *params) + { + struct bucket_table *tbl; +@@ -1031,6 +1040,10 @@ int rhashtable_init(struct rhashtable *ht, + spin_lock_init(&ht->lock); + memcpy(&ht->p, params, 
sizeof(*params)); + ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++ ht->alloc_tag = current->alloc_tag; ++#endif ++ + if (params->min_size) + ht->p.min_size = roundup_pow_of_two(params->min_size); + +@@ -1076,26 +1089,26 @@ int rhashtable_init(struct rhashtable *ht, + + return 0; + } +-EXPORT_SYMBOL_GPL(rhashtable_init); ++EXPORT_SYMBOL_GPL(rhashtable_init_noprof); + + /** +- * rhltable_init - initialize a new hash list table ++ * rhltable_init_noprof - initialize a new hash list table + * @hlt: hash list table to be initialized + * @params: configuration parameters + * + * Initializes a new hash list table. + * +- * See documentation for rhashtable_init. ++ * See documentation for rhashtable_init_noprof. + */ +-int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) ++int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params) + { + int err; + +- err = rhashtable_init(&hlt->ht, params); ++ err = rhashtable_init_noprof(&hlt->ht, params); + hlt->ht.rhlist = true; + return err; + } +-EXPORT_SYMBOL_GPL(rhltable_init); ++EXPORT_SYMBOL_GPL(rhltable_init_noprof); + + static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, + void (*free_fn)(void *ptr, void *arg), +@@ -1222,6 +1235,7 @@ struct rhash_lock_head __rcu **rht_bucket_nested_insert( + unsigned int index = hash & ((1 << tbl->nest) - 1); + unsigned int size = tbl->size >> tbl->nest; + union nested_table *ntbl; ++ struct alloc_tag * __maybe_unused old = alloc_tag_save(ht->alloc_tag); + + ntbl = nested_table_top(tbl); + hash >>= tbl->nest; +@@ -1236,6 +1250,8 @@ struct rhash_lock_head __rcu **rht_bucket_nested_insert( + size <= (1 << shift)); + } + ++ alloc_tag_restore(ht->alloc_tag, old); ++ + if (!ntbl) + return NULL; + diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 45c450f42..2b87e9219 100644 --- a/lib/seq_buf.c @@ -95915,32 +100396,111 @@ index 45c450f42..2b87e9219 100644 + seq_buf_commit(s, wrote); +} +EXPORT_SYMBOL(seq_buf_human_readable_u64); +diff --git a/lib/string.c b/lib/string.c +index 3d55ef890..dd4914baf 100644 +--- a/lib/string.c ++++ b/lib/string.c +@@ -520,6 +520,25 @@ char *strsep(char **s, const char *ct) + EXPORT_SYMBOL(strsep); + #endif + ++/** ++ * strsep_no_empt - Split a string into tokens, but don't return empty tokens ++ * @s: The string to be searched ++ * @ct: The characters to search for ++ * ++ * strsep() updates @s to point after the token, ready for the next call. 
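++ *
++ * Unlike strsep(), leading, trailing and repeated delimiters never produce
++ * an empty token: the next non-empty token is returned, or NULL once no
++ * such token remains.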
++ */ ++char *strsep_no_empty(char **s, const char *ct) ++{ ++ char *ret; ++ ++ do { ++ ret = strsep(s, ct); ++ } while (ret && !*ret); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(strsep_no_empty); ++ + #ifndef __HAVE_ARCH_MEMSET + /** + * memset - Fill a region of memory with the given value diff --git a/lib/string_helpers.c b/lib/string_helpers.c -index 230020a2e..ca36ceba0 100644 +index 230020a2e..d527ce455 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c -@@ -32,8 +32,8 @@ +@@ -19,11 +19,17 @@ + #include + #include + ++enum string_size_units { ++ STRING_UNITS_10, /* use powers of 10^3 (standard SI) */ ++ STRING_UNITS_2, /* use binary powers of 2^10 */ ++}; ++ + /** + * string_get_size - get the size in the specified units + * @size: The size to be converted in blocks + * @blk_size: Size of the block (use 1 for size in bytes) +- * @units: units to use (powers of 1000 or 1024) ++ * @flags: units to use (powers of 1000 or 1024), whether to include space ++ * separator + * @buf: buffer to format to + * @len: length of buffer + * +@@ -31,15 +37,19 @@ + * giving the size in the required units. @buf should have room for * at least 9 bytes and will always be zero terminated. * ++ * Return value: number of characters of output that would have been written ++ * (which may be greater than len, if output was truncated). */ -void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, - char *buf, int len) -+int string_get_size(u64 size, u64 blk_size, const enum string_size_units units, ++int string_get_size(u64 size, u64 blk_size, enum string_size_flags flags, + char *buf, int len) { ++ enum string_size_units units = flags & flags & STRING_SIZE_BASE2 ++ ? STRING_UNITS_2 : STRING_UNITS_10; static const char *const units_10[] = { - "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" -@@ -126,8 +126,8 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, +- "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" ++ "", "k", "M", "G", "T", "P", "E", "Z", "Y" + }; + static const char *const units_2[] = { +- "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" ++ "", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" + }; + static const char *const *const units_str[] = { + [STRING_UNITS_10] = units_10, +@@ -126,8 +136,10 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, else unit = units_str[units][i]; - snprintf(buf, len, "%u%s %s", (u32)size, - tmp, unit); -+ return snprintf(buf, len, "%u%s %s", (u32)size, -+ tmp, unit); ++ return snprintf(buf, len, "%u%s%s%s%s", (u32)size, tmp, ++ (flags & STRING_SIZE_NOSPACE) ? "" : " ", ++ unit, ++ (flags & STRING_SIZE_NOBYTES) ? 
"" : "B"); } EXPORT_SYMBOL(string_get_size); +diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c +index 9a68849a5..0b01ffca9 100644 +--- a/lib/test-string_helpers.c ++++ b/lib/test-string_helpers.c +@@ -507,8 +507,8 @@ static __init void __test_string_get_size(const u64 size, const u64 blk_size, + char buf10[string_get_size_maxbuf]; + char buf2[string_get_size_maxbuf]; + +- string_get_size(size, blk_size, STRING_UNITS_10, buf10, sizeof(buf10)); +- string_get_size(size, blk_size, STRING_UNITS_2, buf2, sizeof(buf2)); ++ string_get_size(size, blk_size, 0, buf10, sizeof(buf10)); ++ string_get_size(size, blk_size, STRING_SIZE_BASE2, buf2, sizeof(buf2)); + + test_string_get_size_check("STRING_UNITS_10", exp_result10, buf10, + size, blk_size); diff --git a/mm/Makefile b/mm/Makefile index e29afc890..e2ecfe0ea 100644 --- a/mm/Makefile @@ -95954,6 +100514,616 @@ index e29afc890..e2ecfe0ea 100644 # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o +diff --git a/mm/compaction.c b/mm/compaction.c +index c8bcdea15..09dd56a94 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -1684,8 +1684,8 @@ static void isolate_freepages(struct compact_control *cc) + * This is a migrate-callback that "allocates" freepages by taking pages + * from the isolated freelists in the block we are migrating to. + */ +-static struct page *compaction_alloc(struct page *migratepage, +- unsigned long data) ++static struct page *compaction_alloc_noprof(struct page *migratepage, ++ unsigned long data) + { + struct compact_control *cc = (struct compact_control *)data; + struct page *freepage; +@@ -1704,6 +1704,12 @@ static struct page *compaction_alloc(struct page *migratepage, + return freepage; + } + ++static struct page *compaction_alloc(struct page *migratepage, ++ unsigned long data) ++{ ++ return alloc_hooks(compaction_alloc_noprof(migratepage, data)); ++} ++ + /* + * This is a migrate-callback that "frees" freepages back to the isolated + * freelist. 
All pages on the freelist are from the same zone, so there is no +diff --git a/mm/filemap.c b/mm/filemap.c +index 83dda76d1..e5c81c0cf 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -958,7 +958,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, + EXPORT_SYMBOL_GPL(filemap_add_folio); + + #ifdef CONFIG_NUMA +-struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) ++struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) + { + int n; + struct folio *folio; +@@ -973,9 +973,9 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) + + return folio; + } +- return folio_alloc(gfp, order); ++ return folio_alloc_noprof(gfp, order); + } +-EXPORT_SYMBOL(filemap_alloc_folio); ++EXPORT_SYMBOL(filemap_alloc_folio_noprof); + #endif + + /* +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 624671aaa..221cce005 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -37,6 +37,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -2557,6 +2558,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, + /* Caller disabled irqs, so they are still disabled here */ + + split_page_owner(head, nr); ++ pgalloc_tag_split(head, nr); + + /* See comment in __split_huge_page_tail() */ + if (PageAnon(head)) { +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index f791076da..3e5a604ee 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -3246,7 +3246,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) + if (i == h->max_huge_pages_node[nid]) + return; + +- string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); ++ string_get_size(huge_page_size(h), 1, STRING_SIZE_BASE2, buf, 32); + pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", + h->max_huge_pages_node[nid], buf, nid, i); + h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); +@@ -3308,7 +3308,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) + if (i < h->max_huge_pages) { + char buf[32]; + +- string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); ++ string_get_size(huge_page_size(h), 1, STRING_SIZE_BASE2, buf, 32); + pr_warn("HugeTLB: allocating %lu of page size %s failed. 
Only allocated %lu hugepages.\n", + h->max_huge_pages, buf, i); + h->max_huge_pages = i; +@@ -3354,7 +3354,7 @@ static void __init report_hugepages(void) + for_each_hstate(h) { + char buf[32]; + +- string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); ++ string_get_size(huge_page_size(h), 1, STRING_SIZE_BASE2, buf, 32); + pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", + buf, h->free_huge_pages); + pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", +@@ -4245,7 +4245,7 @@ static int __init hugetlb_init(void) + char buf[32]; + + string_get_size(huge_page_size(&default_hstate), +- 1, STRING_UNITS_2, buf, 32); ++ 1, STRING_SIZE_BASE2, buf, 32); + pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", + default_hstate.max_huge_pages, buf); + pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", +diff --git a/mm/kfence/core.c b/mm/kfence/core.c +index dad3c0eb7..aea6fa145 100644 +--- a/mm/kfence/core.c ++++ b/mm/kfence/core.c +@@ -590,9 +590,9 @@ static unsigned long kfence_init_pool(void) + continue; + + __folio_set_slab(slab_folio(slab)); +-#ifdef CONFIG_MEMCG +- slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg | +- MEMCG_DATA_OBJCGS; ++#ifdef CONFIG_MEMCG_KMEM ++ slab->obj_exts = (unsigned long)&kfence_metadata[i / 2 - 1].obj_exts | ++ MEMCG_DATA_OBJEXTS; + #endif + } + +@@ -634,8 +634,8 @@ static unsigned long kfence_init_pool(void) + + if (!i || (i % 2)) + continue; +-#ifdef CONFIG_MEMCG +- slab->memcg_data = 0; ++#ifdef CONFIG_MEMCG_KMEM ++ slab->obj_exts = 0; + #endif + __folio_clear_slab(slab_folio(slab)); + } +@@ -1093,8 +1093,8 @@ void __kfence_free(void *addr) + { + struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); + +-#ifdef CONFIG_MEMCG +- KFENCE_WARN_ON(meta->objcg); ++#ifdef CONFIG_MEMCG_KMEM ++ KFENCE_WARN_ON(meta->obj_exts.objcg); + #endif + /* + * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing +diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h +index 392fb273e..b02d2cb96 100644 +--- a/mm/kfence/kfence.h ++++ b/mm/kfence/kfence.h +@@ -97,8 +97,8 @@ struct kfence_metadata { + struct kfence_track free_track; + /* For updating alloc_covered on frees. 
*/ + u32 alloc_stack_hash; +-#ifdef CONFIG_MEMCG +- struct obj_cgroup *objcg; ++#ifdef CONFIG_MEMCG_KMEM ++ struct slabobj_ext obj_exts; + #endif + }; + +diff --git a/mm/madvise.c b/mm/madvise.c +index b5ffbaf61..e08639a7c 100644 +--- a/mm/madvise.c ++++ b/mm/madvise.c +@@ -1311,6 +1311,64 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + madvise_vma_anon_name); + } + #endif /* CONFIG_ANON_VMA_NAME */ ++ ++static noinline unsigned long test_alloc(unsigned long in1, unsigned long in2, size_t size) ++{ ++ switch (in1) ++ { ++ case (1): ++ return __get_free_pages(GFP_KERNEL, 0); ++ case (2): ++ return (unsigned long)kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT); ++ default: ++ printk("test_alloc invoked with args in1=%lu in2=%lu\n", ++ in1, in2); ++ return 0; ++ } ++} ++ ++static noinline void test_free(unsigned long in1, unsigned long in2, unsigned long addr) ++{ ++ switch (in1) ++ { ++ case (1): ++ free_page(addr); ++ break; ++ case (2): ++ kfree((void*)addr); ++ break; ++ default: ++ printk("test_free invoked with args in1=%lu in2=%lu\n", ++ in1, in2); ++ break; ++ } ++} ++ ++#define MADV_TEST 25 ++static noinline int alloc_bench(unsigned long in1, unsigned long in2) ++{ ++ int i, batch, iter; ++ unsigned long addr[10]; ++ ++ for (iter = 0; iter < 1000000; iter++) { ++ size_t size = 8; ++ for (batch = 0; batch < 30; batch++) { ++ for (i = 0; i < 10; i++) { ++ addr[i] = test_alloc(in1, in2, size); ++ } ++ for (i = 0; i < 10; i++) { ++ test_free(in1, in2, addr[i]); ++ } ++ size += 8; ++ } ++ if (fatal_signal_pending(current)) ++ return -EINTR; ++ //cond_resched(); ++ } ++ ++ return 0; ++} ++ + /* + * The madvise(2) system call. + * +@@ -1390,6 +1448,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh + size_t len; + struct blk_plug plug; + ++ if (behavior == MADV_TEST) ++ return alloc_bench(start, len_in); ++ + if (!madvise_behavior_valid(behavior)) + return -EINVAL; + +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index 4b27e245a..f2a7fe718 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2892,13 +2892,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) + } + + #ifdef CONFIG_MEMCG_KMEM +-/* +- * The allocated objcg pointers array is not accounted directly. +- * Moreover, it should not come from DMA buffer and is not readily +- * reclaimable. So those GFP bits should be masked off. +- */ +-#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) +- + /* + * mod_objcg_mlstate() may be called with irq enabled, so + * mod_memcg_lruvec_state() should be used. +@@ -2917,62 +2910,27 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, + rcu_read_unlock(); + } + +-int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, +- gfp_t gfp, bool new_slab) +-{ +- unsigned int objects = objs_per_slab(s, slab); +- unsigned long memcg_data; +- void *vec; +- +- gfp &= ~OBJCGS_CLEAR_MASK; +- vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, +- slab_nid(slab)); +- if (!vec) +- return -ENOMEM; +- +- memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; +- if (new_slab) { +- /* +- * If the slab is brand new and nobody can yet access its +- * memcg_data, no synchronization is required and memcg_data can +- * be simply assigned. +- */ +- slab->memcg_data = memcg_data; +- } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { +- /* +- * If the slab is already in use, somebody can allocate and +- * assign obj_cgroups in parallel. 
In this case the existing +- * objcg vector should be reused. +- */ +- kfree(vec); +- return 0; +- } +- +- kmemleak_not_leak(vec); +- return 0; +-} +- + static __always_inline + struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) + { + /* + * Slab objects are accounted individually, not per-page. + * Memcg membership data for each individual object is saved in +- * slab->memcg_data. ++ * slab->obj_exts. + */ + if (folio_test_slab(folio)) { +- struct obj_cgroup **objcgs; ++ struct slabobj_ext *obj_exts; + struct slab *slab; + unsigned int off; + + slab = folio_slab(folio); +- objcgs = slab_objcgs(slab); +- if (!objcgs) ++ obj_exts = slab_obj_exts(slab); ++ if (!obj_exts) + return NULL; + + off = obj_to_index(slab->slab_cache, slab, p); +- if (objcgs[off]) +- return obj_cgroup_memcg(objcgs[off]); ++ if (obj_exts[off].objcg) ++ return obj_cgroup_memcg(obj_exts[off].objcg); + + return NULL; + } +@@ -2980,7 +2938,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) + /* + * folio_memcg_check() is used here, because in theory we can encounter + * a folio where the slab flag has been cleared already, but +- * slab->memcg_data has not been freed yet ++ * slab->obj_exts has not been freed yet + * folio_memcg_check() will guarantee that a proper memory + * cgroup pointer or NULL will be returned. + */ +diff --git a/mm/mempolicy.c b/mm/mempolicy.c +index 1756389a0..aaf767767 100644 +--- a/mm/mempolicy.c ++++ b/mm/mempolicy.c +@@ -2109,7 +2109,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, + { + struct page *page; + +- page = __alloc_pages(gfp, order, nid, NULL); ++ page = __alloc_pages_noprof(gfp, order, nid, NULL); + /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return page; +@@ -2135,15 +2135,15 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, + */ + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); +- page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); ++ page = __alloc_pages_noprof(preferred_gfp, order, nid, &pol->nodes); + if (!page) +- page = __alloc_pages(gfp, order, nid, NULL); ++ page = __alloc_pages_noprof(gfp, order, nid, NULL); + + return page; + } + + /** +- * vma_alloc_folio - Allocate a folio for a VMA. ++ * vma_alloc_folio_noprof - Allocate a folio for a VMA. + * @gfp: GFP flags. + * @order: Order of the folio. + * @vma: Pointer to VMA or NULL if not available. +@@ -2157,7 +2157,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, + * + * Return: The folio on success or NULL if allocation fails. + */ +-struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, ++struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr, bool hugepage) + { + struct mempolicy *pol; +@@ -2228,7 +2228,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, + * memory with both reclaim and compact as well. 
+ */ + if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) +- folio = __folio_alloc(gfp, order, hpage_node, ++ folio = __folio_alloc_noprof(gfp, order, hpage_node, + nmask); + + goto out; +@@ -2237,15 +2237,15 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, + + nmask = policy_nodemask(gfp, pol); + preferred_nid = policy_node(gfp, pol, node); +- folio = __folio_alloc(gfp, order, preferred_nid, nmask); ++ folio = __folio_alloc_noprof(gfp, order, preferred_nid, nmask); + mpol_cond_put(pol); + out: + return folio; + } +-EXPORT_SYMBOL(vma_alloc_folio); ++EXPORT_SYMBOL(vma_alloc_folio_noprof); + + /** +- * alloc_pages - Allocate pages. ++ * alloc_pages_noprof - Allocate pages. + * @gfp: GFP flags. + * @order: Power of two of number of pages to allocate. + * +@@ -2258,7 +2258,7 @@ EXPORT_SYMBOL(vma_alloc_folio); + * flags are used. + * Return: The page on success or NULL if allocation fails. + */ +-struct page *alloc_pages(gfp_t gfp, unsigned order) ++struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) + { + struct mempolicy *pol = &default_policy; + struct page *page; +@@ -2276,23 +2276,23 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) + page = alloc_pages_preferred_many(gfp, order, + policy_node(gfp, pol, numa_node_id()), pol); + else +- page = __alloc_pages(gfp, order, ++ page = __alloc_pages_noprof(gfp, order, + policy_node(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol)); + + return page; + } +-EXPORT_SYMBOL(alloc_pages); ++EXPORT_SYMBOL(alloc_pages_noprof); + +-struct folio *folio_alloc(gfp_t gfp, unsigned order) ++struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) + { +- struct page *page = alloc_pages(gfp | __GFP_COMP, order); ++ struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); + + if (page && order > 1) + prep_transhuge_page(page); + return (struct folio *)page; + } +-EXPORT_SYMBOL(folio_alloc); ++EXPORT_SYMBOL(folio_alloc_noprof); + + static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, + struct mempolicy *pol, unsigned long nr_pages, +@@ -2311,13 +2311,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, + + for (i = 0; i < nodes; i++) { + if (delta) { +- nr_allocated = __alloc_pages_bulk(gfp, ++ nr_allocated = alloc_pages_bulk_noprof(gfp, + interleave_nodes(pol), NULL, + nr_pages_per_node + 1, NULL, + page_array); + delta--; + } else { +- nr_allocated = __alloc_pages_bulk(gfp, ++ nr_allocated = alloc_pages_bulk_noprof(gfp, + interleave_nodes(pol), NULL, + nr_pages_per_node, NULL, page_array); + } +@@ -2339,11 +2339,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, + preferred_gfp = gfp | __GFP_NOWARN; + preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); + +- nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, ++ nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, + nr_pages, NULL, page_array); + + if (nr_allocated < nr_pages) +- nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, ++ nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, + nr_pages - nr_allocated, NULL, + page_array + nr_allocated); + return nr_allocated; +@@ -2355,7 +2355,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, + * It can accelerate memory allocation especially interleaving + * allocate memory. 
+ */ +-unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, ++unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, + unsigned long nr_pages, struct page **page_array) + { + struct mempolicy *pol = &default_policy; +@@ -2371,7 +2371,7 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, + return alloc_pages_bulk_array_preferred_many(gfp, + numa_node_id(), pol, nr_pages, page_array); + +- return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), ++ return alloc_pages_bulk_noprof(gfp, policy_node(gfp, pol, numa_node_id()), + policy_nodemask(gfp, pol), nr_pages, NULL, + page_array); + } +diff --git a/mm/mempool.c b/mm/mempool.c +index 734bcf5af..4fd949178 100644 +--- a/mm/mempool.c ++++ b/mm/mempool.c +@@ -230,17 +230,17 @@ EXPORT_SYMBOL(mempool_init_node); + * + * Return: %0 on success, negative error code otherwise. + */ +-int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, +- mempool_free_t *free_fn, void *pool_data) ++int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, ++ mempool_free_t *free_fn, void *pool_data) + { + return mempool_init_node(pool, min_nr, alloc_fn, free_fn, + pool_data, GFP_KERNEL, NUMA_NO_NODE); + + } +-EXPORT_SYMBOL(mempool_init); ++EXPORT_SYMBOL(mempool_init_noprof); + + /** +- * mempool_create - create a memory pool ++ * mempool_create_node - create a memory pool + * @min_nr: the minimum number of elements guaranteed to be + * allocated for this pool. + * @alloc_fn: user-defined element-allocation function. +@@ -255,17 +255,9 @@ EXPORT_SYMBOL(mempool_init); + * + * Return: pointer to the created memory pool object or %NULL on error. + */ +-mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, +- mempool_free_t *free_fn, void *pool_data) +-{ +- return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data, +- GFP_KERNEL, NUMA_NO_NODE); +-} +-EXPORT_SYMBOL(mempool_create); +- +-mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, +- mempool_free_t *free_fn, void *pool_data, +- gfp_t gfp_mask, int node_id) ++mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, ++ mempool_free_t *free_fn, void *pool_data, ++ gfp_t gfp_mask, int node_id) + { + mempool_t *pool; + +@@ -281,7 +273,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, + + return pool; + } +-EXPORT_SYMBOL(mempool_create_node); ++EXPORT_SYMBOL(mempool_create_node_noprof); + + /** + * mempool_resize - resize an existing memory pool +@@ -377,7 +369,7 @@ EXPORT_SYMBOL(mempool_resize); + * + * Return: pointer to the allocated element or %NULL on error. + */ +-void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) ++void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) + { + void *element; + unsigned long flags; +@@ -444,7 +436,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) + finish_wait(&pool->wait, &wait); + goto repeat_alloc; + } +-EXPORT_SYMBOL(mempool_alloc); ++EXPORT_SYMBOL(mempool_alloc_noprof); + + /** + * mempool_free - return an element to the pool. 
+@@ -515,7 +507,7 @@ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) + { + struct kmem_cache *mem = pool_data; + VM_BUG_ON(mem->ctor); +- return kmem_cache_alloc(mem, gfp_mask); ++ return kmem_cache_alloc_noprof(mem, gfp_mask); + } + EXPORT_SYMBOL(mempool_alloc_slab); + +@@ -533,7 +525,7 @@ EXPORT_SYMBOL(mempool_free_slab); + void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) + { + size_t size = (size_t)pool_data; +- return kmalloc(size, gfp_mask); ++ return kmalloc_noprof(size, gfp_mask); + } + EXPORT_SYMBOL(mempool_kmalloc); + +@@ -550,7 +542,7 @@ EXPORT_SYMBOL(mempool_kfree); + void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) + { + int order = (int)(long)pool_data; +- return alloc_pages(gfp_mask, order); ++ return alloc_pages_noprof(gfp_mask, order); + } + EXPORT_SYMBOL(mempool_alloc_pages); + +diff --git a/mm/mm_init.c b/mm/mm_init.c +index 7f7f9c677..42135fad4 100644 +--- a/mm/mm_init.c ++++ b/mm/mm_init.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include "internal.h" diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 044e1eed7..f2657245e 100644 --- a/mm/oom_kill.c @@ -95995,11 +101165,571 @@ index 044e1eed7..f2657245e 100644 } if (sysctl_oom_dump_tasks) dump_tasks(oc); +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 47421bedc..e20ef7a00 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -74,6 +74,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1259,6 +1260,7 @@ static __always_inline bool free_pages_prepare(struct page *page, + __memcg_kmem_uncharge_page(page, order); + reset_page_owner(page, order); + page_table_check_free(page, order); ++ pgalloc_tag_sub(page, order); + return false; + } + +@@ -1301,6 +1303,7 @@ static __always_inline bool free_pages_prepare(struct page *page, + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + reset_page_owner(page, order); + page_table_check_free(page, order); ++ pgalloc_tag_sub(page, order); + + if (!PageHighMem(page)) { + debug_check_no_locks_freed(page_address(page), +@@ -1730,6 +1733,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, + + set_page_owner(page, order, gfp_flags); + page_table_check_alloc(page, order); ++ pgalloc_tag_add(page, current, order); + } + + static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, +@@ -2790,6 +2794,7 @@ void split_page(struct page *page, unsigned int order) + for (i = 1; i < (1 << order); i++) + set_page_refcounted(page + i); + split_page_owner(page, 1 << order); ++ pgalloc_tag_split(page, 1 << order); + split_page_memcg(page, 1 << order); + } + EXPORT_SYMBOL_GPL(split_page); +@@ -4577,7 +4582,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, + * + * Returns the number of pages on the list or array. 
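+ *
+ * (With memory allocation profiling enabled, callers reach this _noprof
+ * variant through an alloc_hooks() wrapper, as compaction_alloc() does in
+ * mm/compaction.c above, so the pages are charged to the caller's
+ * allocation tag.)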
+ */ +-unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, ++unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, + nodemask_t *nodemask, int nr_pages, + struct list_head *page_list, + struct page **page_array) +@@ -4713,7 +4718,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + pcp_trylock_finish(UP_flags); + + failed: +- page = __alloc_pages(gfp, 0, preferred_nid, nodemask); ++ page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); + if (page) { + if (page_list) + list_add(&page->lru, page_list); +@@ -4724,13 +4729,13 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, + + goto out; + } +-EXPORT_SYMBOL_GPL(__alloc_pages_bulk); ++EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); + + /* + * This is the 'heart' of the zoned buddy allocator. + */ +-struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, +- nodemask_t *nodemask) ++struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, ++ int preferred_nid, nodemask_t *nodemask) + { + struct page *page; + unsigned int alloc_flags = ALLOC_WMARK_LOW; +@@ -4792,41 +4797,41 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, + + return page; + } +-EXPORT_SYMBOL(__alloc_pages); ++EXPORT_SYMBOL(__alloc_pages_noprof); + +-struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, ++struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, + nodemask_t *nodemask) + { +- struct page *page = __alloc_pages(gfp | __GFP_COMP, order, ++ struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order, + preferred_nid, nodemask); + + if (page && order > 1) + prep_transhuge_page(page); + return (struct folio *)page; + } +-EXPORT_SYMBOL(__folio_alloc); ++EXPORT_SYMBOL(__folio_alloc_noprof); + + /* + * Common helper functions. Never use with __GFP_HIGHMEM because the returned + * address cannot represent highmem pages. Use alloc_pages and then kmap if + * you need to access high mem. + */ +-unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) ++unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order) + { + struct page *page; + +- page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); ++ page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order); + if (!page) + return 0; + return (unsigned long) page_address(page); + } +-EXPORT_SYMBOL(__get_free_pages); ++EXPORT_SYMBOL(get_free_pages_noprof); + +-unsigned long get_zeroed_page(gfp_t gfp_mask) ++unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) + { +- return __get_free_page(gfp_mask | __GFP_ZERO); ++ return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0); + } +-EXPORT_SYMBOL(get_zeroed_page); ++EXPORT_SYMBOL(get_zeroed_page_noprof); + + /** + * __free_pages - Free pages allocated with alloc_pages(). +@@ -5006,6 +5011,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, + struct page *last = page + nr; + + split_page_owner(page, 1 << order); ++ pgalloc_tag_split(page, 1 << order); + split_page_memcg(page, 1 << order); + while (page < --last) + set_page_refcounted(last); +@@ -5018,7 +5024,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, + } + + /** +- * alloc_pages_exact - allocate an exact number physically-contiguous pages. ++ * alloc_pages_exact_noprof - allocate an exact number physically-contiguous pages. 
+ * @size: the number of bytes to allocate + * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP + * +@@ -5032,7 +5038,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, + * + * Return: pointer to the allocated area or %NULL in case of error. + */ +-void *alloc_pages_exact(size_t size, gfp_t gfp_mask) ++void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) + { + unsigned int order = get_order(size); + unsigned long addr; +@@ -5040,13 +5046,13 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) + if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) + gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); + +- addr = __get_free_pages(gfp_mask, order); ++ addr = get_free_pages_noprof(gfp_mask, order); + return make_alloc_exact(addr, order, size); + } +-EXPORT_SYMBOL(alloc_pages_exact); ++EXPORT_SYMBOL(alloc_pages_exact_noprof); + + /** +- * alloc_pages_exact_nid - allocate an exact number of physically-contiguous ++ * alloc_pages_exact_nid_noprof - allocate an exact number of physically-contiguous + * pages on a node. + * @nid: the preferred node ID where memory should be allocated + * @size: the number of bytes to allocate +@@ -5057,7 +5063,7 @@ EXPORT_SYMBOL(alloc_pages_exact); + * + * Return: pointer to the allocated area or %NULL in case of error. + */ +-void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) ++void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) + { + unsigned int order = get_order(size); + struct page *p; +@@ -5065,7 +5071,7 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) + if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) + gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); + +- p = alloc_pages_node(nid, gfp_mask, order); ++ p = alloc_pages_node_noprof(nid, gfp_mask, order); + if (!p) + return NULL; + return make_alloc_exact((unsigned long)page_address(p), order, size); +@@ -6738,7 +6744,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, + } + + /** +- * alloc_contig_range() -- tries to allocate given range of pages ++ * alloc_contig_range_noprof() -- tries to allocate given range of pages + * @start: start PFN to allocate + * @end: one-past-the-last PFN to allocate + * @migratetype: migratetype of the underlying pageblocks (either +@@ -6758,7 +6764,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, + * pages which PFN is in [start, end) are allocated for the caller and + * need to be freed with free_contig_range(). 
+ */ +-int alloc_contig_range(unsigned long start, unsigned long end, ++int alloc_contig_range_noprof(unsigned long start, unsigned long end, + unsigned migratetype, gfp_t gfp_mask) + { + unsigned long outer_start, outer_end; +@@ -6882,15 +6888,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, + undo_isolate_page_range(start, end, migratetype); + return ret; + } +-EXPORT_SYMBOL(alloc_contig_range); ++EXPORT_SYMBOL(alloc_contig_range_noprof); + + static int __alloc_contig_pages(unsigned long start_pfn, + unsigned long nr_pages, gfp_t gfp_mask) + { + unsigned long end_pfn = start_pfn + nr_pages; + +- return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, +- gfp_mask); ++ return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE, ++ gfp_mask); + } + + static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, +@@ -6925,7 +6931,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, + } + + /** +- * alloc_contig_pages() -- tries to find and allocate contiguous range of pages ++ * alloc_contig_pages_noprof() -- tries to find and allocate contiguous range of pages + * @nr_pages: Number of contiguous pages to allocate + * @gfp_mask: GFP mask to limit search and used during compaction + * @nid: Target node +@@ -6945,8 +6951,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, + * + * Return: pointer to contiguous pages on success, or NULL if not successful. + */ +-struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, +- int nid, nodemask_t *nodemask) ++struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, ++ int nid, nodemask_t *nodemask) + { + unsigned long ret, pfn, flags; + struct zonelist *zonelist; +diff --git a/mm/page_ext.c b/mm/page_ext.c +index dc1626be4..6c8ad6e12 100644 +--- a/mm/page_ext.c ++++ b/mm/page_ext.c +@@ -10,6 +10,7 @@ + #include + #include + #include ++#include + + /* + * struct page extension +@@ -82,6 +83,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { + #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) + &page_idle_ops, + #endif ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++ &page_alloc_tagging_ops, ++#endif + #ifdef CONFIG_PAGE_TABLE_CHECK + &page_table_check_ops, + #endif +@@ -92,7 +96,16 @@ unsigned long page_ext_size; + static unsigned long total_usage; + static struct page_ext *lookup_page_ext(const struct page *page); + ++#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG ++/* ++ * To ensure correct allocation tagging for pages, page_ext should be available ++ * before the first page allocation. Otherwise early task stacks will be ++ * allocated before page_ext initialization and missing tags will be flagged. 
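++ *
++ * (Without CONFIG_MEM_ALLOC_PROFILING_DEBUG the same behaviour can still
++ * be requested at boot time via the early_page_ext parameter handled by
++ * setup_early_page_ext() below.)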
++ */ ++bool early_page_ext __meminitdata = true; ++#else + bool early_page_ext __meminitdata; ++#endif + static int __init setup_early_page_ext(char *str) + { + early_page_ext = true; +diff --git a/mm/page_owner.c b/mm/page_owner.c +index 31169b3e7..8b6086c66 100644 +--- a/mm/page_owner.c ++++ b/mm/page_owner.c +@@ -372,7 +372,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, + if (!memcg_data) + goto out_unlock; + +- if (memcg_data & MEMCG_DATA_OBJCGS) ++ if (memcg_data & MEMCG_DATA_OBJEXTS) + ret += scnprintf(kbuf + ret, count - ret, + "Slab cache page\n"); + +diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h +index f9847c131..c5d1d6723 100644 +--- a/mm/percpu-internal.h ++++ b/mm/percpu-internal.h +@@ -32,6 +32,19 @@ struct pcpu_block_md { + int nr_bits; /* total bits responsible for */ + }; + ++struct pcpuobj_ext { ++#ifdef CONFIG_MEMCG_KMEM ++ struct obj_cgroup *cgroup; ++#endif ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++ union codetag_ref tag; ++#endif ++}; ++ ++#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING) ++#define NEED_PCPUOBJ_EXT ++#endif ++ + struct pcpu_chunk { + #ifdef CONFIG_PERCPU_STATS + int nr_alloc; /* # of allocations */ +@@ -57,8 +70,8 @@ struct pcpu_chunk { + int end_offset; /* additional area required to + have the region end page + aligned */ +-#ifdef CONFIG_MEMCG_KMEM +- struct obj_cgroup **obj_cgroups; /* vector of object cgroups */ ++#ifdef NEED_PCPUOBJ_EXT ++ struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ + #endif + + int nr_pages; /* # of pages served by this chunk */ +@@ -67,6 +80,15 @@ struct pcpu_chunk { + unsigned long populated[]; /* populated bitmap */ + }; + ++static inline bool need_pcpuobj_ext(void) ++{ ++ if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) ++ return true; ++ if (!mem_cgroup_kmem_disabled()) ++ return true; ++ return false; ++} ++ + extern spinlock_t pcpu_lock; + + extern struct list_head *pcpu_chunk_lists; +diff --git a/mm/percpu.c b/mm/percpu.c +index 28e07ede4..2298f38d4 100644 +--- a/mm/percpu.c ++++ b/mm/percpu.c +@@ -1392,9 +1392,9 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, + panic("%s: Failed to allocate %zu bytes\n", __func__, + alloc_size); + +-#ifdef CONFIG_MEMCG_KMEM ++#ifdef NEED_PCPUOBJ_EXT + /* first chunk is free to use */ +- chunk->obj_cgroups = NULL; ++ chunk->obj_exts = NULL; + #endif + pcpu_init_md_blocks(chunk); + +@@ -1463,12 +1463,12 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) + if (!chunk->md_blocks) + goto md_blocks_fail; + +-#ifdef CONFIG_MEMCG_KMEM +- if (!mem_cgroup_kmem_disabled()) { +- chunk->obj_cgroups = ++#ifdef NEED_PCPUOBJ_EXT ++ if (need_pcpuobj_ext()) { ++ chunk->obj_exts = + pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * +- sizeof(struct obj_cgroup *), gfp); +- if (!chunk->obj_cgroups) ++ sizeof(struct pcpuobj_ext), gfp); ++ if (!chunk->obj_exts) + goto objcg_fail; + } + #endif +@@ -1480,7 +1480,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) + + return chunk; + +-#ifdef CONFIG_MEMCG_KMEM ++#ifdef NEED_PCPUOBJ_EXT + objcg_fail: + pcpu_mem_free(chunk->md_blocks); + #endif +@@ -1498,8 +1498,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) + { + if (!chunk) + return; +-#ifdef CONFIG_MEMCG_KMEM +- pcpu_mem_free(chunk->obj_cgroups); ++#ifdef NEED_PCPUOBJ_EXT ++ pcpu_mem_free(chunk->obj_exts); + #endif + pcpu_mem_free(chunk->md_blocks); + pcpu_mem_free(chunk->bound_map); +@@ -1648,8 +1648,8 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, + if (!objcg) 
+ return; + +- if (likely(chunk && chunk->obj_cgroups)) { +- chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; ++ if (likely(chunk && chunk->obj_exts)) { ++ chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg; + + rcu_read_lock(); + mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, +@@ -1665,13 +1665,13 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) + { + struct obj_cgroup *objcg; + +- if (unlikely(!chunk->obj_cgroups)) ++ if (unlikely(!chunk->obj_exts)) + return; + +- objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; ++ objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup; + if (!objcg) + return; +- chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; ++ chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL; + + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); + +@@ -1701,8 +1701,34 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) + } + #endif /* CONFIG_MEMCG_KMEM */ + ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, ++ size_t size) ++{ ++ if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) { ++ alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, ++ current->alloc_tag, size); ++ } ++} ++ ++static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) ++{ ++ if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) ++ alloc_tag_sub_noalloc(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size); ++} ++#else ++static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, ++ size_t size) ++{ ++} ++ ++static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) ++{ ++} ++#endif ++ + /** +- * pcpu_alloc - the percpu allocator ++ * pcpu_alloc_noprof - the percpu allocator + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * @reserved: allocate from the reserved chunk if available +@@ -1716,7 +1742,7 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +-static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, ++void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, + gfp_t gfp) + { + gfp_t pcpu_gfp; +@@ -1883,6 +1909,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, + + pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); + ++ pcpu_alloc_tag_alloc_hook(chunk, off, size); ++ + return ptr; + + fail_unlock: +@@ -1909,61 +1937,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, + + return NULL; + } +- +-/** +- * __alloc_percpu_gfp - allocate dynamic percpu area +- * @size: size of area to allocate in bytes +- * @align: alignment of area (max PAGE_SIZE) +- * @gfp: allocation flags +- * +- * Allocate zero-filled percpu area of @size bytes aligned at @align. If +- * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can +- * be called from any context but is a lot more likely to fail. If @gfp +- * has __GFP_NOWARN then no warning will be triggered on invalid or failed +- * allocation requests. +- * +- * RETURNS: +- * Percpu pointer to the allocated area on success, NULL on failure. 
+- */ +-void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) +-{ +- return pcpu_alloc(size, align, false, gfp); +-} +-EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); +- +-/** +- * __alloc_percpu - allocate dynamic percpu area +- * @size: size of area to allocate in bytes +- * @align: alignment of area (max PAGE_SIZE) +- * +- * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). +- */ +-void __percpu *__alloc_percpu(size_t size, size_t align) +-{ +- return pcpu_alloc(size, align, false, GFP_KERNEL); +-} +-EXPORT_SYMBOL_GPL(__alloc_percpu); +- +-/** +- * __alloc_reserved_percpu - allocate reserved percpu area +- * @size: size of area to allocate in bytes +- * @align: alignment of area (max PAGE_SIZE) +- * +- * Allocate zero-filled percpu area of @size bytes aligned at @align +- * from reserved percpu area if arch has set it up; otherwise, +- * allocation is served from the same dynamic area. Might sleep. +- * Might trigger writeouts. +- * +- * CONTEXT: +- * Does GFP_KERNEL allocation. +- * +- * RETURNS: +- * Percpu pointer to the allocated area on success, NULL on failure. +- */ +-void __percpu *__alloc_reserved_percpu(size_t size, size_t align) +-{ +- return pcpu_alloc(size, align, true, GFP_KERNEL); +-} ++EXPORT_SYMBOL_GPL(pcpu_alloc_noprof); + + /** + * pcpu_balance_free - manage the amount of free chunks +@@ -2273,6 +2247,8 @@ void free_percpu(void __percpu *ptr) + + size = pcpu_free_area(chunk, off); + ++ pcpu_alloc_tag_free_hook(chunk, off, size); ++ + pcpu_memcg_free_hook(chunk, off, size); + + /* diff --git a/lib/show_mem.c b/mm/show_mem.c -similarity index 69% +similarity index 57% rename from lib/show_mem.c rename to mm/show_mem.c -index 1485c87be..e7f05c870 100644 +index 1485c87be..de209c55d 100644 --- a/lib/show_mem.c +++ b/mm/show_mem.c @@ -7,11 +7,15 @@ @@ -96018,7 +101748,7 @@ index 1485c87be..e7f05c870 100644 printk("Mem-Info:\n"); __show_free_areas(filter, nodemask, max_zone_idx); -@@ -34,4 +38,23 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) +@@ -34,4 +38,37 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif @@ -96041,12 +101771,470 @@ index 1485c87be..e7f05c870 100644 + + kfree(buf); + } ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++ { ++ struct seq_buf s; ++ char *buf = kmalloc(4096, GFP_ATOMIC); ++ ++ if (buf) { ++ printk("Memory allocations:\n"); ++ seq_buf_init(&s, buf, 4096); ++ alloc_tags_show_mem_report(&s); ++ printk("%s", buf); ++ kfree(buf); ++ } ++ } ++#endif } +diff --git a/mm/slab.c b/mm/slab.c +index bb57f7fdb..d02d2dd27 100644 +--- a/mm/slab.c ++++ b/mm/slab.c +@@ -1232,7 +1232,7 @@ void __init kmem_cache_init(void) + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), +- SLAB_HWCACHE_ALIGN, 0, 0); ++ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); + list_add(&kmem_cache->list, &slab_caches); + slab_state = PARTIAL; + +@@ -3367,9 +3367,11 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) + static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) + { ++ struct slab *slab = virt_to_slab(objp); + bool init; + +- memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1); ++ memcg_slab_free_hook(cachep, slab, &objp, 1); ++ alloc_tagging_slab_free_hook(cachep, slab, &objp, 1); + + if (is_kfence_address(objp)) { + 
kmemleak_free_recursive(objp, cachep->flags); +@@ -3446,18 +3448,18 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, + return ret; + } + +-void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) ++void *kmem_cache_alloc_noprof(struct kmem_cache *cachep, gfp_t flags) + { + return __kmem_cache_alloc_lru(cachep, NULL, flags); + } +-EXPORT_SYMBOL(kmem_cache_alloc); ++EXPORT_SYMBOL(kmem_cache_alloc_noprof); + +-void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, ++void *kmem_cache_alloc_lru_noprof(struct kmem_cache *cachep, struct list_lru *lru, + gfp_t flags) + { + return __kmem_cache_alloc_lru(cachep, lru, flags); + } +-EXPORT_SYMBOL(kmem_cache_alloc_lru); ++EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); + + static __always_inline void + cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, +@@ -3469,8 +3471,8 @@ cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, + p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); + } + +-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, +- void **p) ++int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, ++ void **p) + { + struct obj_cgroup *objcg = NULL; + unsigned long irqflags; +@@ -3508,7 +3510,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + kmem_cache_free_bulk(s, i, p); + return 0; + } +-EXPORT_SYMBOL(kmem_cache_alloc_bulk); ++EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); + + /** + * kmem_cache_alloc_node - Allocate an object on the specified node +@@ -3523,7 +3525,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); + * + * Return: pointer to the new object or %NULL in case of error + */ +-void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) ++void *kmem_cache_alloc_node_noprof(struct kmem_cache *cachep, gfp_t flags, int nodeid) + { + void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); + +@@ -3531,7 +3533,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) + + return ret; + } +-EXPORT_SYMBOL(kmem_cache_alloc_node); ++EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); + + void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, + int nodeid, size_t orig_size, diff --git a/mm/slab.h b/mm/slab.h -index f01ac256a..063e0d346 100644 +index f01ac256a..bc2d3429d 100644 --- a/mm/slab.h +++ b/mm/slab.h -@@ -766,10 +766,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +@@ -57,8 +57,8 @@ struct slab { + #endif + + atomic_t __page_refcount; +-#ifdef CONFIG_MEMCG +- unsigned long memcg_data; ++#ifdef CONFIG_SLAB_OBJ_EXT ++ unsigned long obj_exts; + #endif + }; + +@@ -67,8 +67,8 @@ struct slab { + SLAB_MATCH(flags, __page_flags); + SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ + SLAB_MATCH(_refcount, __page_refcount); +-#ifdef CONFIG_MEMCG +-SLAB_MATCH(memcg_data, memcg_data); ++#ifdef CONFIG_SLAB_OBJ_EXT ++SLAB_MATCH(memcg_data, obj_exts); + #endif + #undef SLAB_MATCH + static_assert(sizeof(struct slab) <= sizeof(struct page)); +@@ -390,36 +390,198 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla + return false; + } + +-#ifdef CONFIG_MEMCG_KMEM ++#ifdef CONFIG_SLAB_OBJ_EXT ++ + /* +- * slab_objcgs - get the object cgroups vector associated with a slab ++ * slab_obj_exts - get the pointer to the slab object extension vector ++ * associated with a slab. 
+ * @slab: a pointer to the slab struct + * +- * Returns a pointer to the object cgroups vector associated with the slab, ++ * Returns a pointer to the object extension vector associated with the slab, + * or NULL if no such vector has been associated yet. + */ +-static inline struct obj_cgroup **slab_objcgs(struct slab *slab) ++static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) + { +- unsigned long memcg_data = READ_ONCE(slab->memcg_data); ++ unsigned long obj_exts = READ_ONCE(slab->obj_exts); + +- VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), ++#ifdef CONFIG_MEMCG ++ VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS), + slab_page(slab)); +- VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); ++ VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); + +- return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); ++#endif ++ return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); + } + +-int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, +- gfp_t gfp, bool new_slab); +-void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, +- enum node_stat_item idx, int nr); ++int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, ++ gfp_t gfp, bool new_slab); ++ ++ ++#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG ++ ++static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) ++{ ++ struct slabobj_ext *slab_exts; ++ struct slab *obj_exts_slab; ++ ++ obj_exts_slab = virt_to_slab(obj_exts); ++ slab_exts = slab_obj_exts(obj_exts_slab); ++ if (slab_exts) { ++ unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, ++ obj_exts_slab, obj_exts); ++ /* codetag should be NULL */ ++ WARN_ON(slab_exts[offs].ref.ct); ++ set_codetag_empty(&slab_exts[offs].ref); ++ } ++} ++ ++static inline void mark_failed_objexts_alloc(struct slab *slab) ++{ ++ slab->obj_exts = OBJEXTS_ALLOC_FAIL; ++} ++ ++static inline void handle_failed_objexts_alloc(unsigned long obj_exts, ++ struct slabobj_ext *vec, unsigned int objects) ++{ ++ /* ++ * If vector previously failed to allocate then we have live ++ * objects with no tag reference. Mark all references in this ++ * vector as empty to avoid warnings later on. ++ */ ++ if (obj_exts & OBJEXTS_ALLOC_FAIL) { ++ unsigned int i; ++ ++ for (i = 0; i < objects; i++) ++ set_codetag_empty(&vec[i].ref); ++ } ++} ++ ++ ++#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ ++ ++static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {} ++static inline void mark_failed_objexts_alloc(struct slab *slab) {} ++static inline void handle_failed_objexts_alloc(unsigned long obj_exts, ++ struct slabobj_ext *vec, unsigned int objects) {} ++ ++#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ ++ ++static inline bool need_slab_obj_ext(void) ++{ ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++ if (mem_alloc_profiling_enabled()) ++ return true; ++#endif ++ /* ++ * CONFIG_MEMCG_KMEM creates vector of obj_cgroup objects conditionally ++ * inside memcg_slab_post_alloc_hook. No other users for now. ++ */ ++ return false; ++} ++ ++static inline void free_slab_obj_exts(struct slab *slab) ++{ ++ struct slabobj_ext *obj_exts; ++ ++ obj_exts = slab_obj_exts(slab); ++ if (!obj_exts) ++ return; ++ ++ /* ++ * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its ++ * corresponding extension will be NULL. 
alloc_tag_sub() will throw a ++ * warning if slab has extensions but the extension of an object is ++ * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that ++ * the extension for obj_exts is expected to be NULL. ++ */ ++ mark_objexts_empty(obj_exts); ++ kfree(obj_exts); ++ slab->obj_exts = 0; ++} ++ ++static inline struct slabobj_ext * ++prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) ++{ ++ struct slab *slab; ++ ++ if (!p) ++ return NULL; ++ ++ if (!need_slab_obj_ext()) ++ return NULL; ++ ++ if (s->flags & SLAB_NO_OBJ_EXT) ++ return NULL; + +-static inline void memcg_free_slab_cgroups(struct slab *slab) ++ if (flags & __GFP_NO_OBJ_EXT) ++ return NULL; ++ ++ slab = virt_to_slab(p); ++ if (!slab_obj_exts(slab) && ++ WARN(alloc_slab_obj_exts(slab, s, flags, false), ++ "%s, %s: Failed to create slab extension vector!\n", ++ __func__, s->name)) ++ return NULL; ++ ++ return slab_obj_exts(slab) + obj_to_index(s, slab, p); ++} ++ ++#else /* CONFIG_SLAB_OBJ_EXT */ ++ ++static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) + { +- kfree(slab_objcgs(slab)); +- slab->memcg_data = 0; ++ return NULL; ++} ++ ++static inline int alloc_slab_obj_exts(struct slab *slab, ++ struct kmem_cache *s, gfp_t gfp, ++ bool new_slab) ++{ ++ return 0; ++} ++ ++static inline void free_slab_obj_exts(struct slab *slab) ++{ ++} ++ ++static inline struct slabobj_ext * ++prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) ++{ ++ return NULL; ++} ++ ++#endif /* CONFIG_SLAB_OBJ_EXT */ ++ ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++ ++static inline void alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, ++ void **p, int objects) ++{ ++ struct slabobj_ext *obj_exts; ++ int i; ++ ++ obj_exts = slab_obj_exts(slab); ++ if (!obj_exts) ++ return; ++ ++ for (i = 0; i < objects; i++) { ++ unsigned int off = obj_to_index(s, slab, p[i]); ++ ++ alloc_tag_sub(&obj_exts[off].ref, s->size); ++ } + } + ++#else ++ ++static inline void alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, ++ void **p, int objects) {} ++ ++#endif /* CONFIG_MEM_ALLOC_PROFILING */ ++ ++#ifdef CONFIG_MEMCG_KMEM ++void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, ++ enum node_stat_item idx, int nr); ++ + static inline size_t obj_full_size(struct kmem_cache *s) + { + /* +@@ -487,16 +649,15 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + if (likely(p[i])) { + slab = virt_to_slab(p[i]); + +- if (!slab_objcgs(slab) && +- memcg_alloc_slab_cgroups(slab, s, flags, +- false)) { ++ if (!slab_obj_exts(slab) && ++ alloc_slab_obj_exts(slab, s, flags, false)) { + obj_cgroup_uncharge(objcg, obj_full_size(s)); + continue; + } + + off = obj_to_index(s, slab, p[i]); + obj_cgroup_get(objcg); +- slab_objcgs(slab)[off] = objcg; ++ slab_obj_exts(slab)[off].objcg = objcg; + mod_objcg_state(objcg, slab_pgdat(slab), + cache_vmstat_idx(s), obj_full_size(s)); + } else { +@@ -509,14 +670,14 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, + static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) + { +- struct obj_cgroup **objcgs; ++ struct slabobj_ext *obj_exts; + int i; + + if (!memcg_kmem_online()) + return; + +- objcgs = slab_objcgs(slab); +- if (!objcgs) ++ obj_exts = slab_obj_exts(slab); ++ if (!obj_exts) + return; + + for (i = 0; i < objects; i++) { +@@ -524,11 +685,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + unsigned int off; + 
+ off = obj_to_index(s, slab, p[i]); +- objcg = objcgs[off]; ++ objcg = obj_exts[off].objcg; + if (!objcg) + continue; + +- objcgs[off] = NULL; ++ obj_exts[off].objcg = NULL; + obj_cgroup_uncharge(objcg, obj_full_size(s)); + mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), + -obj_full_size(s)); +@@ -537,27 +698,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, + } + + #else /* CONFIG_MEMCG_KMEM */ +-static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +-{ +- return NULL; +-} +- + static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) + { + return NULL; + } + +-static inline int memcg_alloc_slab_cgroups(struct slab *slab, +- struct kmem_cache *s, gfp_t gfp, +- bool new_slab) +-{ +- return 0; +-} +- +-static inline void memcg_free_slab_cgroups(struct slab *slab) +-{ +-} +- + static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, + struct list_lru *lru, + struct obj_cgroup **objcgp, +@@ -594,7 +739,7 @@ static __always_inline void account_slab(struct slab *slab, int order, + struct kmem_cache *s, gfp_t gfp) + { + if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) +- memcg_alloc_slab_cgroups(slab, s, gfp, true); ++ alloc_slab_obj_exts(slab, s, gfp, true); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + PAGE_SIZE << order); +@@ -603,8 +748,7 @@ static __always_inline void account_slab(struct slab *slab, int order, + static __always_inline void unaccount_slab(struct slab *slab, int order, + struct kmem_cache *s) + { +- if (memcg_kmem_online()) +- memcg_free_slab_cgroups(slab); ++ free_slab_obj_exts(slab); + + mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), + -(PAGE_SIZE << order)); +@@ -684,6 +828,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, + unsigned int orig_size) + { + unsigned int zero_size = s->object_size; ++ struct slabobj_ext *obj_exts; + size_t i; + + flags &= gfp_allowed_mask; +@@ -714,6 +859,13 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, + kmemleak_alloc_recursive(p[i], s->object_size, 1, + s->flags, flags); + kmsan_slab_alloc(s, p[i], flags); ++ obj_exts = prepare_slab_obj_exts_hook(s, flags, p[i]); ++ ++#ifdef CONFIG_MEM_ALLOC_PROFILING ++ /* obj_exts can be allocated for other reasons */ ++ if (likely(obj_exts) && mem_alloc_profiling_enabled()) ++ alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); ++#endif + } + + memcg_slab_post_alloc_hook(s, objcg, flags, size, p); +@@ -766,10 +918,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) if ((__n = get_node(__s, __node))) @@ -96062,7 +102250,7 @@ index f01ac256a..063e0d346 100644 } #endif diff --git a/mm/slab_common.c b/mm/slab_common.c -index 607249785..53d463d1b 100644 +index 607249785..5b204e16f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -24,6 +24,7 @@ @@ -96073,7 +102261,173 @@ index 607249785..53d463d1b 100644 #include #include "internal.h" -@@ -1259,10 +1260,15 @@ static int slab_show(struct seq_file *m, void *p) +@@ -204,6 +205,64 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, + return NULL; + } + ++#ifdef CONFIG_SLAB_OBJ_EXT ++/* ++ * The allocated objcg pointers array is not accounted directly. ++ * Moreover, it should not come from DMA buffer and is not readily ++ * reclaimable. So those GFP bits should be masked off. 
++ */ ++#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) ++ ++int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, ++ gfp_t gfp, bool new_slab) ++{ ++ unsigned int objects = objs_per_slab(s, slab); ++ unsigned long new_exts; ++ unsigned long old_exts; ++ struct slabobj_ext *vec; ++ ++ gfp &= ~OBJCGS_CLEAR_MASK; ++ /* Prevent recursive extension vector allocation */ ++ gfp |= __GFP_NO_OBJ_EXT; ++ vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, ++ slab_nid(slab)); ++ if (!vec) { ++ /* Mark vectors which failed to allocate */ ++ if (new_slab) ++ mark_failed_objexts_alloc(slab); ++ ++ return -ENOMEM; ++ } ++ ++ new_exts = (unsigned long)vec; ++#ifdef CONFIG_MEMCG ++ new_exts |= MEMCG_DATA_OBJEXTS; ++#endif ++ old_exts = slab->obj_exts; ++ handle_failed_objexts_alloc(old_exts, vec, objects); ++ if (new_slab) { ++ /* ++ * If the slab is brand new and nobody can yet access its ++ * obj_exts, no synchronization is required and obj_exts can ++ * be simply assigned. ++ */ ++ slab->obj_exts = new_exts; ++ } else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) { ++ /* ++ * If the slab is already in use, somebody can allocate and ++ * assign slabobj_exts in parallel. In this case the existing ++ * objcg vector should be reused. ++ */ ++ mark_objexts_empty(vec); ++ kfree(vec); ++ return 0; ++ } ++ ++ kmemleak_not_leak(vec); ++ return 0; ++} ++#endif /* CONFIG_SLAB_OBJ_EXT */ ++ + static struct kmem_cache *create_cache(const char *name, + unsigned int object_size, unsigned int align, + slab_flags_t flags, unsigned int useroffset, +@@ -968,24 +1027,24 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller + return ret; + } + +-void *__kmalloc_node(size_t size, gfp_t flags, int node) ++void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) + { + return __do_kmalloc_node(size, flags, node, _RET_IP_); + } +-EXPORT_SYMBOL(__kmalloc_node); ++EXPORT_SYMBOL(__kmalloc_node_noprof); + +-void *__kmalloc(size_t size, gfp_t flags) ++void *__kmalloc_noprof(size_t size, gfp_t flags) + { + return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); + } +-EXPORT_SYMBOL(__kmalloc); ++EXPORT_SYMBOL(__kmalloc_noprof); + +-void *__kmalloc_node_track_caller(size_t size, gfp_t flags, +- int node, unsigned long caller) ++void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, ++ int node, unsigned long caller) + { + return __do_kmalloc_node(size, flags, node, caller); + } +-EXPORT_SYMBOL(__kmalloc_node_track_caller); ++EXPORT_SYMBOL(kmalloc_node_track_caller_noprof); + + /** + * kfree - free previously allocated memory +@@ -1052,7 +1111,7 @@ size_t __ksize(const void *object) + return slab_ksize(folio_slab(folio)->slab_cache); + } + +-void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) ++void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) + { + void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, + size, _RET_IP_); +@@ -1062,9 +1121,9 @@ void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) + ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; + } +-EXPORT_SYMBOL(kmalloc_trace); ++EXPORT_SYMBOL(kmalloc_trace_noprof); + +-void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, ++void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) + { + void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); +@@ -1074,7 +1133,7 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, 
+ ret = kasan_kmalloc(s, ret, size, gfpflags); + return ret; + } +-EXPORT_SYMBOL(kmalloc_node_trace); ++EXPORT_SYMBOL(kmalloc_node_trace_noprof); + + gfp_t kmalloc_fix_flags(gfp_t flags) + { +@@ -1104,7 +1163,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) + flags = kmalloc_fix_flags(flags); + + flags |= __GFP_COMP; +- page = alloc_pages_node(node, flags, order); ++ page = alloc_pages_node_noprof(node, flags, order); + if (page) { + ptr = page_address(page); + mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, +@@ -1119,7 +1178,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) + return ptr; + } + +-void *kmalloc_large(size_t size, gfp_t flags) ++void *kmalloc_large_noprof(size_t size, gfp_t flags) + { + void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); + +@@ -1127,9 +1186,9 @@ void *kmalloc_large(size_t size, gfp_t flags) + flags, NUMA_NO_NODE); + return ret; + } +-EXPORT_SYMBOL(kmalloc_large); ++EXPORT_SYMBOL(kmalloc_large_noprof); + +-void *kmalloc_large_node(size_t size, gfp_t flags, int node) ++void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) + { + void *ret = __kmalloc_large_node(size, flags, node); + +@@ -1137,7 +1196,7 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) + flags, node); + return ret; + } +-EXPORT_SYMBOL(kmalloc_large_node); ++EXPORT_SYMBOL(kmalloc_large_node_noprof); + + #ifdef CONFIG_SLAB_FREELIST_RANDOM + /* Randomize a generic freelist */ +@@ -1259,10 +1318,15 @@ static int slab_show(struct seq_file *m, void *p) return 0; } @@ -96090,7 +102444,7 @@ index 607249785..53d463d1b 100644 /* * Here acquiring slab_mutex is risky since we don't prefer to get -@@ -1272,24 +1278,52 @@ void dump_unreclaimable_slab(void) +@@ -1272,24 +1336,52 @@ void dump_unreclaimable_slab(void) * without acquiring the mutex. */ if (!mutex_trylock(&slab_mutex)) { @@ -96138,7 +102492,7 @@ index 607249785..53d463d1b 100644 + } + + slabs_by_mem[i] = n; -+ } + } + + for (i = nr - 1; i >= 0; --i) { + seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name); @@ -96146,11 +102500,554 @@ index 607249785..53d463d1b 100644 + seq_buf_printf(out, " active: "); + seq_buf_human_readable_u64(out, slabs_by_mem[i].active); + seq_buf_putc(out, '\n'); - } ++ } + mutex_unlock(&slab_mutex); } +@@ -1356,7 +1448,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) + return (void *)p; + } + +- ret = kmalloc_track_caller(new_size, flags); ++ ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); + if (ret && p) { + /* Disable KASAN checks as the object's redzone is accessed. 
*/ + kasan_disable_current(); +@@ -1380,7 +1472,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +-void *krealloc(const void *p, size_t new_size, gfp_t flags) ++void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) + { + void *ret; + +@@ -1395,7 +1487,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) + + return ret; + } +-EXPORT_SYMBOL(krealloc); ++EXPORT_SYMBOL(krealloc_noprof); + + /** + * kfree_sensitive - Clear sensitive information in memory before freeing +diff --git a/mm/slub.c b/mm/slub.c +index c87628cd8..768b0e292 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -1781,7 +1781,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, + return kasan_slab_free(s, x, init); + } + +-static inline bool slab_free_freelist_hook(struct kmem_cache *s, ++static __always_inline bool slab_free_freelist_hook(struct kmem_cache *s, + void **head, void **tail, + int *cnt) + { +@@ -3470,18 +3470,18 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, + return ret; + } + +-void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) ++void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags) + { + return __kmem_cache_alloc_lru(s, NULL, gfpflags); + } +-EXPORT_SYMBOL(kmem_cache_alloc); ++EXPORT_SYMBOL(kmem_cache_alloc_noprof); + +-void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, ++void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags) + { + return __kmem_cache_alloc_lru(s, lru, gfpflags); + } +-EXPORT_SYMBOL(kmem_cache_alloc_lru); ++EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); + + void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t orig_size, +@@ -3491,7 +3491,7 @@ void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, + caller, orig_size); + } + +-void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) ++void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node) + { + void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); + +@@ -3499,7 +3499,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) + + return ret; + } +-EXPORT_SYMBOL(kmem_cache_alloc_node); ++EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); + + static noinline void free_to_partial_list( + struct kmem_cache *s, struct slab *slab, +@@ -3779,6 +3779,7 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, + unsigned long addr) + { + memcg_slab_free_hook(s, slab, p, cnt); ++ alloc_tagging_slab_free_hook(s, slab, p, cnt); + /* + * With KASAN enabled slab_free_freelist_hook modifies the freelist + * to remove objects, whose reuse must be delayed. +@@ -4009,8 +4010,8 @@ static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, + #endif /* CONFIG_SLUB_TINY */ + + /* Note that interrupts must be enabled when calling this function. 
*/ +-int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, +- void **p) ++int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, ++ void **p) + { + int i; + struct obj_cgroup *objcg = NULL; +@@ -4034,7 +4035,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + slab_want_init_on_alloc(flags, s), s->object_size); + return i; + } +-EXPORT_SYMBOL(kmem_cache_alloc_bulk); ++EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); + + + /* +@@ -5020,7 +5021,8 @@ void __init kmem_cache_init(void) + node_set(node, slab_nodes); + + create_boot_cache(kmem_cache_node, "kmem_cache_node", +- sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); ++ sizeof(struct kmem_cache_node), ++ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); + + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); + +@@ -5030,7 +5032,7 @@ void __init kmem_cache_init(void) + create_boot_cache(kmem_cache, "kmem_cache", + offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *), +- SLAB_HWCACHE_ALIGN, 0, 0); ++ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); + + kmem_cache = bootstrap(&boot_kmem_cache); + kmem_cache_node = bootstrap(&boot_kmem_cache_node); +diff --git a/mm/util.c b/mm/util.c +index dd12b9531..9d24b8870 100644 +--- a/mm/util.c ++++ b/mm/util.c +@@ -115,7 +115,7 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp) + EXPORT_SYMBOL(kstrndup); + + /** +- * kmemdup - duplicate region of memory ++ * kmemdup_noprof - duplicate region of memory + * + * @src: memory region to duplicate + * @len: memory region length +@@ -124,16 +124,16 @@ EXPORT_SYMBOL(kstrndup); + * Return: newly allocated copy of @src or %NULL in case of error, + * result is physically contiguous. Use kfree() to free. + */ +-void *kmemdup(const void *src, size_t len, gfp_t gfp) ++void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) + { + void *p; + +- p = kmalloc_track_caller(len, gfp); ++ p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); + if (p) + memcpy(p, src, len); + return p; + } +-EXPORT_SYMBOL(kmemdup); ++EXPORT_SYMBOL(kmemdup_noprof); + + /** + * kvmemdup - duplicate region of memory +@@ -564,7 +564,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, + EXPORT_SYMBOL(vm_mmap); + + /** +- * kvmalloc_node - attempt to allocate physically contiguous memory, but upon ++ * kvmalloc_node_noprof - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. +@@ -579,7 +579,7 @@ EXPORT_SYMBOL(vm_mmap); + * + * Return: pointer to the allocated memory of %NULL in case of failure + */ +-void *kvmalloc_node(size_t size, gfp_t flags, int node) ++void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) + { + gfp_t kmalloc_flags = flags; + void *ret; +@@ -601,7 +601,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) + kmalloc_flags &= ~__GFP_NOFAIL; + } + +- ret = kmalloc_node(size, kmalloc_flags, node); ++ ret = kmalloc_node_noprof(size, kmalloc_flags, node); + + /* + * It doesn't really make sense to fallback to vmalloc for sub page +@@ -626,11 +626,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) + * about the resulting pointer, and cannot play + * protection games. 
+ */ +- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, ++ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + node, __builtin_return_address(0)); + } +-EXPORT_SYMBOL(kvmalloc_node); ++EXPORT_SYMBOL(kvmalloc_node_noprof); + + /** + * kvfree() - Free memory. +@@ -669,7 +669,7 @@ void kvfree_sensitive(const void *addr, size_t len) + } + EXPORT_SYMBOL(kvfree_sensitive); + +-void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) ++void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) + { + void *newp; + +@@ -682,15 +682,15 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) + kvfree(p); + return newp; + } +-EXPORT_SYMBOL(kvrealloc); ++EXPORT_SYMBOL(kvrealloc_noprof); + + /** +- * __vmalloc_array - allocate memory for a virtually contiguous array. ++ * __vmalloc_array_noprof - allocate memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + * @flags: the type of memory to allocate (see kmalloc). + */ +-void *__vmalloc_array(size_t n, size_t size, gfp_t flags) ++void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) + { + size_t bytes; + +@@ -698,18 +698,18 @@ void *__vmalloc_array(size_t n, size_t size, gfp_t flags) + return NULL; + return __vmalloc(bytes, flags); + } +-EXPORT_SYMBOL(__vmalloc_array); ++EXPORT_SYMBOL(__vmalloc_array_noprof); + + /** +- * vmalloc_array - allocate memory for a virtually contiguous array. ++ * vmalloc_array_noprof - allocate memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + */ +-void *vmalloc_array(size_t n, size_t size) ++void *vmalloc_array_noprof(size_t n, size_t size) + { + return __vmalloc_array(n, size, GFP_KERNEL); + } +-EXPORT_SYMBOL(vmalloc_array); ++EXPORT_SYMBOL(vmalloc_array_noprof); + + /** + * __vcalloc - allocate and zero memory for a virtually contiguous array. +@@ -717,22 +717,22 @@ EXPORT_SYMBOL(vmalloc_array); + * @size: element size. + * @flags: the type of memory to allocate (see kmalloc). + */ +-void *__vcalloc(size_t n, size_t size, gfp_t flags) ++void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) + { + return __vmalloc_array(n, size, flags | __GFP_ZERO); + } +-EXPORT_SYMBOL(__vcalloc); ++EXPORT_SYMBOL(__vcalloc_noprof); + + /** +- * vcalloc - allocate and zero memory for a virtually contiguous array. ++ * vcalloc_noprof - allocate and zero memory for a virtually contiguous array. + * @n: number of elements. + * @size: element size. + */ +-void *vcalloc(size_t n, size_t size) ++void *vcalloc_noprof(size_t n, size_t size) + { + return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); + } +-EXPORT_SYMBOL(vcalloc); ++EXPORT_SYMBOL(vcalloc_noprof); + + /* Neutral page->mapping pointer to address_space or anon_vma or other */ + void *page_rmapping(struct page *page) +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index 1d13d7168..4c199cf9b 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -2971,12 +2971,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, + * but mempolicy wants to alloc memory by interleaving. 
+ */ + if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) +- nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, ++ nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp, + nr_pages_request, + pages + nr_allocated); + + else +- nr = alloc_pages_bulk_array_node(bulk_gfp, nid, ++ nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid, + nr_pages_request, + pages + nr_allocated); + +@@ -3006,9 +3006,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid, + break; + + if (nid == NUMA_NO_NODE) +- page = alloc_pages(alloc_gfp, order); ++ page = alloc_pages_noprof(alloc_gfp, order); + else +- page = alloc_pages_node(nid, alloc_gfp, order); ++ page = alloc_pages_node_noprof(nid, alloc_gfp, order); + if (unlikely(!page)) { + if (!nofail) + break; +@@ -3065,10 +3065,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, + + /* Please note that the recursion is strictly bounded. */ + if (array_size > PAGE_SIZE) { +- area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, ++ area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node, + area->caller); + } else { +- area->pages = kmalloc_node(array_size, nested_gfp, node); ++ area->pages = kmalloc_node_noprof(array_size, nested_gfp, node); + } + + if (!area->pages) { +@@ -3151,7 +3151,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, + } + + /** +- * __vmalloc_node_range - allocate virtually contiguous memory ++ * __vmalloc_node_range_noprof - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @start: vm area range start +@@ -3178,7 +3178,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, + * + * Return: the address of the area or %NULL on failure + */ +-void *__vmalloc_node_range(unsigned long size, unsigned long align, ++void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, + const void *caller) +@@ -3307,7 +3307,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, + } + + /** +- * __vmalloc_node - allocate virtually contiguous memory ++ * __vmalloc_node_noprof - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator +@@ -3325,10 +3325,10 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, + * + * Return: pointer to the allocated memory or %NULL on error + */ +-void *__vmalloc_node(unsigned long size, unsigned long align, ++void *__vmalloc_node_noprof(unsigned long size, unsigned long align, + gfp_t gfp_mask, int node, const void *caller) + { +- return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, ++ return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, + gfp_mask, PAGE_KERNEL, 0, node, caller); + } + /* +@@ -3337,15 +3337,15 @@ void *__vmalloc_node(unsigned long size, unsigned long align, + * than that. 
+ */ + #ifdef CONFIG_TEST_VMALLOC_MODULE +-EXPORT_SYMBOL_GPL(__vmalloc_node); ++EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); + #endif + +-void *__vmalloc(unsigned long size, gfp_t gfp_mask) ++void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) + { +- return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, ++ return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, + __builtin_return_address(0)); + } +-EXPORT_SYMBOL(__vmalloc); ++EXPORT_SYMBOL(__vmalloc_noprof); + + /** + * vmalloc - allocate virtually contiguous memory +@@ -3359,12 +3359,12 @@ EXPORT_SYMBOL(__vmalloc); + * + * Return: pointer to the allocated memory or %NULL on error + */ +-void *vmalloc(unsigned long size) ++void *vmalloc_noprof(unsigned long size) + { +- return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, ++ return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, + __builtin_return_address(0)); + } +-EXPORT_SYMBOL(vmalloc); ++EXPORT_SYMBOL(vmalloc_noprof); + + /** + * vmalloc_huge - allocate virtually contiguous memory, allow huge pages +@@ -3378,16 +3378,16 @@ EXPORT_SYMBOL(vmalloc); + * + * Return: pointer to the allocated memory or %NULL on error + */ +-void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) ++void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) + { +- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, ++ return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, + gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, + NUMA_NO_NODE, __builtin_return_address(0)); + } +-EXPORT_SYMBOL_GPL(vmalloc_huge); ++EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); + + /** +- * vzalloc - allocate virtually contiguous memory with zero fill ++ * vzalloc_noprof - allocate virtually contiguous memory with zero fill + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level +@@ -3399,12 +3399,12 @@ EXPORT_SYMBOL_GPL(vmalloc_huge); + * + * Return: pointer to the allocated memory or %NULL on error + */ +-void *vzalloc(unsigned long size) ++void *vzalloc_noprof(unsigned long size) + { +- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, ++ return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + __builtin_return_address(0)); + } +-EXPORT_SYMBOL(vzalloc); ++EXPORT_SYMBOL(vzalloc_noprof); + + /** + * vmalloc_user - allocate zeroed virtually contiguous memory for userspace +@@ -3415,17 +3415,17 @@ EXPORT_SYMBOL(vzalloc); + * + * Return: pointer to the allocated memory or %NULL on error + */ +-void *vmalloc_user(unsigned long size) ++void *vmalloc_user_noprof(unsigned long size) + { +- return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, ++ return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); + } +-EXPORT_SYMBOL(vmalloc_user); ++EXPORT_SYMBOL(vmalloc_user_noprof); + + /** +- * vmalloc_node - allocate memory on a specific node ++ * vmalloc_node_noprof - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * +@@ -3437,15 +3437,15 @@ EXPORT_SYMBOL(vmalloc_user); + * + * Return: pointer to the allocated memory or %NULL on error + */ +-void *vmalloc_node(unsigned long size, int node) ++void *vmalloc_node_noprof(unsigned long size, int node) + { +- return __vmalloc_node(size, 1, GFP_KERNEL, node, ++ return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, + __builtin_return_address(0)); + } +-EXPORT_SYMBOL(vmalloc_node); 
++EXPORT_SYMBOL(vmalloc_node_noprof); + + /** +- * vzalloc_node - allocate memory on a specific node with zero fill ++ * vzalloc_node_noprof - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * +@@ -3455,12 +3455,12 @@ EXPORT_SYMBOL(vmalloc_node); + * + * Return: pointer to the allocated memory or %NULL on error + */ +-void *vzalloc_node(unsigned long size, int node) ++void *vzalloc_node_noprof(unsigned long size, int node) + { +- return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, ++ return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node, + __builtin_return_address(0)); + } +-EXPORT_SYMBOL(vzalloc_node); ++EXPORT_SYMBOL(vzalloc_node_noprof); + + #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) + #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) +@@ -3475,7 +3475,7 @@ EXPORT_SYMBOL(vzalloc_node); + #endif + + /** +- * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) ++ * vmalloc_32_noprof - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size + * + * Allocate enough 32bit PA addressable pages to cover @size from the +@@ -3483,15 +3483,15 @@ EXPORT_SYMBOL(vzalloc_node); + * + * Return: pointer to the allocated memory or %NULL on error + */ +-void *vmalloc_32(unsigned long size) ++void *vmalloc_32_noprof(unsigned long size) + { +- return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, ++ return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, + __builtin_return_address(0)); + } +-EXPORT_SYMBOL(vmalloc_32); ++EXPORT_SYMBOL(vmalloc_32_noprof); + + /** +- * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory ++ * vmalloc_32_user_noprof - allocate zeroed virtually contiguous 32bit memory + * @size: allocation size + * + * The resulting memory area is 32bit addressable and zeroed so it can be +@@ -3499,14 +3499,14 @@ EXPORT_SYMBOL(vmalloc_32); + * + * Return: pointer to the allocated memory or %NULL on error + */ +-void *vmalloc_32_user(unsigned long size) ++void *vmalloc_32_user_noprof(unsigned long size) + { +- return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, ++ return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); + } +-EXPORT_SYMBOL(vmalloc_32_user); ++EXPORT_SYMBOL(vmalloc_32_user_noprof); + + /* + * Atomically zero bytes in the iterator. 
diff --git a/mm/vmscan.c b/mm/vmscan.c index d6802821d..a22f36ec7 100644 --- a/mm/vmscan.c @@ -96340,7 +103237,7 @@ index 7778cc97a..5341736f2 100644 +# eval_vars(X_,a/b/c) = $(X_a_b_c) $(X_a_b) $(X_a) +eval_vars = $(foreach var,$(call flatten_dirs,$(2)),$($(1)$(var))) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib -index a3ec7265f..1843da7d3 100644 +index 100a386fc..1f106c71e 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -148,7 +148,7 @@ _cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(target-stem).lds) @@ -96352,5 +103249,67 @@ index a3ec7265f..1843da7d3 100644 $(CFLAGS_GCOV)) endif +diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c +index 0d2db4117..7b7dbeb5b 100644 +--- a/scripts/kallsyms.c ++++ b/scripts/kallsyms.c +@@ -203,6 +203,11 @@ static int symbol_in_range(const struct sym_entry *s, + return 0; + } + ++static bool string_starts_with(const char *s, const char *prefix) ++{ ++ return strncmp(s, prefix, strlen(prefix)) == 0; ++} ++ + static int symbol_valid(const struct sym_entry *s) + { + const char *name = sym_name(s); +@@ -210,6 +215,14 @@ static int symbol_valid(const struct sym_entry *s) + /* if --all-symbols is not specified, then symbols outside the text + * and inittext sections are discarded */ + if (!all_symbols) { ++ /* ++ * Symbols starting with __start and __stop are used to denote ++ * section boundaries, and should always be included: ++ */ ++ if (string_starts_with(name, "__start_") || ++ string_starts_with(name, "__stop_")) ++ return 1; ++ + if (symbol_in_range(s, text_ranges, + ARRAY_SIZE(text_ranges)) == 0) + return 0; +diff --git a/scripts/module.lds.S b/scripts/module.lds.S +index bf5bcf283..45c67a099 100644 +--- a/scripts/module.lds.S ++++ b/scripts/module.lds.S +@@ -9,6 +9,8 @@ + #define DISCARD_EH_FRAME *(.eh_frame) + #endif + ++#include ++ + SECTIONS { + /DISCARD/ : { + *(.discard) +@@ -47,12 +49,17 @@ SECTIONS { + .data : { + *(.data .data.[0-9a-zA-Z_]*) + *(.data..L*) ++ CODETAG_SECTIONS() + } + + .rodata : { + *(.rodata .rodata.[0-9a-zA-Z_]*) + *(.rodata..L*) + } ++#else ++ .data : { ++ CODETAG_SECTIONS() ++ } + #endif + } + -- 2.41.0.159.g0bfa463d37 diff --git a/scripts/build.sh b/scripts/build.sh index 91d1028..643350c 100755 --- a/scripts/build.sh +++ b/scripts/build.sh @@ -2,4 +2,4 @@ echo "Pika Kernel - Building" -make -j`nproc` bindeb-pkg LOCALVERSION=-pikaos KDEB_PKGVERSION=$(make kernelversion)-2 +make -j`nproc` bindeb-pkg LOCALVERSION=-pikaos KDEB_PKGVERSION=$(make kernelversion)-23 diff --git a/scripts/source.sh b/scripts/source.sh index ecf76a6..1547b7d 100755 --- a/scripts/source.sh +++ b/scripts/source.sh @@ -2,7 +2,7 @@ echo "Pika Kernel - Getting source" -wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.4.1.tar.gz -tar -xf ./linux-6.4.1.tar.gz +wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.4.3.tar.gz +tar -xf ./linux-6.4.3.tar.gz -cd linux-6.4.1 +cd linux-6.4.3
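Note on the renaming pattern that dominates the hunks above: allocation entry points (kmalloc, kmem_cache_alloc, vmalloc, krealloc and friends) are renamed to *_noprof variants and re-exported under the new names, while the slab hooks charge and uncharge a per-call-site counter via alloc_tag_add()/alloc_tag_sub() against the tag found in current->alloc_tag. The familiar names are then expected to become wrapping macros, so each call site carries its own statically allocated tag. The following is a minimal userspace sketch of that idea, not the kernel's actual wrapper machinery; every name in it (demo_alloc, current_tag, the tag registry) is invented for illustration:

#include <stdio.h>
#include <stdlib.h>

struct alloc_tag {
	const char *file;
	int line;
	size_t bytes;			/* bytes attributed to this call site */
};

static struct alloc_tag *current_tag;	/* stand-in for current->alloc_tag */

static struct alloc_tag *tags[16];	/* tiny registry for the demo dump */
static int nr_tags;

/* The renamed, untagged entry point (cf. kmalloc_noprof() above). */
static void *demo_alloc_noprof(size_t size)
{
	void *p = malloc(size);

	if (p && current_tag)
		current_tag->bytes += size;	/* cf. alloc_tag_add() */
	return p;
}

/*
 * The caller-facing name becomes a macro: each call site expands to its
 * own static tag, which is installed around the _noprof call. Uses a GNU
 * statement expression, as kernel code routinely does.
 */
#define demo_alloc(size) ({						\
	static struct alloc_tag _tag = { __FILE__, __LINE__, 0 };	\
	static int _registered;						\
	struct alloc_tag *_old = current_tag;				\
	void *_res;							\
	if (!_registered && nr_tags < 16) {				\
		tags[nr_tags++] = &_tag;				\
		_registered = 1;					\
	}								\
	current_tag = &_tag;						\
	_res = demo_alloc_noprof(size);					\
	current_tag = _old;						\
	_res;								\
})

int main(void)
{
	void *a = demo_alloc(128);	/* attributed to this line */
	void *b = demo_alloc(64);	/* attributed separately to this one */

	for (int i = 0; i < nr_tags; i++)
		printf("%s:%d allocated %zu bytes\n",
		       tags[i]->file, tags[i]->line, tags[i]->bytes);
	free(a);
	free(b);
	return 0;
}

Built with gcc, this prints one "file:line allocated N bytes" entry per call site, which is the shape of report per-call-site allocation profiling is after.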
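A second detail worth flagging is how alloc_slab_obj_exts() publishes the freshly allocated slabobj_ext vector on an already-live slab: the vector is created without holding any lock, then installed with cmpxchg(&slab->obj_exts, old_exts, new_exts); a loser of the race frees its copy and reuses whatever the winner installed. Below is a compact sketch of that publish-or-discard idiom in C11 atomics, with illustrative types rather than the kernel's, and without the MEMCG flag bits the real code ORs into the word:

#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

struct demo_slab {
	_Atomic uintptr_t obj_exts;	/* 0 = no vector attached yet */
};

/* Attach a per-object extension vector; safe against concurrent callers. */
static int attach_obj_exts(struct demo_slab *slab, size_t objects)
{
	uintptr_t old = atomic_load(&slab->obj_exts);
	void *vec = calloc(objects, sizeof(uintptr_t));

	if (!vec)
		return -1;

	if (!atomic_compare_exchange_strong(&slab->obj_exts, &old,
					    (uintptr_t)vec)) {
		/* Lost the race: someone else's vector is already live. */
		free(vec);
	}
	return 0;
}

Reading old before the compare-and-swap, rather than assuming it is zero, mirrors the patch; it is also why the new-slab path there can skip the atomic entirely: a brand-new slab is not yet reachable by other CPUs, so plain assignment suffices.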
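Finally, the scripts/kallsyms.c and scripts/module.lds.S hunks serve the same feature: the per-call-site tags live in dedicated ELF sections (CODETAG_SECTIONS() in the module linker script), and the __start_*/__stop_* symbols that kallsyms now always keeps are the linker-provided bounds used to walk such sections. Here is a self-contained example of that linker convention on a GCC/ELF toolchain; the section and struct names are made up for the demo:

#include <stdio.h>

struct demo_tag {
	const char *file;
	int line;
};

/*
 * For any section whose name is a valid C identifier, the ELF linker
 * defines __start_<section> and __stop_<section> around its contents.
 */
extern struct demo_tag __start_demo_tags[];
extern struct demo_tag __stop_demo_tags[];

#define DEFINE_DEMO_TAG(var)						\
	static struct demo_tag var					\
	__attribute__((section("demo_tags"), used)) =			\
		{ __FILE__, __LINE__ }

DEFINE_DEMO_TAG(tag_a);
DEFINE_DEMO_TAG(tag_b);

int main(void)
{
	struct demo_tag *t;

	for (t = __start_demo_tags; t < __stop_demo_tags; t++)
		printf("tag at %s:%d\n", t->file, t->line);
	return 0;
}

The kallsyms hunk whitelists exactly these boundary-symbol prefixes even when --all-symbols is off, presumably so the tag sections can still be located by name at runtime; without the pair of bounds, a walk like the one above has nothing to iterate between.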