diff --git a/config b/config index 66566be..80519ae 100644 --- a/config +++ b/config @@ -151,7 +151,6 @@ CONFIG_SCHED_CORE=y # # CPU/Task time and stats accounting # -CONFIG_TICK_CPU_ACCOUNTING=n CONFIG_VIRT_CPU_ACCOUNTING=y CONFIG_VIRT_CPU_ACCOUNTING_GEN=y CONFIG_IRQ_TIME_ACCOUNTING=y @@ -542,7 +541,7 @@ CONFIG_X86_INTEL_TSX_MODE_AUTO=y CONFIG_X86_SGX=y CONFIG_EFI=y CONFIG_EFI_STUB=y -# CONFIG_EFI_HANDOVER_PROTOCOL is not set +CONFIG_EFI_HANDOVER_PROTOCOL=y CONFIG_EFI_MIXED=y # CONFIG_EFI_FAKE_MEMMAP is not set CONFIG_EFI_RUNTIME_MAP=y @@ -3409,6 +3408,7 @@ CONFIG_MICROSOFT_MANA=m CONFIG_NET_VENDOR_MYRI=y CONFIG_MYRI10GE=m CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m CONFIG_NET_VENDOR_NI=y CONFIG_NI_XGE_MANAGEMENT_ENET=m CONFIG_NET_VENDOR_NATSEMI=y @@ -5880,6 +5880,7 @@ CONFIG_VIDEO_V4L2_SUBDEV_API=y # CONFIG_VIDEO_ADV_DEBUG is not set # CONFIG_VIDEO_FIXED_MINOR_RANGES is not set CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_LOOPBACK=m CONFIG_V4L2_MEM2MEM_DEV=m CONFIG_V4L2_FLASH_LED_CLASS=m CONFIG_V4L2_FWNODE=m @@ -7105,6 +7106,7 @@ CONFIG_SND_HDA_CODEC_SI3054=m CONFIG_SND_HDA_GENERIC=m CONFIG_SND_HDA_POWER_SAVE_DEFAULT=1 CONFIG_SND_HDA_INTEL_HDMI_SILENT_STREAM=y +# CONFIG_SND_HDA_CTL_DEV_ID is not set # end of HD-Audio CONFIG_SND_HDA_CORE=m @@ -9002,7 +9004,6 @@ CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y CONFIG_THINKPAD_ACPI_VIDEO=y CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y CONFIG_THINKPAD_LMI=m -CONFIG_LEGION_LAPTOP=m CONFIG_INTEL_ATOMISP2_PDX86=y CONFIG_INTEL_ATOMISP2_LED=m CONFIG_INTEL_IFS=m @@ -11441,4 +11442,4 @@ CONFIG_MEMTEST=y # Rust hacking # # end of Rust hacking -# end of Kernel hacking +# end of Kernel hacking \ No newline at end of file diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index cf8faa9..c1c40d6 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,7 +1,7 @@ -From 349ff8d17d3501ab92ba911463a539cdaa50faa7 Mon Sep 17 00:00:00 2001 +From d7322fe0d4d120555d7dd3c2a6167f7f726b8738 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 15 Jan 2023 16:50:23 +0100 -Subject: [PATCH 01/15] bbr2 +Date: Fri, 10 Mar 2023 17:59:47 +0100 +Subject: [PATCH 01/16] bbr2 Signed-off-by: Peter Jung --- @@ -3281,18 +3281,18 @@ index cb79127f45c3..70e4de876a7f 100644 event = icsk->icsk_pending; -- -2.39.2 +2.40.0.rc2 -From 867183d5c6eadbbff94e6b03e03e9959787d47a6 Mon Sep 17 00:00:00 2001 +From 87439b08ac56036539528efb6da691914f41ca76 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 13 Feb 2023 09:23:53 +0100 -Subject: [PATCH 02/15] bfq +Date: Fri, 10 Mar 2023 18:00:04 +0100 +Subject: [PATCH 02/16] bfq Signed-off-by: Peter Jung --- block/bfq-cgroup.c | 101 ++++--- - block/bfq-iosched.c | 629 ++++++++++++++++++++++++++++-------------- - block/bfq-iosched.h | 144 +++++++--- + block/bfq-iosched.c | 637 ++++++++++++++++++++++++++++-------------- + block/bfq-iosched.h | 144 ++++++++-- block/bfq-wf2q.c | 2 +- block/blk-cgroup.c | 122 ++++---- block/blk-cgroup.h | 10 +- @@ -3301,7 +3301,7 @@ Signed-off-by: Peter Jung block/blk-rq-qos.h | 2 +- block/blk-throttle.c | 16 +- block/blk.h | 6 - - 11 files changed, 743 insertions(+), 386 deletions(-) + 11 files changed, 747 insertions(+), 390 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 0fbde0fc0628..59929dfd559b 100644 @@ -3448,7 +3448,7 @@ index 0fbde0fc0628..59929dfd559b 100644 } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 380e9bda2e57..c330ff5fde4c 100644 +index 380e9bda2e57..aa644973d260 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -377,20 +377,23 @@ static 
const unsigned long bfq_late_stable_merging = 600; @@ -3756,7 +3756,7 @@ index 380e9bda2e57..c330ff5fde4c 100644 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) bfq_bfqq_end_wr(bfqq); bfq_end_wr_async(bfqd); -@@ -2794,6 +2847,35 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, +@@ -2794,6 +2847,40 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, struct bfq_queue *bfqq); @@ -3767,11 +3767,11 @@ index 380e9bda2e57..c330ff5fde4c 100644 +{ + int proc_ref = min(bfqq_process_refs(bfqq), + bfqq_process_refs(stable_merge_bfqq)); -+ struct bfq_queue *new_bfqq; ++ struct bfq_queue *new_bfqq = NULL; + -+ if (idling_boosts_thr_without_issues(bfqd, bfqq) || -+ proc_ref == 0) -+ return NULL; ++ bfqq_data->stable_merge_bfqq = NULL; ++ if (idling_boosts_thr_without_issues(bfqd, bfqq) || proc_ref == 0) ++ goto out; + + /* next function will take at least one ref */ + new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); @@ -3786,13 +3786,18 @@ index 380e9bda2e57..c330ff5fde4c 100644 + new_bfqq_data->stably_merged = true; + } + } ++ ++out: ++ /* deschedule stable merge, because done or aborted here */ ++ bfq_put_stable_ref(stable_merge_bfqq); ++ + return new_bfqq; +} + /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. Return -@@ -2819,6 +2901,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, +@@ -2819,6 +2906,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, void *io_struct, bool request, struct bfq_io_cq *bic) { struct bfq_queue *in_service_bfqq, *new_bfqq; @@ -3801,7 +3806,7 @@ index 380e9bda2e57..c330ff5fde4c 100644 /* if a merge has already been setup, then proceed with that first */ if (bfqq->new_bfqq) -@@ -2840,37 +2924,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, +@@ -2840,37 +2929,18 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, * stable merging) also if bic is associated with a * sync queue, but this bfqq is async */ @@ -3816,11 +3821,10 @@ index 380e9bda2e57..c330ff5fde4c 100644 - bic->stable_merge_bfqq; - int proc_ref = min(bfqq_process_refs(bfqq), - bfqq_process_refs(stable_merge_bfqq)); -+ bfqq_data->stable_merge_bfqq; - - /* deschedule stable merge, because done or aborted here */ - bfq_put_stable_ref(stable_merge_bfqq); - +- +- /* deschedule stable merge, because done or aborted here */ +- bfq_put_stable_ref(stable_merge_bfqq); +- - bic->stable_merge_bfqq = NULL; - - if (!idling_boosts_thr_without_issues(bfqd, bfqq) && @@ -3838,7 +3842,7 @@ index 380e9bda2e57..c330ff5fde4c 100644 - return new_bfqq; - } else - return NULL; -+ bfqq_data->stable_merge_bfqq = NULL; ++ bfqq_data->stable_merge_bfqq; + + return bfq_setup_stable_merge(bfqd, bfqq, + stable_merge_bfqq, @@ -4032,10 +4036,10 @@ index 380e9bda2e57..c330ff5fde4c 100644 } } + } -+ -+ return NULL; -+} -+ + + return NULL; + } + +static struct bfq_queue * +bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx) +{ @@ -4052,10 +4056,10 @@ index 380e9bda2e57..c330ff5fde4c 100644 + return bfqq; + } + } - - return NULL; - } - ++ ++ return NULL; ++} ++ +/* + * Perform a linear scan of each actuator, until an actuator is found + * for which the following three conditions hold: the load of the @@ -5250,10 +5254,10 @@ index 1e94e404eaa8..fe09e8b4c2a8 100644 /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg diff --git 
a/block/blk-iocost.c b/block/blk-iocost.c -index 6955605629e4..22a3639a7a05 100644 +index ec7219caea16..c31d57e29bf8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c -@@ -3091,9 +3091,11 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, +@@ -3096,9 +3096,11 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, return nbytes; } @@ -5267,7 +5271,7 @@ index 6955605629e4..22a3639a7a05 100644 iocg = blkg_to_iocg(ctx.blkg); -@@ -3112,12 +3114,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, +@@ -3117,12 +3119,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); @@ -5285,7 +5289,7 @@ index 6955605629e4..22a3639a7a05 100644 } static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, -@@ -3172,19 +3176,22 @@ static const match_table_t qos_tokens = { +@@ -3177,19 +3181,22 @@ static const match_table_t qos_tokens = { static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { @@ -5314,7 +5318,7 @@ index 6955605629e4..22a3639a7a05 100644 ioc = q_to_ioc(disk->queue); if (!ioc) { ret = blk_iocost_init(disk); -@@ -3201,7 +3208,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, +@@ -3206,7 +3213,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, enable = ioc->enabled; user = ioc->user_qos_params; @@ -5323,7 +5327,7 @@ index 6955605629e4..22a3639a7a05 100644 substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; -@@ -3290,7 +3297,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, +@@ -3295,7 +3302,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); @@ -5332,7 +5336,7 @@ index 6955605629e4..22a3639a7a05 100644 return nbytes; einval: spin_unlock_irq(&ioc->lock); -@@ -3300,7 +3307,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, +@@ -3305,7 +3312,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ret = -EINVAL; err: @@ -5341,7 +5345,7 @@ index 6955605629e4..22a3639a7a05 100644 return ret; } -@@ -3351,22 +3358,25 @@ static const match_table_t i_lcoef_tokens = { +@@ -3356,22 +3363,25 @@ static const match_table_t i_lcoef_tokens = { static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { @@ -5374,7 +5378,7 @@ index 6955605629e4..22a3639a7a05 100644 if (ret) goto err; ioc = q_to_ioc(q); -@@ -3379,7 +3389,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, +@@ -3384,7 +3394,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; @@ -5383,7 +5387,7 @@ index 6955605629e4..22a3639a7a05 100644 substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; -@@ -3426,7 +3436,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, +@@ -3431,7 +3441,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); @@ -5392,7 +5396,7 @@ index 6955605629e4..22a3639a7a05 100644 return nbytes; einval: -@@ -3437,7 +3447,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, +@@ -3442,7 +3452,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ret = -EINVAL; err: 
@@ -5561,12 +5565,12 @@ index 4c3b3325219a..78f1706cddca 100644 void disk_free_zone_bitmaps(struct gendisk *disk); void disk_clear_zone_settings(struct gendisk *disk); -- -2.39.2 +2.40.0.rc2 -From 6f60a56132a8b4f7d72e8b720cd16e76b4afbe0d Mon Sep 17 00:00:00 2001 +From e44295cea72d5cefc97900011495f89f000873ac Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 13 Feb 2023 11:26:20 +0100 -Subject: [PATCH 03/15] bitmap +Subject: [PATCH 03/16] bitmap Signed-off-by: Peter Jung --- @@ -6912,12 +6916,12 @@ index bb0ee80526b2..8c04254c5284 100644 #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) int w, bit; -- -2.39.2 +2.40.0.rc2 -From 6410241f76741f457037edfe776d47fff19f7d8c Mon Sep 17 00:00:00 2001 +From 5d1ae6ec70d7e64ac75501503e3dcf229e0942fb Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 25 Feb 2023 14:40:50 +0100 -Subject: [PATCH 04/15] cachy +Date: Sat, 11 Mar 2023 14:42:34 +0100 +Subject: [PATCH 04/16] cachy Signed-off-by: Peter Jung --- @@ -6950,7 +6954,7 @@ Signed-off-by: Peter Jung drivers/i2c/busses/i2c-nct6775.c | 647 ++++++++++++++++++ drivers/i2c/busses/i2c-piix4.c | 4 +- drivers/md/dm-crypt.c | 5 + - drivers/pci/quirks.c | 101 +++ + drivers/pci/quirks.c | 103 ++- drivers/platform/x86/Kconfig | 14 + drivers/platform/x86/Makefile | 3 + drivers/platform/x86/steamdeck.c | 523 ++++++++++++++ @@ -6982,7 +6986,7 @@ Signed-off-by: Peter Jung net/ipv4/tcp_ipv4.c | 2 + scripts/Makefile.lib | 13 +- scripts/Makefile.modinst | 7 +- - 61 files changed, 2200 insertions(+), 74 deletions(-) + 61 files changed, 2200 insertions(+), 76 deletions(-) create mode 100644 arch/x86/Makefile.postlink create mode 100644 drivers/i2c/busses/i2c-nct6775.c create mode 100644 drivers/platform/x86/steamdeck.c @@ -7041,7 +7045,7 @@ index 352ff53a2306..7c210744d84c 100644 vmlinuz voffset.h diff --git a/Makefile b/Makefile -index f26824f367a9..0fe8877f9616 100644 +index 1a1d63f2a9ed..9caed88238ab 100644 --- a/Makefile +++ b/Makefile @@ -834,6 +834,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) @@ -8734,7 +8738,7 @@ index 2653516bcdef..973fe8f80051 100644 if (ret < 0) goto bad; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 285acc4aaccc..492e88a99c07 100644 +index 494fa46f5767..bcdfc072cbfb 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3612,6 +3612,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) @@ -8844,10 +8848,12 @@ index 285acc4aaccc..492e88a99c07 100644 /* * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be * prevented for those affected devices. -@@ -4980,6 +5080,7 @@ static const struct pci_dev_acs_enabled { +@@ -5000,8 +5100,7 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, /* Zhaoxin Root/Downstream Ports */ { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, +- /* Wangxun nics */ +- { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, + { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, { 0 } }; @@ -9828,7 +9834,7 @@ index 0f8736991427..86a988c830ef 100644 #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 137d4abe3eda..98e2d9cc8491 100644 +index 1c240d2c99bc..98e1a7472fd2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,6 +96,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); @@ -9841,7 +9847,7 @@ index 137d4abe3eda..98e2d9cc8491 100644 static const int ngroups_max = NGROUPS_MAX; -@@ -1640,6 +1643,15 @@ static struct ctl_table kern_table[] = { +@@ -1645,6 +1648,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, @@ -10229,37 +10235,50 @@ index 4815a8e32227..6a3c36713045 100644 $(dst)/%.ko.gz: $(dst)/%.ko FORCE $(call cmd,gzip) -- -2.39.2 +2.40.0.rc2 -From 993543a17f59dc2ef259242455c5d2d0810a76df Mon Sep 17 00:00:00 2001 +From 0e45a02aaaa398cc0465a407331459f28cdb1ae9 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 15 Jan 2023 16:51:11 +0100 -Subject: [PATCH 05/15] clr +Date: Fri, 10 Mar 2023 18:00:48 +0100 +Subject: [PATCH 05/16] clr Signed-off-by: Peter Jung --- - arch/x86/kernel/tsc.c | 3 ++ - arch/x86/mm/fault.c | 4 +- - drivers/cpufreq/intel_pstate.c | 7 ++++ - drivers/idle/intel_idle.c | 50 ++++++++++++------------ - drivers/input/serio/i8042.c | 10 ++--- - drivers/net/dummy.c | 2 +- - drivers/pci/pci.c | 2 +- - drivers/powercap/intel_rapl_common.c | 2 +- - drivers/thermal/intel/intel_powerclamp.c | 10 +++++ - fs/xattr.c | 15 +++---- - include/linux/jbd2.h | 2 +- - include/linux/wait.h | 2 + - include/uapi/linux/if_bonding.h | 2 +- - init/do_mounts.c | 16 +++++++- - kernel/locking/rwsem.c | 4 +- - kernel/sched/wait.c | 24 ++++++++++++ - kernel/watchdog.c | 2 +- - lib/raid6/algos.c | 4 +- - mm/ksm.c | 11 ++++-- - net/ipv4/inet_connection_sock.c | 2 +- - net/ipv4/tcp.c | 4 +- - 21 files changed, 123 insertions(+), 55 deletions(-) + arch/x86/kernel/tsc.c | 3 + + arch/x86/mm/fault.c | 4 +- + drivers/cpufreq/intel_pstate.c | 7 + + drivers/idle/intel_idle.c | 50 ++-- + drivers/input/serio/i8042.c | 10 +- + drivers/net/dummy.c | 2 +- + drivers/pci/pci.c | 2 +- + drivers/powercap/intel_rapl_common.c | 2 +- + drivers/thermal/intel/intel_powerclamp.c | 10 + + fs/xattr.c | 15 +- + include/linux/jbd2.h | 2 +- + include/linux/rcuref.h | 89 +++++++ + include/linux/types.h | 6 + + include/linux/wait.h | 2 + + include/net/dst.h | 21 +- + include/net/sock.h | 2 +- + include/uapi/linux/if_bonding.h | 2 +- + init/do_mounts.c | 16 +- + kernel/locking/rwsem.c | 4 +- + kernel/sched/wait.c | 24 ++ + kernel/watchdog.c | 2 +- + lib/Makefile | 2 +- + lib/raid6/algos.c | 4 +- + lib/rcuref.c | 311 +++++++++++++++++++++++ + mm/ksm.c | 11 +- + net/bridge/br_nf_core.c | 2 +- + net/core/dst.c | 26 +- + net/core/rtnetlink.c | 2 +- + net/ipv4/inet_connection_sock.c | 2 +- + net/ipv4/tcp.c | 4 +- + net/ipv6/route.c | 6 +- + net/netfilter/ipvs/ip_vs_xmit.c | 4 +- + 32 files changed, 559 insertions(+), 90 deletions(-) + create mode 100644 include/linux/rcuref.h + create mode 100644 lib/rcuref.c diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a78e73da4a74..bab8a98080cf 100644 @@ -10310,10 +10329,10 @@ index fd73d6d2b808..0c0071ab3966 100644 if (max_highest_perf <= min_highest_perf) { if (cppc_perf.highest_perf > max_highest_perf) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c -index cfeb24d40d37..8d1945afa973 100644 +index f060ac7376e6..1cd277c8f77f 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c -@@ -578,7 +578,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -572,7 +572,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = 
"MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, @@ -10322,7 +10341,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -586,7 +586,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -580,7 +580,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 33, @@ -10331,7 +10350,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -594,7 +594,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -588,7 +588,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, @@ -10340,7 +10359,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -602,7 +602,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -596,7 +596,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x32", .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, @@ -10349,7 +10368,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -610,7 +610,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -604,7 +604,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 300, @@ -10358,7 +10377,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -618,7 +618,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -612,7 +612,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 600, @@ -10367,7 +10386,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -626,7 +626,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -620,7 +620,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 2600, @@ -10376,7 +10395,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -646,7 +646,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -640,7 +640,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, @@ -10385,7 +10404,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -654,7 +654,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -648,7 +648,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 40, @@ -10394,7 +10413,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -662,7 +662,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -656,7 +656,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, @@ -10403,7 +10422,7 @@ index cfeb24d40d37..8d1945afa973 100644 
.enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -670,7 +670,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -664,7 +664,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x32", .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, @@ -10412,7 +10431,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -678,7 +678,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -672,7 +672,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 300, @@ -10421,7 +10440,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -686,7 +686,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -680,7 +680,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 600, @@ -10430,7 +10449,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -694,7 +694,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -688,7 +688,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 2600, @@ -10439,7 +10458,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -715,7 +715,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -709,7 +709,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, @@ -10448,7 +10467,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -723,7 +723,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -717,7 +717,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 70, @@ -10457,7 +10476,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -731,7 +731,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -725,7 +725,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 85, @@ -10466,7 +10485,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -739,7 +739,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -733,7 +733,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x33", .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 124, @@ -10475,7 +10494,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -747,7 +747,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -741,7 +741,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 200, @@ -10484,7 +10503,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -755,7 +755,7 @@ static 
struct cpuidle_state skl_cstates[] __initdata = { +@@ -749,7 +749,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 480, @@ -10493,7 +10512,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -763,7 +763,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -757,7 +757,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 890, @@ -10502,7 +10521,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -784,7 +784,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { +@@ -778,7 +778,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, @@ -10511,7 +10530,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -813,7 +813,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { +@@ -807,7 +807,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 4, @@ -10520,7 +10539,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -821,7 +821,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { +@@ -815,7 +815,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 170, @@ -10529,7 +10548,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -987,7 +987,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { +@@ -981,7 +981,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 2, @@ -10601,7 +10620,7 @@ index c4b1b0aa438a..06b00f7a8eab 100644 /* fake multicast ability */ static void set_multicast_list(struct net_device *dev) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -index 5641786bd020..0ef504e909db 100644 +index 7a67611dc5f4..48b350fe09d8 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -62,7 +62,7 @@ struct pci_pme_device { @@ -10627,10 +10646,10 @@ index 26d00b1853b4..3e239d6548b5 100644 return -ENODEV; diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c -index b80e25ec1261..187b4ee6e9f5 100644 +index 2f4cbfdf26a0..2d297a1cfa34 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ b/drivers/thermal/intel/intel_powerclamp.c -@@ -627,6 +627,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { +@@ -636,6 +636,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { .set_cur_state = powerclamp_set_cur_state, }; @@ -10642,7 +10661,7 @@ index b80e25ec1261..187b4ee6e9f5 100644 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = { X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL), {} -@@ -636,6 +641,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); +@@ -645,6 +650,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); static int __init powerclamp_probe(void) { @@ -10696,6 +10715,118 @@ index 
2170e0cc279d..e8fa79f5bb34 100644 #ifdef CONFIG_JBD2_DEBUG /* +diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h +new file mode 100644 +index 000000000000..57ffb3c02ace +--- /dev/null ++++ b/include/linux/rcuref.h +@@ -0,0 +1,89 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef _LINUX_RCUREF_H ++#define _LINUX_RCUREF_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define RCUREF_NOREF 0x00000000 ++#define RCUREF_ONEREF 0x00000001 ++#define RCUREF_MAXREF 0x7FFFFFFF ++#define RCUREF_SATURATED 0xA0000000 ++#define RCUREF_RELEASED 0xC0000000 ++#define RCUREF_DEAD 0xE0000000 ++ ++/** ++ * rcuref_init - Initialize a rcuref reference count with the given reference count ++ * @ref: Pointer to the reference count ++ * @cnt: The initial reference count typically '1' ++ */ ++static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) ++{ ++ atomic_set(&ref->refcnt, cnt); ++} ++ ++/** ++ * rcuref_read - Read the number of held reference counts of a rcuref ++ * @ref: Pointer to the reference count ++ * ++ * Return: The number of held references (0 ... N) ++ */ ++static inline unsigned int rcuref_read(rcuref_t *ref) ++{ ++ unsigned int c = atomic_read(&ref->refcnt); ++ ++ /* Return 0 if within the DEAD zone. */ ++ return c >= RCUREF_RELEASED ? 0 : c; ++} ++ ++extern __must_check bool rcuref_get_slowpath(rcuref_t *ref, unsigned int new); ++ ++/** ++ * rcuref_get - Acquire one reference on a rcuref reference count ++ * @ref: Pointer to the reference count ++ * ++ * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. ++ * ++ * Provides no memory ordering, it is assumed the caller has guaranteed the ++ * object memory to be stable (RCU, etc.). It does provide a control dependency ++ * and thereby orders future stores. See documentation in lib/rcuref.c ++ * ++ * Return: ++ * False if the attempt to acquire a reference failed. This happens ++ * when the last reference has been put already ++ * ++ * True if a reference was successfully acquired ++ */ ++static inline __must_check bool rcuref_get(rcuref_t *ref) ++{ ++ /* ++ * Unconditionally increase the reference count. The saturation and ++ * dead zones provide enough tolerance for this. ++ */ ++ unsigned int old = atomic_fetch_add_relaxed(1, &ref->refcnt); ++ ++ /* ++ * If the old value is less than RCUREF_MAXREF, this is a valid ++ * reference. ++ * ++ * In case the original value was RCUREF_NOREF the above ++ * unconditional increment raced with a concurrent put() operation ++ * dropping the last reference. That racing put() operation ++ * subsequently fails to mark the reference count dead because the ++ * count is now elevated again and the concurrent caller is ++ * therefore not allowed to deconstruct the object. 
++ */ ++ if (likely(old < RCUREF_MAXREF)) ++ return true; ++ ++ /* Handle the cases inside the saturation and dead zones */ ++ return rcuref_get_slowpath(ref, old); ++} ++ ++extern __must_check bool rcuref_put(rcuref_t *ref); ++ ++#endif +diff --git a/include/linux/types.h b/include/linux/types.h +index ea8cf60a8a79..419baa980529 100644 +--- a/include/linux/types.h ++++ b/include/linux/types.h +@@ -175,6 +175,12 @@ typedef struct { + } atomic64_t; + #endif + ++typedef struct { ++ atomic_t refcnt; ++} rcuref_t; ++ ++#define RCUREF_INIT(i) { .refcnt = ATOMIC_INIT(i) } ++ + struct list_head { + struct list_head *next, *prev; + }; diff --git a/include/linux/wait.h b/include/linux/wait.h index a0307b516b09..edc21128f387 100644 --- a/include/linux/wait.h @@ -10716,6 +10847,82 @@ index a0307b516b09..edc21128f387 100644 long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); +diff --git a/include/net/dst.h b/include/net/dst.h +index d67fda89cd0f..0909a3306902 100644 +--- a/include/net/dst.h ++++ b/include/net/dst.h +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -65,19 +66,29 @@ struct dst_entry { + * input/output/ops or performance tanks badly + */ + #ifdef CONFIG_64BIT +- atomic_t __refcnt; /* 64-bit offset 64 */ ++ rcuref_t __refcnt; /* 64-bit offset 64 */ + #endif + int __use; + unsigned long lastuse; +- struct lwtunnel_state *lwtstate; + struct rcu_head rcu_head; + short error; + short __pad; + __u32 tclassid; + #ifndef CONFIG_64BIT +- atomic_t __refcnt; /* 32-bit offset 64 */ ++ struct lwtunnel_state *lwtstate; ++ rcuref_t __refcnt; /* 32-bit offset 64 */ + #endif + netdevice_tracker dev_tracker; ++#ifdef CONFIG_64BIT ++ /* ++ * Ensure that lwtstate is not in the same cache line as __refcnt, ++ * because that would lead to false sharing under high contention ++ * of __refcnt. This also ensures that rtable::rt_genid is not ++ * sharing the same cache-line. 
++ */ ++ int pad2[6]; ++ struct lwtunnel_state *lwtstate; ++#endif + }; + + struct dst_metrics { +@@ -228,7 +239,7 @@ static inline void dst_hold(struct dst_entry *dst) + * the placement of __refcnt in struct dst_entry + */ + BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); +- WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); ++ WARN_ON(!rcuref_get(&dst->__refcnt)); + } + + static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) +@@ -292,7 +303,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb + */ + static inline bool dst_hold_safe(struct dst_entry *dst) + { +- return atomic_inc_not_zero(&dst->__refcnt); ++ return rcuref_get(&dst->__refcnt); + } + + /** +diff --git a/include/net/sock.h b/include/net/sock.h +index c6584a352463..dbf85161c0c7 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -2159,7 +2159,7 @@ sk_dst_get(struct sock *sk) + + rcu_read_lock(); + dst = rcu_dereference(sk->sk_dst_cache); +- if (dst && !atomic_inc_not_zero(&dst->__refcnt)) ++ if (dst && !rcuref_get(&dst->__refcnt)) + dst = NULL; + rcu_read_unlock(); + return dst; diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index d174914a837d..bf8e2af101a3 100644 --- a/include/uapi/linux/if_bonding.h @@ -10765,10 +10972,10 @@ index 811e94daf0a8..06fef7f97c02 100644 md_run_setup(); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c -index 44873594de03..fe62d59f2bdc 100644 +index 84d5b649b95f..e341ca8731f7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c -@@ -755,6 +755,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) +@@ -754,6 +754,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) struct task_struct *new, *owner; unsigned long flags, new_flags; enum owner_state state; @@ -10776,7 +10983,7 @@ index 44873594de03..fe62d59f2bdc 100644 lockdep_assert_preemption_disabled(); -@@ -791,7 +792,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) +@@ -790,7 +791,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) break; } @@ -10841,6 +11048,19 @@ index 8e61f21e7e33..be1439d38f26 100644 static int __read_mostly nmi_watchdog_available; struct cpumask watchdog_cpumask __read_mostly; +diff --git a/lib/Makefile b/lib/Makefile +index 4d9461bfea42..71c9627153b8 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -47,7 +47,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ + list_sort.o uuid.o iov_iter.o clz_ctz.o \ + bsearch.o find_bit.o llist.o memweight.o kfifo.o \ + percpu-refcount.o rhashtable.o base64.o \ +- once.o refcount.o usercopy.o errseq.o bucket_locks.o \ ++ once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ + generic-radix-tree.o + obj-$(CONFIG_STRING_SELFTEST) += test_string.o + obj-y += string_helpers.o diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index a22a05c9af8a..a70bcbbd1673 100644 --- a/lib/raid6/algos.c @@ -10857,6 +11077,323 @@ index a22a05c9af8a..a70bcbbd1673 100644 if (best) { raid6_2data_recov = best->data2; +diff --git a/lib/rcuref.c b/lib/rcuref.c +new file mode 100644 +index 000000000000..34fa40618fca +--- /dev/null ++++ b/lib/rcuref.c +@@ -0,0 +1,311 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * rcuref - A scalable reference count implementation for RCU managed objects ++ * ++ * rcuref is provided to replace open coded reference count implementations ++ * based on atomic_t. It protects explicitely RCU managed objects which can ++ * be visible even after the last reference has been dropped and the object ++ * is heading towards destruction. 
++ * ++ * A common usage pattern is: ++ * ++ * get() ++ * rcu_read_lock(); ++ * p = get_ptr(); ++ * if (p && !atomic_inc_not_zero(&p->refcnt)) ++ * p = NULL; ++ * rcu_read_unlock(); ++ * return p; ++ * ++ * put() ++ * if (!atomic_dec_return(&->refcnt)) { ++ * remove_ptr(p); ++ * kfree_rcu((p, rcu); ++ * } ++ * ++ * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has ++ * O(N^2) behaviour under contention with N concurrent operations. ++ * ++ * rcuref uses atomic_fetch_add_relaxed() and atomic_fetch_sub_release() ++ * for the fast path, which scale better under contention. ++ * ++ * Why not refcount? ++ * ================= ++ * ++ * In principle it should be possible to make refcount use the rcuref ++ * scheme, but the destruction race described below cannot be prevented ++ * unless the protected object is RCU managed. ++ * ++ * Theory of operation ++ * =================== ++ * ++ * rcuref uses an unsigned integer reference counter. As long as the ++ * counter value is greater than or equal to RCUREF_ONEREF and not larger ++ * than RCUREF_MAXREF the reference is alive: ++ * ++ * NOREF ONEREF MAXREF SATURATED RELEASED DEAD ++ * 0 1 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF ++ * <---valid ------------> <-------saturation zone-------> <-----------dead zone----------> ++ * ++ * The get() and put() operations do unconditional increments and ++ * decrements. The result is checked after the operation. This optimizes ++ * for the fast path. ++ * ++ * If the reference count is saturated or dead, then the increments and ++ * decrements are not harmful as the reference count still stays in the ++ * respective zones and is always set back to STATURATED resp. DEAD. The ++ * zones have room for 2^28 racing operations in each direction, which ++ * makes it practically impossible to escape the zones. ++ * ++ * Once the last reference is dropped the reference count becomes ++ * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The ++ * slowpath then tries to set the reference count from RCUREF_NOREF to ++ * RCUREF_DEAD via a cmpxchg(). This opens a small window where a ++ * concurrent rcuref_get() can acquire the reference count and bring it ++ * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD. ++ * ++ * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in ++ * DEAD + 1, which is inside the dead zone. If that happens the reference ++ * count is put back to DEAD. ++ * ++ * The actual race is possible due to the unconditional increment and ++ * decrements in rcuref_get() and rcuref_put(): ++ * ++ * T1 T2 ++ * get() put() ++ * if (atomic_fetch_sub(1, &ref->refcnt) >= 0) ++ * succeeds-> atomic_try_cmpxchg(&ref->refcnt, -1, DEAD); ++ * ++ * old = atomic_fetch_add(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 ++ * ++ * As @old observed by T1 is within the dead zone the T1 get() fails. ++ * ++ * Possible critical states: ++ * ++ * Context Counter References Operation ++ * T1 1 1 init() ++ * T2 2 2 get() ++ * T1 1 1 put() ++ * T2 0 0 put() tries to mark dead ++ * T1 1 1 get() ++ * T2 1 1 put() mark dead fails ++ * T1 0 0 put() tries to mark dead ++ * T1 DEAD 0 put() mark dead succeeds ++ * T2 DEAD+1 0 get() fails and puts it back to DEAD ++ * ++ * Of course there are more complex scenarios, but the above illustrates ++ * the working principle. The rest is left to the imagination of the ++ * reader. 
++ * ++ * Deconstruction race ++ * =================== ++ * ++ * The release operation must be protected by prohibiting a grace period in ++ * order to prevent a possible use after free: ++ * ++ * T1 T2 ++ * put() get() ++ * // ref->refcnt = ONEREF ++ * if (atomic_fetch_sub(1, &ref->cnt) > ONEREF) ++ * return false; <- Not taken ++ * ++ * // ref->refcnt == NOREF ++ * --> preemption ++ * // Elevates ref->c to ONEREF ++ * if (!atomic_fetch_add(1, &ref->refcnt) >= NOREF) ++ * return true; <- taken ++ * ++ * if (put(&p->ref)) { <-- Succeeds ++ * remove_pointer(p); ++ * kfree_rcu(p, rcu); ++ * } ++ * ++ * RCU grace period ends, object is freed ++ * ++ * atomic_cmpxchg(&ref->refcnt, NONE, DEAD); <- UAF ++ * ++ * This is prevented by disabling preemption around the put() operation as ++ * that's in most kernel configurations cheaper than a rcu_read_lock() / ++ * rcu_read_unlock() pair and in many cases even a NOOP. In any case it ++ * prevents the grace period which keeps the object alive until all put() ++ * operations complete. ++ * ++ * Saturation protection ++ * ===================== ++ * ++ * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). ++ * Once this is exceedded the reference count becomes stale by setting it ++ * to RCUREF_SATURATED, which will cause a memory leak, but it prevents ++ * wrap arounds which obviously cause worse problems than a memory ++ * leak. When saturation is reached a warning is emitted. ++ * ++ * Race conditions ++ * =============== ++ * ++ * All reference count increment/decrement operations are unconditional and ++ * only verified after the fact. This optimizes for the good case and takes ++ * the occasional race vs. a dead or already saturated refcount into ++ * account. The saturation and dead zones are large enough to accomodate ++ * for that. ++ * ++ * Memory ordering ++ * =============== ++ * ++ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions ++ * and provide only what is strictly required for refcounts. ++ * ++ * The increments are fully relaxed; these will not provide ordering. The ++ * rationale is that whatever is used to obtain the object to increase the ++ * reference count on will provide the ordering. For locked data ++ * structures, its the lock acquire, for RCU/lockless data structures its ++ * the dependent load. ++ * ++ * rcuref_get() provides a control dependency ordering future stores which ++ * ensures that the object is not modified when acquiring a reference ++ * fails. ++ * ++ * rcuref_put() provides release order, i.e. all prior loads and stores ++ * will be issued before. It also provides a control dependency ordering ++ * against the subsequent destruction of the object. ++ * ++ * If rcuref_put() successfully dropped the last reference and marked the ++ * object DEAD it also provides acquire ordering. ++ */ ++ ++#include ++#include ++ ++/** ++ * rcuref_get_slowpath - Slowpath of rcuref_get() ++ * @ref: Pointer to the reference count ++ * @old: The reference count before the unconditional increment ++ * operation in rcuref_get() ++ * ++ * Invoked when the reference count is outside of the valid zone. ++ * ++ * Return: ++ * False if the reference count was already marked dead ++ * ++ * True if the reference count is saturated, which prevents the ++ * object from being deconstructed ever. 
++ */ ++bool rcuref_get_slowpath(rcuref_t *ref, unsigned int old) ++{ ++ /* ++ * If the reference count was already marked dead, undo the ++ * increment so it stays in the middle of the dead zone and return ++ * fail. ++ */ ++ if (old >= RCUREF_RELEASED) { ++ atomic_set(&ref->refcnt, RCUREF_DEAD); ++ return false; ++ } ++ ++ /* ++ * If it was saturated, warn and mark it so. In case the increment ++ * was already on a saturated value restore the saturation ++ * marker. This keeps it in the middle of the saturation zone and ++ * prevents the reference count from overflowing. This leaks the ++ * object memory, but prevents the obvious reference count overflow ++ * damage. ++ */ ++ WARN_ONCE(old >= RCUREF_MAXREF, "rcuref saturated - leaking memory"); ++ atomic_set(&ref->refcnt, RCUREF_SATURATED); ++ return true; ++} ++EXPORT_SYMBOL_GPL(rcuref_get_slowpath); ++ ++static __must_check bool __rcuref_put(rcuref_t *ref) ++{ ++ /* ++ * Unconditionally decrement the reference count. The saturation and ++ * dead zones provide enough tolerance for this. ++ */ ++ unsigned int old = atomic_fetch_sub_release(1, &ref->refcnt); ++ ++ /* ++ * If the old value is in the valid range and is greater than ++ * RCUREF_ONEREF, nothing to do. ++ */ ++ if (likely(old > RCUREF_ONEREF && old <= RCUREF_MAXREF)) ++ return false; ++ ++ /* Did this drop the last reference? */ ++ if (likely(old == RCUREF_ONEREF)) { ++ /* ++ * Carefully try to set the reference count to RCUREF_DEAD. ++ * ++ * This can fail if a concurrent get() operation has ++ * elevated it again or the corresponding put() even marked ++ * it dead already. Both are valid situations and do not ++ * require a retry. If this fails the caller is not ++ * allowed to deconstruct the object. ++ */ ++ if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF) ++ return false; ++ ++ /* ++ * The caller can safely schedule the object for ++ * deconstruction. Provide acquire ordering. ++ */ ++ smp_acquire__after_ctrl_dep(); ++ return true; ++ } ++ ++ /* ++ * If the reference count was already in the dead zone, then this ++ * put() operation is imbalanced. Warn, put the reference count back to ++ * DEAD and tell the caller to not deconstruct the object. ++ */ ++ if (WARN_ONCE(old >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { ++ atomic_set(&ref->refcnt, RCUREF_DEAD); ++ return false; ++ } ++ ++ /* ++ * This is a put() operation on a saturated refcount. Restore the ++ * mean saturation value and tell the caller to not deconstruct the ++ * object. ++ */ ++ atomic_set(&ref->refcnt, RCUREF_SATURATED); ++ return false; ++} ++ ++/** ++ * rcuref_put -- Release one reference for a rcuref reference count ++ * @ref: Pointer to the reference count ++ * ++ * Can be invoked from any context. ++ * ++ * Provides release memory ordering, such that prior loads and stores are done ++ * before, and provides an acquire ordering on success such that free() ++ * must come after. ++ * ++ * Return: ++ * ++ * True if this was the last reference with no future references ++ * possible. This signals the caller that it can safely schedule the ++ * object, which is protected by the reference counter, for ++ * deconstruction. ++ * ++ * False if there are still active references or the put() raced ++ * with a concurrent get()/put() pair. Caller is not allowed to ++ * deconstruct the protected object. 
++ */ ++bool rcuref_put(rcuref_t *ref) ++{ ++ bool released; ++ ++ /* ++ * Protect against a concurrent get()/put() pair which marks the ++ * reference count DEAD and schedules it for RCU free. This ++ * prevents a grace period and is cheaper than ++ * rcu_read_lock()/unlock(). ++ */ ++ preempt_disable(); ++ released = __rcuref_put(ref); ++ preempt_enable(); ++ return released; ++} ++EXPORT_SYMBOL_GPL(rcuref_put); diff --git a/mm/ksm.c b/mm/ksm.c index addf490da146..a92c9594a2d3 100644 --- a/mm/ksm.c @@ -10879,6 +11416,81 @@ index addf490da146..a92c9594a2d3 100644 } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); +diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c +index 8c69f0c95a8e..c2b628e3cc7f 100644 +--- a/net/bridge/br_nf_core.c ++++ b/net/bridge/br_nf_core.c +@@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net_bridge *br) + { + struct rtable *rt = &br->fake_rtable; + +- atomic_set(&rt->dst.__refcnt, 1); ++ rcuref_init(&rt->dst.__refcnt, 1); + rt->dst.dev = br->dev; + dst_init_metrics(&rt->dst, br_dst_default_metrics, true); + rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; +diff --git a/net/core/dst.c b/net/core/dst.c +index 6d2dd03dafa8..750440803883 100644 +--- a/net/core/dst.c ++++ b/net/core/dst.c +@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, + dst->tclassid = 0; + #endif + dst->lwtstate = NULL; +- atomic_set(&dst->__refcnt, initial_ref); ++ rcuref_init(&dst->__refcnt, initial_ref); + dst->__use = 0; + dst->lastuse = jiffies; + dst->flags = flags; +@@ -166,31 +166,15 @@ EXPORT_SYMBOL(dst_dev_put); + + void dst_release(struct dst_entry *dst) + { +- if (dst) { +- int newrefcnt; +- +- newrefcnt = atomic_dec_return(&dst->__refcnt); +- if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) +- net_warn_ratelimited("%s: dst:%p refcnt:%d\n", +- __func__, dst, newrefcnt); +- if (!newrefcnt) +- call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); +- } ++ if (dst && rcuref_put(&dst->__refcnt)) ++ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); + } + EXPORT_SYMBOL(dst_release); + + void dst_release_immediate(struct dst_entry *dst) + { +- if (dst) { +- int newrefcnt; +- +- newrefcnt = atomic_dec_return(&dst->__refcnt); +- if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) +- net_warn_ratelimited("%s: dst:%p refcnt:%d\n", +- __func__, dst, newrefcnt); +- if (!newrefcnt) +- dst_destroy(dst); +- } ++ if (dst && rcuref_put(&dst->__refcnt)) ++ dst_destroy(dst); + } + EXPORT_SYMBOL(dst_release_immediate); + +diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c +index 64289bc98887..228c54bbdecc 100644 +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -840,7 +840,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, + if (dst) { + ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); + ci.rta_used = dst->__use; +- ci.rta_clntref = atomic_read(&dst->__refcnt); ++ ci.rta_clntref = rcuref_read(&dst->__refcnt); + } + if (expires) { + unsigned long clock; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f2c43f67187d..9885bfb429a2 100644 --- a/net/ipv4/inet_connection_sock.c @@ -10907,13 +11519,66 @@ index e9e8040d6491..f9b56123b3b8 100644 init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index a6983a13dd20..8b5e3d57b08d 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -293,7 +293,7 @@ static const struct fib6_info 
fib6_null_entry_template = { + + static const struct rt6_info ip6_null_entry_template = { + .dst = { +- .__refcnt = ATOMIC_INIT(1), ++ .__refcnt = RCUREF_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -ENETUNREACH, +@@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_entry_template = { + + static const struct rt6_info ip6_prohibit_entry_template = { + .dst = { +- .__refcnt = ATOMIC_INIT(1), ++ .__refcnt = RCUREF_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -EACCES, +@@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibit_entry_template = { + + static const struct rt6_info ip6_blk_hole_entry_template = { + .dst = { +- .__refcnt = ATOMIC_INIT(1), ++ .__refcnt = RCUREF_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -EINVAL, +diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c +index 029171379884..bc9dc51828f7 100644 +--- a/net/netfilter/ipvs/ip_vs_xmit.c ++++ b/net/netfilter/ipvs/ip_vs_xmit.c +@@ -339,7 +339,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", + &dest->addr.ip, &dest_dst->dst_saddr.ip, +- atomic_read(&rt->dst.__refcnt)); ++ rcuref_read(&rt->dst.__refcnt)); + } + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.ip; +@@ -507,7 +507,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", + &dest->addr.in6, &dest_dst->dst_saddr.in6, +- atomic_read(&rt->dst.__refcnt)); ++ rcuref_read(&rt->dst.__refcnt)); + } + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.in6; -- -2.39.2 +2.40.0.rc2 -From ca9964f0e4522dd46497aaa1736c860ebff85d2e Mon Sep 17 00:00:00 2001 +From ed2979f1636e3197b42234c8acac4d20f4e2ed8e Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 25 Feb 2023 14:41:15 +0100 -Subject: [PATCH 06/15] fixes +Date: Fri, 10 Mar 2023 18:03:29 +0100 +Subject: [PATCH 06/16] fixes Signed-off-by: Peter Jung --- @@ -10923,41 +11588,41 @@ Signed-off-by: Peter Jung Documentation/leds/index.rst | 1 + Documentation/leds/ledtrig-blkdev.rst | 158 +++ arch/x86/boot/compressed/Makefile | 2 +- - arch/x86/kernel/acpi/boot.c | 19 +- + arch/x86/events/rapl.c | 20 +- + arch/x86/kernel/cpu/amd.c | 9 + arch/x86/mm/tlb.c | 2 +- - drivers/acpi/acpica/Makefile | 2 +- - drivers/bluetooth/btusb.c | 9 + - drivers/char/tpm/tpm-chip.c | 62 +- + arch/x86/net/bpf_jit_comp.c | 5 +- + drivers/bluetooth/btusb.c | 2 +- + drivers/char/tpm/tpm-chip.c | 60 +- drivers/char/tpm/tpm.h | 73 + - drivers/hwmon/nct6775-core.c | 2 +- drivers/leds/trigger/Kconfig | 9 + drivers/leds/trigger/Makefile | 1 + - drivers/leds/trigger/ledtrig-blkdev.c | 1220 +++++++++++++++++ + drivers/leds/trigger/ledtrig-blkdev.c | 1221 +++++++++++++++++ fs/eventpoll.c | 2 +- - fs/nfsd/filecache.c | 44 +- - fs/nfsd/trace.h | 31 - fs/proc/base.c | 1 + include/linux/mm_types.h | 7 +- include/linux/pageblock-flags.h | 2 +- + kernel/kheaders.c | 10 +- kernel/kthread.c | 5 + kernel/padata.c | 4 +- lib/string.c | 10 +- lib/zstd/decompress/huf_decompress.c | 2 +- mm/compaction.c | 75 +- mm/internal.h | 6 +- - mm/ksm.c | 185 ++- + mm/ksm.c | 196 ++- + mm/page_alloc.c | 22 +- mm/z3fold.c | 2 - mm/zsmalloc.c | 3 - scripts/Makefile.vmlinux_o | 2 +- sound/pci/hda/cs35l41_hda.c | 2 +- .../selftests/vm/ksm_functional_tests.c | 96 +- - 34 files changed, 1975 insertions(+), 159 deletions(-) + 34 files changed, 
1995 insertions(+), 110 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev create mode 100644 Documentation/leds/ledtrig-blkdev.rst create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block -index cd14ecb3c9a5..853cb2601242 100644 +index cd14ecb3c9a5..ad47337ac75a 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -101,6 +101,16 @@ Description: @@ -10965,7 +11630,7 @@ index cd14ecb3c9a5..853cb2601242 100644 +What: /sys/block//linked_leds -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Directory that contains symbolic links to all LEDs that @@ -10979,19 +11644,19 @@ index cd14ecb3c9a5..853cb2601242 100644 Contact: Martin K. Petersen diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev new file mode 100644 -index 000000000000..45275eb0bad3 +index 000000000000..28ce8c814fb7 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev @@ -0,0 +1,78 @@ +What: /sys/class/leds//blink_time -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Time (in milliseconds) that the LED will be on during a single + "blink". + +What: /sys/class/leds//check_interval -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Interval (in milliseconds) between checks of the block devices @@ -11001,35 +11666,35 @@ index 000000000000..45275eb0bad3 + check. + +What: /sys/class/leds//blink_on_read -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to read activity on any of its linked block devices. + +What: /sys/class/leds//blink_on_write -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to write activity on any of its linked block devices. + +What: /sys/class/leds//blink_on_discard -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to discard activity on any of its linked block devices. + +What: /sys/class/leds//blink_on_flush -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to cache flush activity on any of its linked block devices. + +What: /sys/class/leds//link_dev_by_path -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Associate a block device with this LED by writing the path to @@ -11037,7 +11702,7 @@ index 000000000000..45275eb0bad3 + Symbolic links are followed. + +What: /sys/class/leds//unlink_dev_by_path -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Remove the association between this LED and a block device by @@ -11045,7 +11710,7 @@ index 000000000000..45275eb0bad3 + this attribute. Symbolic links are followed. + +What: /sys/class/leds//unlink_dev_by_name -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Remove the association between this LED and a block device by @@ -11053,7 +11718,7 @@ index 000000000000..45275eb0bad3 + attribute. 
+ +What: /sys/class/leds//linked_devices -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Directory containing links to all block devices that are @@ -11268,50 +11933,84 @@ index d995595394bb..19d1fb601796 100644 KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h # sev.c indirectly inludes inat-table.h which is generated during -diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c -index 907cc98b1938..518bda50068c 100644 ---- a/arch/x86/kernel/acpi/boot.c -+++ b/arch/x86/kernel/acpi/boot.c -@@ -188,6 +188,17 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) - return cpu; +diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c +index 52e6e7ed4f78..f000cc16d128 100644 +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -343,14 +343,15 @@ static int rapl_pmu_event_init(struct perf_event *event) + if (event->cpu < 0) + return -EINVAL; + +- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; +- + if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) + return -EINVAL; + + cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); + bit = cfg - 1; + ++ if (bit != PERF_RAPL_PP0) ++ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; ++ + /* check event supported */ + if (!(rapl_cntr_mask & (1 << bit))) + return -EINVAL; +@@ -363,7 +364,15 @@ static int rapl_pmu_event_init(struct perf_event *event) + pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; +- event->cpu = pmu->cpu; ++ ++ /* ++ * FIXME: RAPL PMU considers events are uncore and MSRs can be read from ++ * the first available CPU of the die. But this is not true for energy-cores ++ * event. Therefore as a workaround don't consider pmu->cpu here for PERF_RAPL_PP0. ++ */ ++ if (event->event_caps & PERF_EV_CAP_READ_ACTIVE_PKG) ++ event->cpu = pmu->cpu; ++ + event->pmu_private = pmu; + event->hw.event_base = rapl_msrs[bit].msr; + event->hw.config = cfg; +@@ -537,7 +546,7 @@ static struct perf_msr intel_rapl_spr_msrs[] = { + * - want to use same event codes across both architectures + */ + static struct perf_msr amd_rapl_msrs[] = { +- [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 }, ++ [PERF_RAPL_PP0] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 }, + [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 }, +@@ -764,7 +773,8 @@ static struct rapl_model model_spr = { + }; + + static struct rapl_model model_amd_hygon = { +- .events = BIT(PERF_RAPL_PKG), ++ .events = BIT(PERF_RAPL_PP0) | ++ BIT(PERF_RAPL_PKG), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, + .rapl_msrs = amd_rapl_msrs, + }; +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index f769d6d08b43..06f2ede1544f 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -880,6 +880,15 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) + } + } + #endif ++ /* ++ * Work around Erratum 1386. The XSAVES instruction malfunctions in ++ * certain circumstances on Zen1/2 uarch, and not all parts have had ++ * updated microcode at the time of writing (March 2023). ++ * ++ * Affected parts all have no supervisor XSAVE states, meaning that ++ * the XSAVEC instruction (which works fine) is equivalent. 
++ */ ++ clear_cpu_cap(c, X86_FEATURE_XSAVES); } -+static bool __init acpi_is_processor_usable(u32 lapic_flags) -+{ -+ if (lapic_flags & ACPI_MADT_ENABLED) -+ return true; -+ -+ if (acpi_support_online_capable && (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) -+ return true; -+ -+ return false; -+} -+ - static int __init - acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) - { -@@ -212,6 +223,10 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) - if (apic_id == 0xffffffff) - return 0; - -+ /* don't register processors that cannot be onlined */ -+ if (!acpi_is_processor_usable(processor->lapic_flags)) -+ return 0; -+ - /* - * We need to register disabled CPU as well to permit - * counting disabled CPUs. This allows us to size -@@ -250,9 +265,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) - return 0; - - /* don't register processors that can not be onlined */ -- if (acpi_support_online_capable && -- !(processor->lapic_flags & ACPI_MADT_ENABLED) && -- !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) -+ if (!acpi_is_processor_usable(processor->lapic_flags)) - return 0; - - /* + static void init_amd_zn(struct cpuinfo_x86 *c) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c1e31e9a85d7..92d73ccede70 100644 --- a/arch/x86/mm/tlb.c @@ -11325,68 +12024,58 @@ index c1e31e9a85d7..92d73ccede70 100644 __flush_tlb_global(); } else { /* -diff --git a/drivers/acpi/acpica/Makefile b/drivers/acpi/acpica/Makefile -index 9e0d95d76fff..30f3fc13c29d 100644 ---- a/drivers/acpi/acpica/Makefile -+++ b/drivers/acpi/acpica/Makefile -@@ -3,7 +3,7 @@ - # Makefile for ACPICA Core interpreter - # +diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c +index b808be77635e..6e696c6b7018 100644 +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -343,9 +343,10 @@ static int emit_call(u8 **pprog, void *func, void *ip) --ccflags-y := -Os -D_LINUX -DBUILDING_ACPICA -+ccflags-y := -D_LINUX -DBUILDING_ACPICA - ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT + static int emit_rsb_call(u8 **pprog, void *func, void *ip) + { ++ void *adjusted_ip; + OPTIMIZER_HIDE_VAR(func); +- x86_call_depth_emit_accounting(pprog, func); +- return emit_patch(pprog, func, ip, 0xE8); ++ adjusted_ip = (u8 *)ip + x86_call_depth_emit_accounting(pprog, func); ++ return emit_patch(pprog, func, adjusted_ip, 0xE8); + } - # use acpi.o to put all files here into acpi.o modparam namespace + static int emit_jump(u8 **pprog, void *func, void *ip) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c -index 2ad4efdd9e40..afd2f08ffe30 100644 +index 18bc94718711..7b9ee86b4609 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c -@@ -64,6 +64,7 @@ static struct usb_driver btusb_driver; - #define BTUSB_INTEL_BROKEN_SHUTDOWN_LED BIT(24) - #define BTUSB_INTEL_BROKEN_INITIAL_NCMD BIT(25) - #define BTUSB_INTEL_NO_WBS_SUPPORT BIT(26) -+#define BTUSB_ACTIONS_SEMI BIT(27) +@@ -912,7 +912,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) + } - static const struct usb_device_id btusb_table[] = { - /* Generic Bluetooth USB device */ -@@ -677,6 +678,9 @@ static const struct usb_device_id blacklist_table[] = { - { USB_DEVICE(0x0cb5, 0xc547), .driver_info = BTUSB_REALTEK | - BTUSB_WIDEBAND_SPEECH }, - -+ /* Actions Semiconductor ATS2851 based devices */ -+ { USB_DEVICE(0x10d7, 0xb012), .driver_info = BTUSB_ACTIONS_SEMI }, -+ - /* Silicon Wave based devices */ - { USB_DEVICE(0x0c10, 0x0000), 
.driver_info = BTUSB_SWAVE }, - -@@ -4098,6 +4102,11 @@ static int btusb_probe(struct usb_interface *intf, - set_bit(BTUSB_USE_ALT3_FOR_WBS, &data->flags); - } - -+ if (id->driver_info & BTUSB_ACTIONS_SEMI) { -+ /* Support is advertised, but not implemented */ -+ set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks); -+ } -+ - if (!reset) - set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); + gpiod_set_value_cansleep(reset_gpio, 0); +- msleep(200); ++ usleep_range(USEC_PER_SEC / 2, USEC_PER_SEC); + gpiod_set_value_cansleep(reset_gpio, 1); + return; diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c -index 741d8f3e8fb3..348dd5705fbb 100644 +index 741d8f3e8fb3..c467eeae9973 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c -@@ -512,6 +512,65 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip) +@@ -512,6 +512,63 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip) return 0; } -+static bool tpm_is_rng_defective(struct tpm_chip *chip) ++/* ++ * Some AMD fTPM versions may cause stutter ++ * https://www.amd.com/en/support/kb/faq/pa-410 ++ * ++ * Fixes are available in two series of fTPM firmware: ++ * 6.x.y.z series: 6.0.18.6 + ++ * 3.x.y.z series: 3.57.y.5 + ++ */ ++static bool tpm_amd_is_rng_defective(struct tpm_chip *chip) +{ -+ int ret; -+ u64 version; + u32 val1, val2; ++ u64 version; ++ int ret; + -+ /* No known-broken TPM1 chips. */ + if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) + return false; + @@ -11394,7 +12083,6 @@ index 741d8f3e8fb3..348dd5705fbb 100644 + if (ret) + return false; + -+ /* Some AMD fTPM versions may cause stutter */ + ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val1, NULL); + if (ret) + goto release; @@ -11406,8 +12094,6 @@ index 741d8f3e8fb3..348dd5705fbb 100644 + if (ret) + goto release; + ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_2, &val2, NULL); -+ if (ret) -+ goto release; + +release: + tpm_relinquish_locality(chip); @@ -11416,13 +12102,6 @@ index 741d8f3e8fb3..348dd5705fbb 100644 + return false; + + version = ((u64)val1 << 32) | val2; -+ /* -+ * Fixes for stutter as described in -+ * https://www.amd.com/en/support/kb/faq/pa-410 -+ * are available in two series of fTPM firmware: -+ * 6.x.y.z series: 6.0.18.6 + -+ * 3.x.y.z series: 3.57.x.5 + -+ */ + if ((version >> 48) == 6) { + if (version >= 0x0006000000180006ULL) + return false; @@ -11432,6 +12111,7 @@ index 741d8f3e8fb3..348dd5705fbb 100644 + } else { + return false; + } ++ + dev_warn(&chip->dev, + "AMD fTPM version 0x%llx causes system stutter; hwrng disabled\n", + version); @@ -11442,13 +12122,13 @@ index 741d8f3e8fb3..348dd5705fbb 100644 static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) { struct tpm_chip *chip = container_of(rng, struct tpm_chip, hwrng); -@@ -521,7 +580,8 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) +@@ -521,7 +578,8 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) static int tpm_add_hwrng(struct tpm_chip *chip) { - if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip)) + if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip) || -+ tpm_is_rng_defective(chip)) ++ tpm_amd_is_rng_defective(chip)) return 0; snprintf(chip->hwrng_name, sizeof(chip->hwrng_name), @@ -11536,19 +12216,6 @@ index 24ee4e1cc452..830014a26609 100644 /* 128 bytes is an arbitrary cap. 
This could be as large as TPM_BUFSIZE - 18 * bytes, but 128 is still a relatively large number of random bytes and -diff --git a/drivers/hwmon/nct6775-core.c b/drivers/hwmon/nct6775-core.c -index da9ec6983e13..c54233f0369b 100644 ---- a/drivers/hwmon/nct6775-core.c -+++ b/drivers/hwmon/nct6775-core.c -@@ -1150,7 +1150,7 @@ static int nct6775_write_fan_div(struct nct6775_data *data, int nr) - if (err) - return err; - reg &= 0x70 >> oddshift; -- reg |= data->fan_div[nr] & (0x7 << oddshift); -+ reg |= (data->fan_div[nr] & 0x7) << oddshift; - return nct6775_write_value(data, fandiv_reg, reg); - } - diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig index dc6816d36d06..bda249068182 100644 --- a/drivers/leds/trigger/Kconfig @@ -11578,10 +12245,10 @@ index 25c4db97cdd4..d53bab5d93f1 100644 +obj-$(CONFIG_LEDS_TRIGGER_BLKDEV) += ledtrig-blkdev.o diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c new file mode 100644 -index 000000000000..8614e308fadc +index 000000000000..067eedb003b5 --- /dev/null +++ b/drivers/leds/trigger/ledtrig-blkdev.c -@@ -0,0 +1,1220 @@ +@@ -0,0 +1,1221 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* @@ -12438,7 +13105,7 @@ index 000000000000..8614e308fadc +{ + const struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + -+ return sprintf(buf, "%u\n", READ_ONCE(btl->blink_msec)); ++ return sysfs_emit(buf, "%u\n", READ_ONCE(btl->blink_msec)); +} + +/** @@ -12489,8 +13156,8 @@ index 000000000000..8614e308fadc +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + -+ return sprintf(buf, "%u\n", -+ jiffies_to_msecs(READ_ONCE(btl->check_jiffies))); ++ return sysfs_emit(buf, "%u\n", ++ jiffies_to_msecs(READ_ONCE(btl->check_jiffies))); +} + +/** @@ -12538,7 +13205,8 @@ index 000000000000..8614e308fadc +static int blkdev_trig_mode_show(const struct blkdev_trig_led *led, char *buf, + enum stat_group bit) +{ -+ return sprintf(buf, READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n"); ++ return sysfs_emit(buf, ++ READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n"); +} + +/** @@ -12815,133 +13483,6 @@ index 64659b110973..8b5ca9f8f4bb 100644 return ret; } -diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c -index c0950edb26b0..697acf5c3c68 100644 ---- a/fs/nfsd/filecache.c -+++ b/fs/nfsd/filecache.c -@@ -331,37 +331,27 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) - return nf; - } - -+/** -+ * nfsd_file_check_write_error - check for writeback errors on a file -+ * @nf: nfsd_file to check for writeback errors -+ * -+ * Check whether a nfsd_file has an unseen error. Reset the write -+ * verifier if so. 
-+ */ - static void --nfsd_file_fsync(struct nfsd_file *nf) --{ -- struct file *file = nf->nf_file; -- int ret; -- -- if (!file || !(file->f_mode & FMODE_WRITE)) -- return; -- ret = vfs_fsync(file, 1); -- trace_nfsd_file_fsync(nf, ret); -- if (ret) -- nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); --} -- --static int - nfsd_file_check_write_error(struct nfsd_file *nf) - { - struct file *file = nf->nf_file; - -- if (!file || !(file->f_mode & FMODE_WRITE)) -- return 0; -- return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); -+ if ((file->f_mode & FMODE_WRITE) && -+ filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err))) -+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); - } - - static void - nfsd_file_hash_remove(struct nfsd_file *nf) - { - trace_nfsd_file_unhash(nf); -- -- if (nfsd_file_check_write_error(nf)) -- nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); - rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, - nfsd_file_rhash_params); - } -@@ -387,23 +377,12 @@ nfsd_file_free(struct nfsd_file *nf) - this_cpu_add(nfsd_file_total_age, age); - - nfsd_file_unhash(nf); -- -- /* -- * We call fsync here in order to catch writeback errors. It's not -- * strictly required by the protocol, but an nfsd_file could get -- * evicted from the cache before a COMMIT comes in. If another -- * task were to open that file in the interim and scrape the error, -- * then the client may never see it. By calling fsync here, we ensure -- * that writeback happens before the entry is freed, and that any -- * errors reported result in the write verifier changing. -- */ -- nfsd_file_fsync(nf); -- - if (nf->nf_mark) - nfsd_file_mark_put(nf->nf_mark); - if (nf->nf_file) { - get_file(nf->nf_file); - filp_close(nf->nf_file, NULL); -+ nfsd_file_check_write_error(nf); - fput(nf->nf_file); - } - -@@ -1159,6 +1138,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, - out: - if (status == nfs_ok) { - this_cpu_inc(nfsd_file_acquisitions); -+ nfsd_file_check_write_error(nf); - *pnf = nf; - } else { - if (refcount_dec_and_test(&nf->nf_ref)) -diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h -index 8f9c82d9e075..4183819ea082 100644 ---- a/fs/nfsd/trace.h -+++ b/fs/nfsd/trace.h -@@ -1202,37 +1202,6 @@ TRACE_EVENT(nfsd_file_close, - ) - ); - --TRACE_EVENT(nfsd_file_fsync, -- TP_PROTO( -- const struct nfsd_file *nf, -- int ret -- ), -- TP_ARGS(nf, ret), -- TP_STRUCT__entry( -- __field(void *, nf_inode) -- __field(int, nf_ref) -- __field(int, ret) -- __field(unsigned long, nf_flags) -- __field(unsigned char, nf_may) -- __field(struct file *, nf_file) -- ), -- TP_fast_assign( -- __entry->nf_inode = nf->nf_inode; -- __entry->nf_ref = refcount_read(&nf->nf_ref); -- __entry->ret = ret; -- __entry->nf_flags = nf->nf_flags; -- __entry->nf_may = nf->nf_may; -- __entry->nf_file = nf->nf_file; -- ), -- TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p ret=%d", -- __entry->nf_inode, -- __entry->nf_ref, -- show_nf_flags(__entry->nf_flags), -- show_nfsd_may_flags(__entry->nf_may), -- __entry->nf_file, __entry->ret -- ) --); -- - #include "cache.h" - - TRACE_DEFINE_ENUM(RC_DROPIT); diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e479d7d202b..ac9ebe972be0 100644 --- a/fs/proc/base.c @@ -12992,6 +13533,40 @@ index 5f1ae07d724b..97cda629c9e9 100644 #endif /* CONFIG_HUGETLB_PAGE */ +diff --git a/kernel/kheaders.c b/kernel/kheaders.c +index 8f69772af77b..42163c9e94e5 100644 +--- a/kernel/kheaders.c ++++ b/kernel/kheaders.c +@@ -26,15 
+26,15 @@ asm ( + " .popsection \n" + ); + +-extern char kernel_headers_data; +-extern char kernel_headers_data_end; ++extern char kernel_headers_data[]; ++extern char kernel_headers_data_end[]; + + static ssize_t + ikheaders_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) + { +- memcpy(buf, &kernel_headers_data + off, len); ++ memcpy(buf, &kernel_headers_data[off], len); + return len; + } + +@@ -48,8 +48,8 @@ static struct bin_attribute kheaders_attr __ro_after_init = { + + static int __init ikheaders_init(void) + { +- kheaders_attr.size = (&kernel_headers_data_end - +- &kernel_headers_data); ++ kheaders_attr.size = (kernel_headers_data_end - ++ kernel_headers_data); + return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); + } + diff --git a/kernel/kthread.c b/kernel/kthread.c index f97fd01a2932..7e6751b29101 100644 --- a/kernel/kthread.c @@ -13244,7 +13819,7 @@ index bcf75a8b032d..21466d0ab22f 100644 }; diff --git a/mm/ksm.c b/mm/ksm.c -index a92c9594a2d3..c267b92b837b 100644 +index a92c9594a2d3..ee60890cf9b1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -214,6 +214,7 @@ struct ksm_rmap_item { @@ -13399,7 +13974,33 @@ index a92c9594a2d3..c267b92b837b 100644 } return err; } -@@ -2044,6 +2094,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, +@@ -988,9 +1038,15 @@ static int unmerge_and_remove_all_rmap_items(void) + + mm = mm_slot->slot.mm; + mmap_read_lock(mm); ++ ++ /* ++ * Exit right away if mm is exiting to avoid lockdep issue in ++ * the maple tree ++ */ ++ if (ksm_test_exit(mm)) ++ goto mm_exiting; ++ + for_each_vma(vmi, vma) { +- if (ksm_test_exit(mm)) +- break; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + continue; + err = unmerge_ksm_pages(vma, +@@ -999,6 +1055,7 @@ static int unmerge_and_remove_all_rmap_items(void) + goto error; + } + ++mm_exiting: + remove_trailing_rmap_items(&mm_slot->rmap_list); + mmap_read_unlock(mm); + +@@ -2044,6 +2101,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, rmap_item->mm->ksm_merging_pages++; } @@ -13442,7 +14043,7 @@ index a92c9594a2d3..c267b92b837b 100644 /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can -@@ -2055,7 +2141,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, +@@ -2055,7 +2148,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { @@ -13450,7 +14051,7 @@ index a92c9594a2d3..c267b92b837b 100644 struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; -@@ -2092,6 +2177,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2092,6 +2184,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } remove_rmap_item_from_tree(rmap_item); @@ -13458,7 +14059,7 @@ index a92c9594a2d3..c267b92b837b 100644 if (kpage) { if (PTR_ERR(kpage) == -EBUSY) -@@ -2128,29 +2214,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2128,29 +2221,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. 
*/ @@ -13495,7 +14096,7 @@ index a92c9594a2d3..c267b92b837b 100644 tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { -@@ -2214,23 +2287,39 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2214,23 +2294,39 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } } @@ -13541,7 +14142,7 @@ index a92c9594a2d3..c267b92b837b 100644 rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ -@@ -2337,6 +2426,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) +@@ -2337,6 +2433,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) } if (is_zone_device_page(*page)) goto next_page; @@ -13564,7 +14165,7 @@ index a92c9594a2d3..c267b92b837b 100644 if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); -@@ -3138,6 +3243,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, +@@ -3138,6 +3250,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); @@ -13578,7 +14179,7 @@ index a92c9594a2d3..c267b92b837b 100644 static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { -@@ -3193,6 +3305,7 @@ static struct attribute *ksm_attrs[] = { +@@ -3193,6 +3312,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, @@ -13586,6 +14187,60 @@ index a92c9594a2d3..c267b92b837b 100644 &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3bb3484563ed..3aec9a6a9cb7 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3119,6 +3119,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + { + unsigned long flags; + int i, allocated = 0; ++ struct list_head *prev_tail = list->prev; ++ struct page *pos, *n; + + spin_lock_irqsave(&zone->lock, flags); + for (i = 0; i < count; ++i) { +@@ -3127,9 +3129,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + if (unlikely(page == NULL)) + break; + +- if (unlikely(check_pcp_refill(page, order))) +- continue; +- + /* + * Split buddy pages returned by expand() are received here in + * physical page order. The page is added to the tail of +@@ -3141,7 +3140,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + * pages are ordered properly. + */ + list_add_tail(&page->pcp_list, list); +- allocated++; + if (is_migrate_cma(get_pcppage_migratetype(page))) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); +@@ -3155,6 +3153,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + */ + __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + spin_unlock_irqrestore(&zone->lock, flags); ++ ++ /* ++ * Pages are appended to the pcp list without checking to reduce the ++ * time holding the zone lock. Checking the appended pages happens right ++ * after the critical section while still holding the pcp lock. 
++ */ ++ pos = list_first_entry(prev_tail, struct page, pcp_list); ++ list_for_each_entry_safe_from(pos, n, list, pcp_list) { ++ if (unlikely(check_pcp_refill(pos, order))) { ++ list_del(&pos->pcp_list); ++ continue; ++ } ++ ++ allocated++; ++ } ++ + return allocated; + } + diff --git a/mm/z3fold.c b/mm/z3fold.c index a4de0c317ac7..0cef845d397b 100644 --- a/mm/z3fold.c @@ -13813,12 +14468,12 @@ index b11b7e5115dc..3033cd6ed3b4 100644 #ifdef __NR_userfaultfd test_unmerge_uffd_wp(); -- -2.39.2 +2.40.0.rc2 -From b773e9f32d0254f398a29134cec883652e3c4201 Mon Sep 17 00:00:00 2001 +From 50de9c32a97f479390ff525d679f224e1ceb8e3b Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 13 Feb 2023 11:27:09 +0100 -Subject: [PATCH 07/15] fs-patches +Date: Fri, 3 Mar 2023 16:59:32 +0100 +Subject: [PATCH 07/16] fs-patches Signed-off-by: Peter Jung --- @@ -13826,11 +14481,11 @@ Signed-off-by: Peter Jung block/blk-merge.c | 3 +- fs/btrfs/Makefile | 6 +- fs/btrfs/backref.c | 33 +- - fs/btrfs/bio.c | 557 +++++++++++++++++++++--- + fs/btrfs/bio.c | 557 ++++++++++++++++++++--- fs/btrfs/bio.h | 67 +-- - fs/btrfs/block-group.c | 273 ++++++++++-- + fs/btrfs/block-group.c | 276 ++++++++++-- fs/btrfs/block-group.h | 24 +- - fs/btrfs/btrfs_inode.h | 22 +- + fs/btrfs/btrfs_inode.h | 23 +- fs/btrfs/compression.c | 276 ++---------- fs/btrfs/compression.h | 3 - fs/btrfs/ctree.c | 62 ++- @@ -13838,48 +14493,53 @@ Signed-off-by: Peter Jung fs/btrfs/defrag.c | 4 +- fs/btrfs/delayed-ref.c | 24 +- fs/btrfs/delayed-ref.h | 2 +- - fs/btrfs/discard.c | 41 +- - fs/btrfs/disk-io.c | 225 +--------- + fs/btrfs/disk-io.c | 222 +--------- fs/btrfs/disk-io.h | 14 +- fs/btrfs/extent-io-tree.c | 10 +- fs/btrfs/extent-io-tree.h | 1 - fs/btrfs/extent-tree.c | 181 +++----- fs/btrfs/extent-tree.h | 81 ++++ - fs/btrfs/extent_io.c | 582 +++---------------------- + fs/btrfs/extent_io.c | 582 +++--------------------- fs/btrfs/extent_io.h | 36 +- - fs/btrfs/file-item.c | 72 ++-- + fs/btrfs/file-item.c | 72 ++- fs/btrfs/file-item.h | 8 +- - fs/btrfs/file.c | 2 +- + fs/btrfs/file.c | 13 +- fs/btrfs/free-space-tree.c | 2 +- - fs/btrfs/fs.c | 4 + - fs/btrfs/fs.h | 11 +- - fs/btrfs/inode.c | 641 ++++------------------------ + fs/btrfs/fs.h | 5 +- + fs/btrfs/inode.c | 715 ++++++------------------------ fs/btrfs/ioctl.c | 2 +- - fs/btrfs/lru_cache.c | 166 ++++++++ + fs/btrfs/lru_cache.c | 166 +++++++ fs/btrfs/lru_cache.h | 80 ++++ fs/btrfs/lzo.c | 2 +- fs/btrfs/messages.c | 30 -- fs/btrfs/messages.h | 34 -- - fs/btrfs/ordered-data.c | 25 +- - fs/btrfs/ordered-data.h | 3 +- + fs/btrfs/ordered-data.c | 71 ++- + fs/btrfs/ordered-data.h | 10 +- fs/btrfs/qgroup.c | 2 +- - fs/btrfs/raid56.c | 334 ++++++--------- + fs/btrfs/raid56.c | 334 +++++--------- fs/btrfs/raid56.h | 4 +- fs/btrfs/relocation.c | 2 +- - fs/btrfs/scrub.c | 51 ++- - fs/btrfs/send.c | 684 ++++++++++++++++-------------- + fs/btrfs/scrub.c | 2 +- + fs/btrfs/send.c | 684 ++++++++++++++-------------- fs/btrfs/super.c | 3 +- - fs/btrfs/sysfs.c | 41 +- - fs/btrfs/sysfs.h | 3 +- + fs/btrfs/sysfs.c | 12 +- fs/btrfs/tests/extent-map-tests.c | 2 +- - fs/btrfs/transaction.c | 34 ++ + fs/btrfs/transaction.c | 29 ++ fs/btrfs/transaction.h | 31 ++ fs/btrfs/tree-log.c | 87 ++-- fs/btrfs/tree-log.h | 9 +- fs/btrfs/volumes.c | 116 ++--- fs/btrfs/volumes.h | 18 - - fs/btrfs/zoned.c | 146 +++---- + fs/btrfs/zoned.c | 146 +++--- fs/btrfs/zoned.h | 20 +- + fs/ext4/extents.c | 2 +- + fs/ext4/file.c | 34 +- + fs/ext4/inode.c | 429 ++++++------------ + fs/ext4/ioctl.c | 3 - + fs/ext4/namei.c | 11 +- + 
fs/ext4/page-io.c | 10 +- + fs/ext4/super.c | 26 +- + fs/ext4/xattr.c | 137 ++++-- fs/gfs2/bmap.c | 38 +- fs/iomap/buffered-io.c | 91 ++-- fs/iomap/direct-io.c | 10 +- @@ -13887,25 +14547,27 @@ Signed-off-by: Peter Jung fs/xfs/libxfs/xfs_bmap.c | 32 +- fs/xfs/libxfs/xfs_bmap.h | 5 +- fs/xfs/libxfs/xfs_btree.c | 18 +- - fs/xfs/libxfs/xfs_refcount.c | 96 ++--- + fs/xfs/libxfs/xfs_refcount.c | 96 ++-- fs/xfs/libxfs/xfs_refcount.h | 4 +- - fs/xfs/libxfs/xfs_rmap.c | 50 ++- + fs/xfs/libxfs/xfs_rmap.c | 50 +-- fs/xfs/libxfs/xfs_rmap.h | 6 +- fs/xfs/xfs_bmap_item.c | 137 +++--- fs/xfs/xfs_error.c | 2 +- fs/xfs/xfs_error.h | 12 +- fs/xfs/xfs_extfree_item.c | 99 +++-- + fs/xfs/xfs_fsmap.c | 1 + fs/xfs/xfs_globals.c | 3 +- fs/xfs/xfs_iomap.c | 4 +- fs/xfs/xfs_refcount_item.c | 110 +++-- - fs/xfs/xfs_rmap_item.c | 142 +++---- + fs/xfs/xfs_rmap_item.c | 142 +++--- fs/xfs/xfs_sysfs.c | 12 +- fs/xfs/xfs_sysfs.h | 10 +- fs/xfs/xfs_trace.h | 15 +- include/linux/bio.h | 4 + include/linux/iomap.h | 30 +- include/trace/events/btrfs.h | 127 +++++- - 83 files changed, 2936 insertions(+), 3366 deletions(-) + include/trace/events/ext4.h | 7 - + 90 files changed, 3213 insertions(+), 3751 deletions(-) create mode 100644 fs/btrfs/lru_cache.c create mode 100644 fs/btrfs/lru_cache.h @@ -13923,7 +14585,7 @@ index 8de008c0c5ad..e2561416391c 100644 OR together the tags which represent errors which should cause panics: diff --git a/block/blk-merge.c b/block/blk-merge.c -index b7c193d67185..64bf7d9dd8e8 100644 +index 808b58129d3e..1ac782fdc55c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, @@ -14866,7 +15528,7 @@ index b12f84b3b341..873ff85817f0 100644 u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c -index 708d843daa72..5b10401d803b 100644 +index 708d843daa72..80c73137e322 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ @@ -15038,7 +15700,17 @@ index 708d843daa72..5b10401d803b 100644 if (btrfs_test_opt(fs_info, SPACE_CACHE)) { ret = load_free_space_cache(block_group); if (ret == 1) { -@@ -1816,7 +1965,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +@@ -1687,7 +1836,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) + + btrfs_info(fs_info, + "reclaiming chunk %llu with %llu%% used %llu%% unusable", +- bg->start, div_u64(bg->used * 100, bg->length), ++ bg->start, ++ div64_u64(bg->used * 100, bg->length), + div64_u64(zone_unusable * 100, bg->length)); + trace_btrfs_reclaim_block_group(bg); + ret = btrfs_relocate_chunk(fs_info, bg->start); +@@ -1816,7 +1966,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * * @fs_info: the filesystem * @chunk_start: logical address of block group @@ -15046,7 +15718,7 @@ index 708d843daa72..5b10401d803b 100644 * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical -@@ -1827,8 +1975,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +@@ -1827,8 +1976,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * block copies. 
*/ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, @@ -15056,7 +15728,7 @@ index 708d843daa72..5b10401d803b 100644 { struct extent_map *em; struct map_lookup *map; -@@ -1868,9 +2015,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, +@@ -1868,9 +2016,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, data_stripe_length)) continue; @@ -15066,7 +15738,7 @@ index 708d843daa72..5b10401d803b 100644 stripe_nr = physical - map->stripes[i].physical; stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); -@@ -1927,7 +2071,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) +@@ -1927,7 +2072,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); @@ -15075,7 +15747,7 @@ index 708d843daa72..5b10401d803b 100644 bytenr, &logical, &nr, &stripe_len); if (ret) return ret; -@@ -3330,7 +3474,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, +@@ -3330,7 +3475,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { @@ -15084,7 +15756,7 @@ index 708d843daa72..5b10401d803b 100644 cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { -@@ -3379,6 +3523,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, +@@ -3379,6 +3524,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); @@ -15092,7 +15764,7 @@ index 708d843daa72..5b10401d803b 100644 spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); -@@ -3433,32 +3578,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, +@@ -3433,32 +3579,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. 
*/ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, @@ -15151,7 +15823,7 @@ index 708d843daa72..5b10401d803b 100644 spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; -@@ -4218,3 +4373,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount +@@ -4218,3 +4374,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount bg->swap_extents -= amount; spin_unlock(&bg->lock); } @@ -15287,7 +15959,7 @@ index a02ea76fd6cf..6e4a0b429ac3 100644 + #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h -index 195c09e20609..49a92aa65de1 100644 +index 195c09e20609..87020aa58121 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -93,12 +93,6 @@ struct btrfs_inode { @@ -15328,6 +16000,14 @@ index 195c09e20609..49a92aa65de1 100644 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); +@@ -532,6 +516,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); + struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, ++ struct btrfs_ordered_extent **ordered_extent, + size_t done_before); + + extern const struct dentry_operations btrfs_dentry_operations; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5122ca79f7ea..f42f31f22d13 100644 --- a/fs/btrfs/compression.c @@ -16095,129 +16775,8 @@ index d6304b690ec4..2eb34abf700f 100644 struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); -diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c -index ff2e524d9937..317aeff6c1da 100644 ---- a/fs/btrfs/discard.c -+++ b/fs/btrfs/discard.c -@@ -78,6 +78,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, - static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) - { -+ lockdep_assert_held(&discard_ctl->lock); - if (!btrfs_run_discard_work(discard_ctl)) - return; - -@@ -89,6 +90,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, - BTRFS_DISCARD_DELAY); - block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; - } -+ if (list_empty(&block_group->discard_list)) -+ btrfs_get_block_group(block_group); - - list_move_tail(&block_group->discard_list, - get_discard_list(discard_ctl, block_group)); -@@ -108,8 +111,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, - static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) - { -+ bool queued; -+ - spin_lock(&discard_ctl->lock); - -+ queued = !list_empty(&block_group->discard_list); -+ - if (!btrfs_run_discard_work(discard_ctl)) { - spin_unlock(&discard_ctl->lock); - return; -@@ -121,6 +128,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, - block_group->discard_eligible_time = (ktime_get_ns() + - BTRFS_DISCARD_UNUSED_DELAY); - block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; -+ if (!queued) -+ btrfs_get_block_group(block_group); - list_add_tail(&block_group->discard_list, - &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); - -@@ -131,6 +140,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) - { - bool running = false; -+ bool queued = false; - - 
spin_lock(&discard_ctl->lock); - -@@ -140,7 +150,16 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, - } - - block_group->discard_eligible_time = 0; -+ queued = !list_empty(&block_group->discard_list); - list_del_init(&block_group->discard_list); -+ /* -+ * If the block group is currently running in the discard workfn, we -+ * don't want to deref it, since it's still being used by the workfn. -+ * The workfn will notice this case and deref the block group when it is -+ * finished. -+ */ -+ if (queued && !running) -+ btrfs_put_block_group(block_group); - - spin_unlock(&discard_ctl->lock); - -@@ -214,10 +233,12 @@ static struct btrfs_block_group *peek_discard_list( - if (block_group && now >= block_group->discard_eligible_time) { - if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && - block_group->used != 0) { -- if (btrfs_is_block_group_data_only(block_group)) -+ if (btrfs_is_block_group_data_only(block_group)) { - __add_to_discard_list(discard_ctl, block_group); -- else -+ } else { - list_del_init(&block_group->discard_list); -+ btrfs_put_block_group(block_group); -+ } - goto again; - } - if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { -@@ -511,6 +532,15 @@ static void btrfs_discard_workfn(struct work_struct *work) - spin_lock(&discard_ctl->lock); - discard_ctl->prev_discard = trimmed; - discard_ctl->prev_discard_time = now; -+ /* -+ * If the block group was removed from the discard list while it was -+ * running in this workfn, then we didn't deref it, since this function -+ * still owned that reference. But we set the discard_ctl->block_group -+ * back to NULL, so we can use that condition to know that now we need -+ * to deref the block_group. -+ */ -+ if (discard_ctl->block_group == NULL) -+ btrfs_put_block_group(block_group); - discard_ctl->block_group = NULL; - __btrfs_discard_schedule_work(discard_ctl, now, false); - spin_unlock(&discard_ctl->lock); -@@ -651,8 +681,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) - list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, - bg_list) { - list_del_init(&block_group->bg_list); -- btrfs_put_block_group(block_group); - btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); -+ /* -+ * This put is for the get done by btrfs_mark_bg_unused. -+ * Queueing discard incremented it for discard's reference. 
-+ */ -+ btrfs_put_block_group(block_group); - } - spin_unlock(&fs_info->unused_bgs_lock); - } -@@ -683,6 +717,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) - if (block_group->used == 0) - btrfs_mark_bg_unused(block_group); - spin_lock(&discard_ctl->lock); -+ btrfs_put_block_group(block_group); - } - } - spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c -index 3aa04224315e..b53f0e30ce2b 100644 +index fde40112a259..b53f0e30ce2b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -78,23 +78,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) @@ -16463,17 +17022,7 @@ index 3aa04224315e..b53f0e30ce2b 100644 static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { -@@ -1910,6 +1727,9 @@ static int cleaner_kthread(void *arg) - goto sleep; - } - -+ if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) -+ btrfs_sysfs_feature_update(fs_info); -+ - btrfs_run_delayed_iputs(fs_info); - - again = btrfs_clean_one_deleted_snapshot(fs_info); -@@ -5159,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, +@@ -5162,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, start += fs_info->nodesize; if (!eb) continue; @@ -18091,7 +18640,7 @@ index 031225668434..cd7f2ae515c0 100644 struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c -index af046d22300e..5cc5a1faaef5 100644 +index af046d22300e..ec5c5355906b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, @@ -18103,6 +18652,38 @@ index af046d22300e..5cc5a1faaef5 100644 btrfs_put_ordered_extent(ordered); return -EAGAIN; } +@@ -1465,6 +1465,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) + ssize_t err; + unsigned int ilock_flags = 0; + struct iomap_dio *dio; ++ struct btrfs_ordered_extent *ordered_extent = NULL; + + if (iocb->ki_flags & IOCB_NOWAIT) + ilock_flags |= BTRFS_ILOCK_TRY; +@@ -1526,7 +1527,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) + * got -EFAULT, faulting in the pages before the retry. + */ + from->nofault = true; +- dio = btrfs_dio_write(iocb, from, written); ++ dio = btrfs_dio_write(iocb, from, &ordered_extent, written); + from->nofault = false; + + /* +@@ -1569,6 +1570,14 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) + goto relock; + } + } ++ /* ++ * We can't loop back to btrfs_dio_write, so we can drop the cached ++ * ordered extent. Typically btrfs_dio_iomap_end will run and put the ++ * ordered_extent, but this is needed to clean up in case of an error ++ * path breaking out of iomap_iter before the final iomap_end call. 
++ */ ++ if (ordered_extent) ++ btrfs_put_ordered_extent(ordered_extent); + + /* + * If 'err' is -ENOTBLK or we have not written all data, then it means diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index c667e878ef1a..4d155a48ec59 100644 --- a/fs/btrfs/free-space-tree.c @@ -18116,43 +18697,8 @@ index c667e878ef1a..4d155a48ec59 100644 btrfs_tree_unlock(free_space_root->node); btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); -diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c -index 5553e1f8afe8..31c1648bc0b4 100644 ---- a/fs/btrfs/fs.c -+++ b/fs/btrfs/fs.c -@@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, - name, flag); - } - spin_unlock(&fs_info->super_lock); -+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); - } - } - -@@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, - name, flag); - } - spin_unlock(&fs_info->super_lock); -+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); - } - } - -@@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, - name, flag); - } - spin_unlock(&fs_info->super_lock); -+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); - } - } - -@@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, - name, flag); - } - spin_unlock(&fs_info->super_lock); -+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); - } - } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h -index 37b86acfcbcf..4c477eae6891 100644 +index 3d8156fc8523..4c477eae6891 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,7 @@ @@ -18163,20 +18709,7 @@ index 37b86acfcbcf..4c477eae6891 100644 #include #include #include -@@ -125,6 +126,12 @@ enum { - */ - BTRFS_FS_NO_OVERCOMMIT, - -+ /* -+ * Indicate if we have some features changed, this is mostly for -+ * cleaner thread to update the sysfs interface. 
-+ */ -+ BTRFS_FS_FEATURE_CHANGED, -+ - #if BITS_PER_LONG == 32 - /* Indicate if we have error/warn message printed on 32bit systems */ - BTRFS_FS_32BIT_ERROR, -@@ -742,8 +749,10 @@ struct btrfs_fs_info { +@@ -748,8 +749,10 @@ struct btrfs_fs_info { */ u64 zone_size; @@ -18189,10 +18722,14 @@ index 37b86acfcbcf..4c477eae6891 100644 spinlock_t treelog_bg_lock; u64 treelog_bg; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c -index 98a800b8bd43..44e9acc77a74 100644 +index 98a800b8bd43..6aaa892474be 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c -@@ -84,27 +84,12 @@ struct btrfs_dio_data { +@@ -81,30 +81,16 @@ struct btrfs_dio_data { + struct extent_changeset *data_reserved; + bool data_space_reserved; + bool nocow_done; ++ struct btrfs_ordered_extent *ordered; }; struct btrfs_dio_private { @@ -18222,7 +18759,7 @@ index 98a800b8bd43..44e9acc77a74 100644 }; static struct bio_set btrfs_dio_bioset; -@@ -228,7 +213,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, +@@ -228,7 +214,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; @@ -18231,7 +18768,7 @@ index 98a800b8bd43..44e9acc77a74 100644 struct page *page; if (locked_page) { -@@ -2535,19 +2520,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, +@@ -2535,19 +2521,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } @@ -18251,7 +18788,7 @@ index 98a800b8bd43..44e9acc77a74 100644 /* * Split an extent_map at [start, start + len] * -@@ -2663,19 +2635,19 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, +@@ -2663,19 +2636,19 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, return ret; } @@ -18276,7 +18813,7 @@ index 98a800b8bd43..44e9acc77a74 100644 if (WARN_ON_ONCE(!ordered)) return BLK_STS_IOERR; -@@ -2715,7 +2687,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, +@@ -2715,7 +2688,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, ret = btrfs_split_ordered_extent(ordered, pre, post); if (ret) goto out; @@ -18285,7 +18822,7 @@ index 98a800b8bd43..44e9acc77a74 100644 out: btrfs_put_ordered_extent(ordered); -@@ -2723,75 +2695,6 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, +@@ -2723,75 +2696,6 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, return errno_to_blk_status(ret); } @@ -18361,7 +18898,7 @@ index 98a800b8bd43..44e9acc77a74 100644 /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. 
-@@ -2969,7 +2872,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +@@ -2969,7 +2873,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); @@ -18370,7 +18907,7 @@ index 98a800b8bd43..44e9acc77a74 100644 btrfs_put_ordered_extent(ordered); goto again; } -@@ -3259,15 +3162,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +@@ -3259,15 +3163,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } @@ -18388,7 +18925,7 @@ index 98a800b8bd43..44e9acc77a74 100644 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; -@@ -3474,109 +3375,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of +@@ -3474,109 +3376,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of } /* @@ -18533,7 +19070,7 @@ index 98a800b8bd43..44e9acc77a74 100644 } /* -@@ -4987,7 +4834,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, +@@ -4987,7 +4835,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); @@ -18542,7 +19079,7 @@ index 98a800b8bd43..44e9acc77a74 100644 btrfs_put_ordered_extent(ordered); goto again; } -@@ -5466,8 +5313,6 @@ void btrfs_evict_inode(struct inode *inode) +@@ -5466,8 +5314,6 @@ void btrfs_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; @@ -18551,7 +19088,71 @@ index 98a800b8bd43..44e9acc77a74 100644 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto no_delete; -@@ -7392,7 +7237,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, +@@ -7131,6 +6977,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + } + + static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, ++ struct btrfs_dio_data *dio_data, + const u64 start, + const u64 len, + const u64 orig_start, +@@ -7141,7 +6988,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + const int type) + { + struct extent_map *em = NULL; +- int ret; ++ struct btrfs_ordered_extent *ordered; + + if (type != BTRFS_ORDERED_NOCOW) { + em = create_io_em(inode, start, len, orig_start, block_start, +@@ -7151,18 +6998,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + if (IS_ERR(em)) + goto out; + } +- ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, +- block_len, 0, +- (1 << type) | +- (1 << BTRFS_ORDERED_DIRECT), +- BTRFS_COMPRESS_NONE); +- if (ret) { ++ ordered = btrfs_alloc_ordered_extent(inode, start, len, len, ++ block_start, block_len, 0, ++ (1 << type) | ++ (1 << BTRFS_ORDERED_DIRECT), ++ BTRFS_COMPRESS_NONE); ++ if (IS_ERR(ordered)) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_map_range(inode, start, + start + len - 1, false); + } +- em = ERR_PTR(ret); ++ em = ERR_PTR(PTR_ERR(ordered)); ++ } else { ++ ASSERT(!dio_data->ordered); ++ dio_data->ordered = ordered; + } + out: + +@@ -7170,6 +7020,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + } + + static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, ++ struct btrfs_dio_data *dio_data, + u64 start, u64 len) + { + struct btrfs_root *root = inode->root; +@@ -7185,7 +7036,8 @@ static struct 
extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, + if (ret) + return ERR_PTR(ret); + +- em = btrfs_create_dio_extent(inode, start, ins.offset, start, ++ em = btrfs_create_dio_extent(inode, dio_data, ++ start, ins.offset, start, + ins.objectid, ins.offset, ins.offset, + ins.offset, BTRFS_ORDERED_REGULAR); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); +@@ -7392,7 +7244,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) @@ -18560,7 +19161,51 @@ index 98a800b8bd43..44e9acc77a74 100644 else ret = nowait ? -EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); -@@ -7833,10 +7678,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, +@@ -7530,7 +7382,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + } + space_reserved = true; + +- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, ++ em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); +@@ -7572,7 +7424,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + goto out; + space_reserved = true; + +- em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); ++ em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; +@@ -7676,6 +7528,17 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + } + } + ++ if (dio_data->ordered) { ++ ASSERT(write); ++ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, ++ dio_data->ordered->file_offset, ++ dio_data->ordered->bytes_left); ++ if (IS_ERR(em)) { ++ ret = PTR_ERR(em); ++ goto err; ++ } ++ goto map_iomap; ++ } + memset(dio_data, 0, sizeof(*dio_data)); + + /* +@@ -7817,6 +7680,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + else + free_extent_state(cached_state); + ++map_iomap: + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does +@@ -7833,10 +7697,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; @@ -18571,7 +19216,37 @@ index 98a800b8bd43..44e9acc77a74 100644 free_extent_map(em); return 0; -@@ -7888,267 +7729,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, +@@ -7874,13 +7734,25 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + if (submitted < length) { + pos += submitted; + length -= submitted; +- if (write) +- btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, +- pos, length, false); +- else ++ if (write) { ++ if (submitted == 0) { ++ btrfs_mark_ordered_io_finished(BTRFS_I(inode), ++ NULL, pos, ++ length, false); ++ btrfs_put_ordered_extent(dio_data->ordered); ++ dio_data->ordered = NULL; ++ } ++ } else { + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); ++ } + ret = -ENOTBLK; ++ } else { ++ /* On the last bio, release our cached ordered_extent. 
*/ ++ if (write) { ++ btrfs_put_ordered_extent(dio_data->ordered); ++ dio_data->ordered = NULL; ++ } + } + + if (write) +@@ -7888,267 +7760,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, return ret; } @@ -18599,7 +19274,8 @@ index 98a800b8bd43..44e9acc77a74 100644 -} - -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) --{ ++static void btrfs_dio_end_io(struct btrfs_bio *bbio) + { - struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - - BUG_ON(bio_op(bio) == REQ_OP_WRITE); @@ -18644,8 +19320,7 @@ index 98a800b8bd43..44e9acc77a74 100644 -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset) -+static void btrfs_dio_end_io(struct btrfs_bio *bbio) - { +-{ - return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); -} - @@ -18667,7 +19342,7 @@ index 98a800b8bd43..44e9acc77a74 100644 - - if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, bbio, !err); -- + - if (err) - dip->bio.bi_status = err; - @@ -18676,7 +19351,7 @@ index 98a800b8bd43..44e9acc77a74 100644 - bio_put(bio); - btrfs_dio_private_put(dip); -} - +- -static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, - u64 file_offset, int async_submit) -{ @@ -18797,12 +19472,10 @@ index 98a800b8bd43..44e9acc77a74 100644 - status = errno_to_blk_status(ret); - goto out_err_em; - } - +- - clone_len = min(submit_len, geom.len); - ASSERT(clone_len <= UINT_MAX); -+ btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); -+ bbio->file_offset = file_offset; - +- - /* - * This will never fail as it's passing GPF_NOFS and - * the allocation is backed by btrfs_bioset. @@ -18843,9 +19516,11 @@ index 98a800b8bd43..44e9acc77a74 100644 - if (!raid56) - async_submit = 1; - } -- + - btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); -- ++ btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); ++ bbio->file_offset = file_offset; + - dio_data->submitted += clone_len; - clone_offset += clone_len; - start_sector += clone_len >> 9; @@ -18867,7 +19542,7 @@ index 98a800b8bd43..44e9acc77a74 100644 } static const struct iomap_ops btrfs_dio_iomap_ops = { -@@ -8157,7 +7778,7 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { +@@ -8157,25 +7809,30 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { }; static const struct iomap_dio_ops btrfs_dio_ops = { @@ -18876,7 +19551,34 @@ index 98a800b8bd43..44e9acc77a74 100644 .bio_set = &btrfs_dio_bioset, }; -@@ -8552,7 +8173,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) + ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) + { +- struct btrfs_dio_data data; ++ struct btrfs_dio_data data = { 0 }; + + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); + } + + struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, ++ struct btrfs_ordered_extent **ordered_extent, + size_t done_before) + { +- struct btrfs_dio_data data; ++ struct btrfs_dio_data dio_data = { .ordered = *ordered_extent }; ++ struct iomap_dio *dio; + +- return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, +- IOMAP_DIO_PARTIAL, &data, done_before); ++ dio = __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, ++ IOMAP_DIO_PARTIAL, &dio_data, done_before); ++ if (!IS_ERR_OR_NULL(dio)) ++ *ordered_extent = dio_data.ordered; ++ return dio; + } + + static 
int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +@@ -8552,7 +8209,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); @@ -18885,7 +19587,7 @@ index 98a800b8bd43..44e9acc77a74 100644 btrfs_put_ordered_extent(ordered); goto again; } -@@ -8850,7 +8471,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) +@@ -8850,7 +8507,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_log_commit = 0; spin_lock_init(&ei->lock); @@ -18893,7 +19595,7 @@ index 98a800b8bd43..44e9acc77a74 100644 ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, -@@ -8870,7 +8490,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) +@@ -8870,7 +8526,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); @@ -18901,7 +19603,7 @@ index 98a800b8bd43..44e9acc77a74 100644 atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); -@@ -8994,7 +8613,7 @@ int __init btrfs_init_cachep(void) +@@ -8994,7 +8649,7 @@ int __init btrfs_init_cachep(void) goto fail; if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, @@ -18910,7 +19612,7 @@ index 98a800b8bd43..44e9acc77a74 100644 BIOSET_NEED_BVECS)) goto fail; -@@ -10289,65 +9908,13 @@ struct btrfs_encoded_read_private { +@@ -10289,65 +9944,13 @@ struct btrfs_encoded_read_private { wait_queue_head_t wait; atomic_t pending; blk_status_t status; @@ -18977,7 +19679,7 @@ index 98a800b8bd43..44e9acc77a74 100644 /* * The memory barrier implied by the atomic_dec_return() here * pairs with the memory barrier implied by the -@@ -10356,11 +9923,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) +@@ -10356,11 +9959,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) * write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ @@ -18990,7 +19692,7 @@ index 98a800b8bd43..44e9acc77a74 100644 bio_put(&bbio->bio); } -@@ -10368,47 +9934,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, +@@ -10368,47 +9970,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages) { @@ -19041,7 +19743,7 @@ index 98a800b8bd43..44e9acc77a74 100644 btrfs_encoded_read_endio, &priv); bio->bi_iter.bi_sector = -@@ -10417,14 +9962,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, +@@ -10417,14 +9998,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!bytes || bio_add_page(bio, pages[i], bytes, 0) < bytes) { @@ -19058,7 +19760,7 @@ index 98a800b8bd43..44e9acc77a74 100644 bio = NULL; continue; } -@@ -10435,7 +9974,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, +@@ -10435,7 +10010,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, } } @@ -19066,7 +19768,7 @@ index 98a800b8bd43..44e9acc77a74 100644 if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); /* See btrfs_encoded_read_endio() for ordering. 
*/ -@@ -10995,9 +10533,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, +@@ -10995,9 +10569,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, return 0; max_pages = sis->max - bsi->nr_pages; @@ -19456,10 +20158,89 @@ index 190af1f698d9..8c516ee58ff9 100644 __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c -index 57d8c72737e1..6c24b69e2d0a 100644 +index 57d8c72737e1..1848d0d1a9c4 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c -@@ -616,7 +616,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) +@@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + * @compress_type: Compression algorithm used for data. + * + * Most of these parameters correspond to &struct btrfs_file_extent_item. The +- * tree is given a single reference on the ordered extent that was inserted. ++ * tree is given a single reference on the ordered extent that was inserted, and ++ * the returned pointer is given a second reference. + * +- * Return: 0 or -ENOMEM. ++ * Return: the new ordered extent or ERR_PTR(-ENOMEM). + */ +-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, +- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, +- u64 disk_num_bytes, u64 offset, unsigned flags, +- int compress_type) ++struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( ++ struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type) + { + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; +@@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + /* For nocow write, we can release the qgroup rsv right now */ + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); + if (ret < 0) +- return ret; ++ return ERR_PTR(ret); + ret = 0; + } else { + /* +@@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + */ + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + if (ret < 0) +- return ret; ++ return ERR_PTR(ret); + } + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); + if (!entry) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + entry->file_offset = file_offset; + entry->num_bytes = num_bytes; +@@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); + ++ /* One ref for the returned entry to match semantics of lookup. */ ++ refcount_inc(&entry->refs); ++ ++ return entry; ++} ++ ++/* ++ * Add a new btrfs_ordered_extent for the range, but drop the reference instead ++ * of returning it to the caller. 
++ */ ++int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type) ++{ ++ struct btrfs_ordered_extent *ordered; ++ ++ ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, ++ ram_bytes, disk_bytenr, ++ disk_num_bytes, offset, flags, ++ compress_type); ++ ++ if (IS_ERR(ordered)) ++ return PTR_ERR(ordered); ++ btrfs_put_ordered_extent(ordered); ++ + return 0; + } + +@@ -616,7 +644,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); @@ -19468,7 +20249,7 @@ index 57d8c72737e1..6c24b69e2d0a 100644 complete(&ordered->completion); } -@@ -716,13 +716,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, +@@ -716,13 +744,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, } /* @@ -19486,7 +20267,7 @@ index 57d8c72737e1..6c24b69e2d0a 100644 { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; -@@ -744,12 +743,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) +@@ -744,12 +771,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); @@ -19503,7 +20284,7 @@ index 57d8c72737e1..6c24b69e2d0a 100644 } /* -@@ -800,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +@@ -800,7 +825,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } @@ -19512,7 +20293,7 @@ index 57d8c72737e1..6c24b69e2d0a 100644 end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't -@@ -1061,7 +1058,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, +@@ -1061,7 +1086,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent(&inode->io_tree, start, end, cachedp); @@ -19522,7 +20303,7 @@ index 57d8c72737e1..6c24b69e2d0a 100644 } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h -index 89f82b78f590..eb40cb39f842 100644 +index 89f82b78f590..18007f9c00ad 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -157,7 +157,6 @@ struct btrfs_ordered_extent { @@ -19533,7 +20314,21 @@ index 89f82b78f590..eb40cb39f842 100644 }; static inline void -@@ -187,7 +186,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, +@@ -179,15 +178,20 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size); ++struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( ++ struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type); + int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, +- u64 disk_num_bytes, u64 offset, unsigned flags, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type); + void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct 
btrfs_inode *inode, u64 file_offset); @@ -20182,7 +20977,7 @@ index 31ec4a7658ce..ef13a9d4e370 100644 struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c -index 52b346795f66..69c93ae333f6 100644 +index a5d026041be4..69c93ae333f6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -229,7 +229,7 @@ struct full_stripe_lock { @@ -20194,77 +20989,6 @@ index 52b346795f66..69c93ae333f6 100644 struct scrub_page_private { u64 logical; }; -@@ -2053,20 +2053,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) - * a) don't have an extent buffer and - * b) the page is already kmapped - */ -- if (sblock->logical != btrfs_stack_header_bytenr(h)) -+ if (sblock->logical != btrfs_stack_header_bytenr(h)) { - sblock->header_error = 1; -- -- if (sector->generation != btrfs_stack_header_generation(h)) { -- sblock->header_error = 1; -- sblock->generation_error = 1; -+ btrfs_warn_rl(fs_info, -+ "tree block %llu mirror %u has bad bytenr, has %llu want %llu", -+ sblock->logical, sblock->mirror_num, -+ btrfs_stack_header_bytenr(h), -+ sblock->logical); -+ goto out; - } - -- if (!scrub_check_fsid(h->fsid, sector)) -+ if (!scrub_check_fsid(h->fsid, sector)) { - sblock->header_error = 1; -+ btrfs_warn_rl(fs_info, -+ "tree block %llu mirror %u has bad fsid, has %pU want %pU", -+ sblock->logical, sblock->mirror_num, -+ h->fsid, sblock->dev->fs_devices->fsid); -+ goto out; -+ } - -- if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, -- BTRFS_UUID_SIZE)) -+ if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { - sblock->header_error = 1; -+ btrfs_warn_rl(fs_info, -+ "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", -+ sblock->logical, sblock->mirror_num, -+ h->chunk_tree_uuid, fs_info->chunk_tree_uuid); -+ goto out; -+ } - - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); -@@ -2079,9 +2092,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) - } - - crypto_shash_final(shash, calculated_csum); -- if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) -+ if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { - sblock->checksum_error = 1; -+ btrfs_warn_rl(fs_info, -+ "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, -+ sblock->logical, sblock->mirror_num, -+ CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), -+ CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); -+ goto out; -+ } -+ -+ if (sector->generation != btrfs_stack_header_generation(h)) { -+ sblock->header_error = 1; -+ sblock->generation_error = 1; -+ btrfs_warn_rl(fs_info, -+ "tree block %llu mirror %u has bad generation, has %llu want %llu", -+ sblock->logical, sblock->mirror_num, -+ btrfs_stack_header_generation(h), -+ sector->generation); -+ } - -+out: - return sblock->header_error || sblock->checksum_error; - } - diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d50182b6deec..e5c963bb873d 100644 --- a/fs/btrfs/send.c @@ -21491,7 +22215,7 @@ index 433ce221dc5c..581845bc206a 100644 if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c -index 45615ce36498..8c5efa5813b3 100644 +index 108aa3876186..8c5efa5813b3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -702,7 +702,7 @@ static void release_raid_kobj(struct kobject *kobj) @@ -21548,65 +22272,6 @@ index 45615ce36498..8c5efa5813b3 100644 .sysfs_ops = 
&kobj_sysfs_ops, .release = qgroup_release, .default_groups = qgroup_groups, -@@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, - * Change per-fs features in /sys/fs/btrfs/UUID/features to match current - * values in superblock. Call after any changes to incompat/compat_ro flags - */ --void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, -- u64 bit, enum btrfs_feature_set set) -+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) - { -- struct btrfs_fs_devices *fs_devs; - struct kobject *fsid_kobj; -- u64 __maybe_unused features; -- int __maybe_unused ret; -+ int ret; - - if (!fs_info) - return; - -- /* -- * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not -- * safe when called from some contexts (eg. balance) -- */ -- features = get_features(fs_info, set); -- ASSERT(bit & supported_feature_masks[set]); -- -- fs_devs = fs_info->fs_devices; -- fsid_kobj = &fs_devs->fsid_kobj; -- -+ fsid_kobj = &fs_info->fs_devices->fsid_kobj; - if (!fsid_kobj->state_initialized) - return; - -- /* -- * FIXME: this is too heavy to update just one value, ideally we'd like -- * to use sysfs_update_group but some refactoring is needed first. -- */ -- sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); -- ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); -+ ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); -+ if (ret < 0) -+ btrfs_warn(fs_info, -+ "failed to update /sys/fs/btrfs/%pU/features: %d", -+ fs_info->fs_devices->fsid, ret); - } - - int __init btrfs_init_sysfs(void) -diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h -index bacef43f7267..86c7eef12873 100644 ---- a/fs/btrfs/sysfs.h -+++ b/fs/btrfs/sysfs.h -@@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device); - int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); - void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); - void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); --void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, -- u64 bit, enum btrfs_feature_set set); -+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info); - void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); - - int __init btrfs_init_sysfs(void); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c5b3a631bf4f..f2f2e11dac4c 100644 --- a/fs/btrfs/tests/extent-map-tests.c @@ -21621,22 +22286,10 @@ index c5b3a631bf4f..f2f2e11dac4c 100644 if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c -index b8c52e89688c..18329ebcb1cb 100644 +index 8f8d0fce6e4a..18329ebcb1cb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c -@@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) - wake_up(&fs_info->transaction_wait); - btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); - -+ /* If we have features changed, wake up the cleaner to update sysfs. 
*/ -+ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && -+ fs_info->cleaner_kthread) -+ wake_up_process(fs_info->cleaner_kthread); -+ - ret = btrfs_write_and_wait_transaction(trans); - if (ret) { - btrfs_handle_fs_error(fs_info, ret, -@@ -2604,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) +@@ -2609,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 0 : 1; } @@ -22522,6 +23175,1091 @@ index f43990985d80..c0570d35fea2 100644 static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) { return true; +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index 9de1c9d1a13d..3559ea6b0781 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3251,7 +3251,7 @@ static int ext4_split_extent_at(handle_t *handle, + ext4_ext_mark_unwritten(ex2); + + err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); +- if (err != -ENOSPC && err != -EDQUOT) ++ if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) + goto out; + + if (EXT4_EXT_MAY_ZEROOUT & split_flag) { +diff --git a/fs/ext4/file.c b/fs/ext4/file.c +index 7ac0a81bd371..6e9f198ecacf 100644 +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len) + return false; + } + +-/* Is IO overwriting allocated and initialized blocks? */ +-static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) ++/* Is IO overwriting allocated or initialized blocks? */ ++static bool ext4_overwrite_io(struct inode *inode, ++ loff_t pos, loff_t len, bool *unwritten) + { + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; +@@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) + blklen = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); ++ if (err != blklen) ++ return false; + /* + * 'err==len' means that all of the blocks have been preallocated, +- * regardless of whether they have been initialized or not. To exclude +- * unwritten extents, we need to check m_flags. ++ * regardless of whether they have been initialized or not. We need to ++ * check m_flags to distinguish the unwritten extents. + */ +- return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); ++ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); ++ return true; + } + + static ssize_t ext4_generic_write_checks(struct kiocb *iocb, +@@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { + * - For extending writes case we don't take the shared lock, since it requires + * updating inode i_disksize and/or orphan handling with exclusive lock. + * +- * - shared locking will only be true mostly with overwrites. Otherwise we will +- * switch to exclusive i_rwsem lock. ++ * - shared locking will only be true mostly with overwrites, including ++ * initialized blocks and unwritten blocks. For overwrite unwritten blocks ++ * we protect splitting extents by i_data_sem in ext4_inode_info, so we can ++ * also release exclusive i_rwsem lock. ++ * ++ * - Otherwise we will switch to exclusive i_rwsem lock. + */ + static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, +- bool *ilock_shared, bool *extend) ++ bool *ilock_shared, bool *extend, ++ bool *unwritten) + { + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); +@@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + * in file_modified(). 
+ */ + if (*ilock_shared && (!IS_NOSEC(inode) || *extend || +- !ext4_overwrite_io(inode, offset, count))) { ++ !ext4_overwrite_io(inode, offset, count, unwritten))) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; +@@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) + loff_t offset = iocb->ki_pos; + size_t count = iov_iter_count(from); + const struct iomap_ops *iomap_ops = &ext4_iomap_ops; +- bool extend = false, unaligned_io = false; ++ bool extend = false, unaligned_io = false, unwritten = false; + bool ilock_shared = true; + + /* +@@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) + return ext4_buffered_write_iter(iocb, from); + } + +- ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); ++ ret = ext4_dio_write_checks(iocb, from, ++ &ilock_shared, &extend, &unwritten); + if (ret <= 0) + return ret; + +@@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) + ext4_journal_stop(handle); + } + +- if (ilock_shared) ++ if (ilock_shared && !unwritten) + iomap_ops = &ext4_iomap_overwrite_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, + (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0, +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 9d9f414f99fe..24128f6cd1b0 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, + new_size); + } + +-static int __ext4_journalled_writepage(struct page *page, unsigned int len); + static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); + +@@ -1005,30 +1004,6 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, + return ret; + } + +-/* +- * To preserve ordering, it is essential that the hole instantiation and +- * the data write be encapsulated in a single transaction. We cannot +- * close off a transaction and start a new one between the ext4_get_block() +- * and the commit_write(). So doing the jbd2_journal_start at the start of +- * prepare_write() is the right place. +- * +- * Also, this function can nest inside ext4_writepage(). In that case, we +- * *know* that ext4_writepage() has generated enough buffer credits to do the +- * whole page. So we won't block on the journal in that case, which is good, +- * because the caller may be PF_MEMALLOC. +- * +- * By accident, ext4 can be reentered when a transaction is open via +- * quota file writes. If we were to commit the transaction while thus +- * reentered, there can be a deadlock - we would be holding a quota +- * lock, and the commit would never complete if another thread had a +- * transaction open and was blocking on the quota lock - a ranking +- * violation. +- * +- * So what we do is to rely on the fact that jbd2_journal_stop/journal_start +- * will _not_ run commit under these circumstances because handle->h_ref +- * is elevated. We'll still have enough credits for the tiny quotafile +- * write. +- */ + int do_journal_get_write_access(handle_t *handle, struct inode *inode, + struct buffer_head *bh) + { +@@ -1149,6 +1124,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, + } + #endif + ++/* ++ * To preserve ordering, it is essential that the hole instantiation and ++ * the data write be encapsulated in a single transaction. We cannot ++ * close off a transaction and start a new one between the ext4_get_block() ++ * and the ext4_write_end(). 
So doing the jbd2_journal_start at the start of ++ * ext4_write_begin() is the right place. ++ */ + static int ext4_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, + struct page **pagep, void **fsdata) +@@ -1649,12 +1631,6 @@ static void ext4_print_free_blocks(struct inode *inode) + return; + } + +-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, +- struct buffer_head *bh) +-{ +- return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); +-} +- + /* + * ext4_insert_delayed_block - adds a delayed block to the extents status + * tree, incrementing the reserved cluster/block +@@ -1887,216 +1863,10 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + return 0; + } + +-static int __ext4_journalled_writepage(struct page *page, +- unsigned int len) ++static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) + { +- struct address_space *mapping = page->mapping; +- struct inode *inode = mapping->host; +- handle_t *handle = NULL; +- int ret = 0, err = 0; +- int inline_data = ext4_has_inline_data(inode); +- struct buffer_head *inode_bh = NULL; +- loff_t size; +- +- ClearPageChecked(page); +- +- if (inline_data) { +- BUG_ON(page->index != 0); +- BUG_ON(len > ext4_get_max_inline_size(inode)); +- inode_bh = ext4_journalled_write_inline_data(inode, len, page); +- if (inode_bh == NULL) +- goto out; +- } +- /* +- * We need to release the page lock before we start the +- * journal, so grab a reference so the page won't disappear +- * out from under us. +- */ +- get_page(page); +- unlock_page(page); +- +- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, +- ext4_writepage_trans_blocks(inode)); +- if (IS_ERR(handle)) { +- ret = PTR_ERR(handle); +- put_page(page); +- goto out_no_pagelock; +- } +- BUG_ON(!ext4_handle_valid(handle)); +- +- lock_page(page); +- put_page(page); +- size = i_size_read(inode); +- if (page->mapping != mapping || page_offset(page) > size) { +- /* The page got truncated from under us */ +- ext4_journal_stop(handle); +- ret = 0; +- goto out; +- } +- +- if (inline_data) { +- ret = ext4_mark_inode_dirty(handle, inode); +- } else { +- struct buffer_head *page_bufs = page_buffers(page); +- +- if (page->index == size >> PAGE_SHIFT) +- len = size & ~PAGE_MASK; +- else +- len = PAGE_SIZE; +- +- ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, +- NULL, do_journal_get_write_access); +- +- err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, +- NULL, write_end_fn); +- } +- if (ret == 0) +- ret = err; +- err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); +- if (ret == 0) +- ret = err; +- EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; +- err = ext4_journal_stop(handle); +- if (!ret) +- ret = err; +- +- ext4_set_inode_state(inode, EXT4_STATE_JDATA); +-out: ++ mpd->first_page++; + unlock_page(page); +-out_no_pagelock: +- brelse(inode_bh); +- return ret; +-} +- +-/* +- * Note that we don't need to start a transaction unless we're journaling data +- * because we should have holes filled from ext4_page_mkwrite(). We even don't +- * need to file the inode to the transaction's list in ordered mode because if +- * we are writing back data added by write(), the inode is already there and if +- * we are writing back data modified via mmap(), no one guarantees in which +- * transaction the data will hit the disk. 
In case we are journaling data, we +- * cannot start transaction directly because transaction start ranks above page +- * lock so we have to do some magic. +- * +- * This function can get called via... +- * - ext4_writepages after taking page lock (have journal handle) +- * - journal_submit_inode_data_buffers (no journal handle) +- * - shrink_page_list via the kswapd/direct reclaim (no journal handle) +- * - grab_page_cache when doing write_begin (have journal handle) +- * +- * We don't do any block allocation in this function. If we have page with +- * multiple blocks we need to write those buffer_heads that are mapped. This +- * is important for mmaped based write. So if we do with blocksize 1K +- * truncate(f, 1024); +- * a = mmap(f, 0, 4096); +- * a[0] = 'a'; +- * truncate(f, 4096); +- * we have in the page first buffer_head mapped via page_mkwrite call back +- * but other buffer_heads would be unmapped but dirty (dirty done via the +- * do_wp_page). So writepage should write the first block. If we modify +- * the mmap area beyond 1024 we will again get a page_fault and the +- * page_mkwrite callback will do the block allocation and mark the +- * buffer_heads mapped. +- * +- * We redirty the page if we have any buffer_heads that is either delay or +- * unwritten in the page. +- * +- * We can get recursively called as show below. +- * +- * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> +- * ext4_writepage() +- * +- * But since we don't do any block allocation we should not deadlock. +- * Page also have the dirty flag cleared so we don't get recurive page_lock. +- */ +-static int ext4_writepage(struct page *page, +- struct writeback_control *wbc) +-{ +- struct folio *folio = page_folio(page); +- int ret = 0; +- loff_t size; +- unsigned int len; +- struct buffer_head *page_bufs = NULL; +- struct inode *inode = page->mapping->host; +- struct ext4_io_submit io_submit; +- +- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { +- folio_invalidate(folio, 0, folio_size(folio)); +- folio_unlock(folio); +- return -EIO; +- } +- +- trace_ext4_writepage(page); +- size = i_size_read(inode); +- if (page->index == size >> PAGE_SHIFT && +- !ext4_verity_in_progress(inode)) +- len = size & ~PAGE_MASK; +- else +- len = PAGE_SIZE; +- +- /* Should never happen but for bugs in other kernel subsystems */ +- if (!page_has_buffers(page)) { +- ext4_warning_inode(inode, +- "page %lu does not have buffers attached", page->index); +- ClearPageDirty(page); +- unlock_page(page); +- return 0; +- } +- +- page_bufs = page_buffers(page); +- /* +- * We cannot do block allocation or other extent handling in this +- * function. If there are buffers needing that, we have to redirty +- * the page. But we may reach here when we do a journal commit via +- * journal_submit_inode_data_buffers() and in that case we must write +- * allocated buffers to achieve data=ordered mode guarantees. +- * +- * Also, if there is only one buffer per page (the fs block +- * size == the page size), if one buffer needs block +- * allocation or needs to modify the extent tree to clear the +- * unwritten flag, we know that the page can't be written at +- * all, so we might as well refuse the write immediately. +- * Unfortunately if the block size != page size, we can't as +- * easily detect this case using ext4_walk_page_buffers(), but +- * for the extremely common case, this is an optimization that +- * skips a useless round trip through ext4_bio_write_page(). 
+- */ +- if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, +- ext4_bh_delay_or_unwritten)) { +- redirty_page_for_writepage(wbc, page); +- if ((current->flags & PF_MEMALLOC) || +- (inode->i_sb->s_blocksize == PAGE_SIZE)) { +- /* +- * For memory cleaning there's no point in writing only +- * some buffers. So just bail out. Warn if we came here +- * from direct reclaim. +- */ +- WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) +- == PF_MEMALLOC); +- unlock_page(page); +- return 0; +- } +- } +- +- if (PageChecked(page) && ext4_should_journal_data(inode)) +- /* +- * It's mmapped pagecache. Add buffers and journal it. There +- * doesn't seem much point in redirtying the page here. +- */ +- return __ext4_journalled_writepage(page, len); +- +- ext4_io_submit_init(&io_submit, wbc); +- io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); +- if (!io_submit.io_end) { +- redirty_page_for_writepage(wbc, page); +- unlock_page(page); +- return -ENOMEM; +- } +- ret = ext4_bio_write_page(&io_submit, page, len); +- ext4_io_submit(&io_submit); +- /* Drop io_end reference we got from init */ +- ext4_put_io_end_defer(io_submit.io_end); +- return ret; + } + + static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +@@ -2129,7 +1899,6 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) + err = ext4_bio_write_page(&mpd->io_submit, page, len); + if (!err) + mpd->wbc->nr_to_write--; +- mpd->first_page++; + + return err; + } +@@ -2243,6 +2012,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, + err = mpage_submit_page(mpd, head->b_page); + if (err < 0) + return err; ++ mpage_page_done(mpd, head->b_page); + } + if (lblk >= blocks) { + mpd->scanned_until_end = 1; +@@ -2374,6 +2144,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) + err = mpage_submit_page(mpd, page); + if (err < 0) + goto out; ++ mpage_page_done(mpd, page); + } + folio_batch_release(&fbatch); + } +@@ -2572,6 +2343,50 @@ static bool ext4_page_nomap_can_writeout(struct page *page) + return false; + } + ++static int ext4_journal_page_buffers(handle_t *handle, struct page *page, ++ int len) ++{ ++ struct buffer_head *page_bufs = page_buffers(page); ++ struct inode *inode = page->mapping->host; ++ int ret, err; ++ ++ ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, ++ NULL, do_journal_get_write_access); ++ err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, ++ NULL, write_end_fn); ++ if (ret == 0) ++ ret = err; ++ err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); ++ if (ret == 0) ++ ret = err; ++ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; ++ ++ ext4_set_inode_state(inode, EXT4_STATE_JDATA); ++ ++ return ret; ++} ++ ++static int mpage_journal_page_buffers(handle_t *handle, ++ struct mpage_da_data *mpd, ++ struct page *page) ++{ ++ struct inode *inode = mpd->inode; ++ loff_t size = i_size_read(inode); ++ int len; ++ ++ ClearPageChecked(page); ++ clear_page_dirty_for_io(page); ++ mpd->wbc->nr_to_write--; ++ ++ if (page->index == size >> PAGE_SHIFT && ++ !ext4_verity_in_progress(inode)) ++ len = size & ~PAGE_MASK; ++ else ++ len = PAGE_SIZE; ++ ++ return ext4_journal_page_buffers(handle, page, len); ++} ++ + /* + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages + * needing mapping, submit mapped pages +@@ -2597,7 +2412,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) + struct address_space *mapping = mpd->inode->i_mapping; + struct pagevec 
pvec; + unsigned int nr_pages; +- long left = mpd->wbc->nr_to_write; + pgoff_t index = mpd->first_page; + pgoff_t end = mpd->last_page; + xa_mark_t tag; +@@ -2605,12 +2419,20 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) + int blkbits = mpd->inode->i_blkbits; + ext4_lblk_t lblk; + struct buffer_head *head; ++ handle_t *handle = NULL; ++ int bpp = ext4_journal_blocks_per_page(mpd->inode); + + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + ++ if (ext4_should_journal_data(mpd->inode)) { ++ handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, ++ bpp); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ } + pagevec_init(&pvec); + mpd->map.m_len = 0; + mpd->next_page = index; +@@ -2631,13 +2453,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) + * newly appeared dirty pages, but have not synced all + * of the old dirty pages. + */ +- if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) ++ if (mpd->wbc->sync_mode == WB_SYNC_NONE && ++ mpd->wbc->nr_to_write <= ++ mpd->map.m_len >> (PAGE_SHIFT - blkbits)) + goto out; + + /* If we can't merge this page, we are done. */ + if (mpd->map.m_len > 0 && mpd->next_page != page->index) + goto out; + ++ if (handle) { ++ err = ext4_journal_ensure_credits(handle, bpp, ++ 0); ++ if (err < 0) ++ goto out; ++ } ++ + lock_page(page); + /* + * If the page is no longer dirty, or its mapping no +@@ -2677,18 +2508,30 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) + mpd->first_page = page->index; + mpd->next_page = page->index + 1; + /* +- * Writeout for transaction commit where we cannot +- * modify metadata is simple. Just submit the page. ++ * Writeout when we cannot modify metadata is simple. ++ * Just submit the page. For data=journal mode we ++ * first handle writeout of the page for checkpoint and ++ * only after that handle delayed page dirtying. This ++ * is crutial so that forcing a transaction commit and ++ * then calling filemap_write_and_wait() guarantees ++ * current state of data is in its final location. Such ++ * sequence is used for example by insert/collapse ++ * range operations before discarding the page cache. + */ + if (!mpd->can_map) { + if (ext4_page_nomap_can_writeout(page)) { + err = mpage_submit_page(mpd, page); + if (err < 0) + goto out; +- } else { +- unlock_page(page); +- mpd->first_page++; + } ++ /* Pending dirtying of journalled data? 
*/ ++ if (PageChecked(page)) { ++ err = mpage_journal_page_buffers(handle, ++ mpd, page); ++ if (err < 0) ++ goto out; ++ } ++ mpage_page_done(mpd, page); + } else { + /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << +@@ -2700,24 +2543,21 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) + goto out; + err = 0; + } +- left--; + } + pagevec_release(&pvec); + cond_resched(); + } + mpd->scanned_until_end = 1; ++ if (handle) ++ ext4_journal_stop(handle); + return 0; + out: + pagevec_release(&pvec); ++ if (handle) ++ ext4_journal_stop(handle); + return err; + } + +-static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, +- void *data) +-{ +- return ext4_writepage(page, wbc); +-} +- + static int ext4_do_writepages(struct mpage_da_data *mpd) + { + struct writeback_control *wbc = mpd->wbc; +@@ -2743,13 +2583,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + goto out_writepages; + +- if (ext4_should_journal_data(inode)) { +- blk_start_plug(&plug); +- ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); +- blk_finish_plug(&plug); +- goto out_writepages; +- } +- + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that +@@ -2784,6 +2617,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) + ext4_journal_stop(handle); + } + ++ /* ++ * data=journal mode does not do delalloc so we just need to writeout / ++ * journal already mapped buffers ++ */ ++ if (ext4_should_journal_data(inode)) ++ mpd->can_map = 0; ++ + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert up to one extent per block in +@@ -3160,9 +3000,8 @@ static int ext4_da_write_end(struct file *file, + * i_disksize since writeback will push i_disksize upto i_size + * eventually. If the end of the current write is > i_size and + * inside an allocated block (ext4_da_should_update_i_disksize() +- * check), we need to update i_disksize here as neither +- * ext4_writepage() nor certain ext4_writepages() paths not +- * allocating blocks update i_disksize. ++ * check), we need to update i_disksize here as certain ++ * ext4_writepages() paths not allocating blocks update i_disksize. + * + * Note that we defer inode dirtying to generic_write_end() / + * ext4_da_write_inline_data_end(). +@@ -3687,24 +3526,26 @@ const struct iomap_ops ext4_iomap_report_ops = { + }; + + /* +- * Whenever the folio is being dirtied, corresponding buffers should already +- * be attached to the transaction (we take care of this in ext4_page_mkwrite() +- * and ext4_write_begin()). However we cannot move buffers to dirty transaction +- * lists here because ->dirty_folio is called under VFS locks and the folio +- * is not necessarily locked. +- * +- * We cannot just dirty the folio and leave attached buffers clean, because the +- * buffers' dirty state is "definitive". We cannot just set the buffers dirty +- * or jbddirty because all the journalling code will explode. +- * +- * So what we do is to mark the folio "pending dirty" and next time writepage +- * is called, propagate that into the buffers appropriately. ++ * For data=journal mode, folio should be marked dirty only when it was ++ * writeably mapped. When that happens, it was already attached to the ++ * transaction and marked as jbddirty (we take care of this in ++ * ext4_page_mkwrite()). 
On transaction commit, we writeprotect page mappings ++ * so we should have nothing to do here, except for the case when someone ++ * had the page pinned and dirtied the page through this pin (e.g. by doing ++ * direct IO to it). In that case we'd need to attach buffers here to the ++ * transaction but we cannot due to lock ordering. We cannot just dirty the ++ * folio and leave attached buffers clean, because the buffers' dirty state is ++ * "definitive". We cannot just set the buffers dirty or jbddirty because all ++ * the journalling code will explode. So what we do is to mark the folio ++ * "pending dirty" and next time ext4_writepages() is called, attach buffers ++ * to the transaction appropriately. + */ + static bool ext4_journalled_dirty_folio(struct address_space *mapping, + struct folio *folio) + { + WARN_ON_ONCE(!folio_buffers(folio)); +- folio_set_checked(folio); ++ if (folio_maybe_dma_pinned(folio)) ++ folio_set_checked(folio); + return filemap_dirty_folio(mapping, folio); + } + +@@ -4872,13 +4713,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + goto bad_inode; + raw_inode = ext4_raw_inode(&iloc); + +- if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { +- ext4_error_inode(inode, function, line, 0, +- "iget: root inode unallocated"); +- ret = -EFSCORRUPTED; +- goto bad_inode; +- } +- + if ((flags & EXT4_IGET_HANDLE) && + (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { + ret = -ESTALE; +@@ -4951,11 +4785,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { +- if ((inode->i_mode == 0 || ++ if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && + ino != EXT4_BOOT_LOADER_INO) { +- /* this inode is deleted */ +- ret = -ESTALE; ++ /* this inode is deleted or unallocated */ ++ if (flags & EXT4_IGET_SPECIAL) { ++ ext4_error_inode(inode, function, line, 0, ++ "iget: special inode unallocated"); ++ ret = -EFSCORRUPTED; ++ } else ++ ret = -ESTALE; + goto bad_inode; + } + /* The only unlinked inodes we let through here have +@@ -5382,7 +5221,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) + * If the folio is fully truncated, we don't need to wait for any commit + * (and we even should not as __ext4_journalled_invalidate_folio() may + * strip all buffers from the folio but keep the folio dirty which can then +- * confuse e.g. concurrent ext4_writepage() seeing dirty folio without ++ * confuse e.g. concurrent ext4_writepages() seeing dirty folio without + * buffers). Also we don't need to wait for any commit if all buffers in + * the folio remain valid. This is most beneficial for the common case of + * blocksize == PAGESIZE. 
+@@ -5788,7 +5627,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; + int idxblocks; +- int ret = 0; ++ int ret; + + /* + * How many index blocks need to touch to map @lblocks logical blocks +@@ -6320,18 +6159,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) + err = __block_write_begin(page, 0, len, ext4_get_block); + if (!err) { + ret = VM_FAULT_SIGBUS; +- if (ext4_walk_page_buffers(handle, inode, +- page_buffers(page), 0, len, NULL, +- do_journal_get_write_access)) +- goto out_error; +- if (ext4_walk_page_buffers(handle, inode, +- page_buffers(page), 0, len, NULL, +- write_end_fn)) +- goto out_error; +- if (ext4_jbd2_inode_add_write(handle, inode, +- page_offset(page), len)) ++ if (ext4_journal_page_buffers(handle, page, len)) + goto out_error; +- ext4_set_inode_state(inode, EXT4_STATE_JDATA); + } else { + unlock_page(page); + } +diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c +index 8067ccda34e4..2e8c34036313 100644 +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -155,9 +155,6 @@ static int ext4_update_backup_sb(struct super_block *sb, + set_buffer_uptodate(bh); + unlock_buffer(bh); + +- if (err) +- goto out_bh; +- + if (handle) { + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index dd28453d6ea3..270fbcba75b6 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -3872,9 +3872,16 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) + goto end_rename; + } ++ /* ++ * We need to protect against old.inode directory getting ++ * converted from inline directory format into a normal one. ++ */ ++ inode_lock_nested(old.inode, I_MUTEX_NONDIR2); + retval = ext4_rename_dir_prepare(handle, &old); +- if (retval) ++ if (retval) { ++ inode_unlock(old.inode); + goto end_rename; ++ } + } + /* + * If we're renaming a file within an inline_data dir and adding or +@@ -4006,6 +4013,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + } else { + ext4_journal_stop(handle); + } ++ if (old.dir_bh) ++ inode_unlock(old.inode); + release_bh: + brelse(old.dir_bh); + brelse(old.bh); +diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c +index beaec6d81074..3bc7c7c5b99d 100644 +--- a/fs/ext4/page-io.c ++++ b/fs/ext4/page-io.c +@@ -500,7 +500,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, + + /* Nothing to submit? Just unlock the page... */ + if (!nr_to_submit) +- goto unlock; ++ return 0; + + bh = head = page_buffers(page); + +@@ -548,7 +548,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, + } + bh = bh->b_this_page; + } while (bh != head); +- goto unlock; ++ ++ return ret; + } + } + +@@ -564,7 +565,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, + io_submit_add_bh(io, inode, + bounce_page ? 
bounce_page : page, bh); + } while ((bh = bh->b_this_page) != head); +-unlock: +- unlock_page(page); +- return ret; ++ ++ return 0; + } +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index c81fa0fa9901..2192b4111442 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -4751,7 +4751,6 @@ static int ext4_group_desc_init(struct super_block *sb, + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned int db_count; + ext4_fsblk_t block; +- int ret; + int i; + + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / +@@ -4791,8 +4790,7 @@ static int ext4_group_desc_init(struct super_block *sb, + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); + sbi->s_gdb_count = i; +- ret = PTR_ERR(bh); +- goto out; ++ return PTR_ERR(bh); + } + rcu_read_lock(); + rcu_dereference(sbi->s_group_desc)[i] = bh; +@@ -4801,13 +4799,10 @@ static int ext4_group_desc_init(struct super_block *sb, + sbi->s_gdb_count = db_count; + if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); +- ret = -EFSCORRUPTED; +- goto out; ++ return -EFSCORRUPTED; + } ++ + return 0; +-out: +- ext4_group_desc_free(sbi); +- return ret; + } + + static int ext4_load_and_init_journal(struct super_block *sb, +@@ -5234,14 +5229,14 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) + if (ext4_geometry_check(sb, es)) + goto failed_mount; + +- err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); +- if (err) +- goto failed_mount; +- + timer_setup(&sbi->s_err_report, print_daily_error_info, 0); + spin_lock_init(&sbi->s_error_lock); + INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); + ++ err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); ++ if (err) ++ goto failed_mount3; ++ + /* Register extent status tree shrinker */ + if (ext4_es_register_shrinker(sbi)) + goto failed_mount3; +@@ -5967,8 +5962,11 @@ static int ext4_load_journal(struct super_block *sb, + if (!really_read_only && journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); +- +- /* Make sure we flush the recovery flag to disk. 
*/ ++ ext4_commit_super(sb); ++ } ++ if (!really_read_only && journal_inum && ++ journal_inum != le32_to_cpu(es->s_journal_inum)) { ++ es->s_journal_inum = cpu_to_le32(journal_inum); + ext4_commit_super(sb); + } + +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 0c6b011a91b3..62f2ec599218 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -184,27 +184,73 @@ ext4_xattr_handler(int name_index) + } + + static int +-ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, +- void *value_start) ++check_xattrs(struct inode *inode, struct buffer_head *bh, ++ struct ext4_xattr_entry *entry, void *end, void *value_start, ++ const char *function, unsigned int line) + { + struct ext4_xattr_entry *e = entry; ++ int err = -EFSCORRUPTED; ++ char *err_str; ++ ++ if (bh) { ++ if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || ++ BHDR(bh)->h_blocks != cpu_to_le32(1)) { ++ err_str = "invalid header"; ++ goto errout; ++ } ++ if (buffer_verified(bh)) ++ return 0; ++ if (!ext4_xattr_block_csum_verify(inode, bh)) { ++ err = -EFSBADCRC; ++ err_str = "invalid checksum"; ++ goto errout; ++ } ++ } else { ++ struct ext4_xattr_ibody_header *header = value_start; ++ ++ header -= 1; ++ if (end - (void *)header < sizeof(*header) + sizeof(u32)) { ++ err_str = "in-inode xattr block too small"; ++ goto errout; ++ } ++ if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { ++ err_str = "bad magic number in in-inode xattr"; ++ goto errout; ++ } ++ } + + /* Find the end of the names list */ + while (!IS_LAST_ENTRY(e)) { + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); +- if ((void *)next >= end) +- return -EFSCORRUPTED; +- if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) +- return -EFSCORRUPTED; ++ if ((void *)next >= end) { ++ err_str = "e_name out of bounds"; ++ goto errout; ++ } ++ if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { ++ err_str = "bad e_name length"; ++ goto errout; ++ } + e = next; + } + + /* Check the values */ + while (!IS_LAST_ENTRY(entry)) { + u32 size = le32_to_cpu(entry->e_value_size); ++ unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); + +- if (size > EXT4_XATTR_SIZE_MAX) +- return -EFSCORRUPTED; ++ if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { ++ err_str = "ea_inode specified without ea_inode feature enabled"; ++ goto errout; ++ } ++ if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || ++ !ext4_valid_inum(inode->i_sb, ea_ino))) { ++ err_str = "invalid ea_ino"; ++ goto errout; ++ } ++ if (size > EXT4_XATTR_SIZE_MAX) { ++ err_str = "e_value size too large"; ++ goto errout; ++ } + + if (size != 0 && entry->e_value_inum == 0) { + u16 offs = le16_to_cpu(entry->e_value_offs); +@@ -216,66 +262,54 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, + * the padded and unpadded sizes, since the size may + * overflow to 0 when adding padding. 
+ */ +- if (offs > end - value_start) +- return -EFSCORRUPTED; ++ if (offs > end - value_start) { ++ err_str = "e_value out of bounds"; ++ goto errout; ++ } + value = value_start + offs; + if (value < (void *)e + sizeof(u32) || + size > end - value || +- EXT4_XATTR_SIZE(size) > end - value) +- return -EFSCORRUPTED; ++ EXT4_XATTR_SIZE(size) > end - value) { ++ err_str = "overlapping e_value "; ++ goto errout; ++ } + } + entry = EXT4_XATTR_NEXT(entry); + } +- ++ if (bh) ++ set_buffer_verified(bh); + return 0; ++ ++errout: ++ if (bh) ++ __ext4_error_inode(inode, function, line, 0, -err, ++ "corrupted xattr block %llu: %s", ++ (unsigned long long) bh->b_blocknr, ++ err_str); ++ else ++ __ext4_error_inode(inode, function, line, 0, -err, ++ "corrupted in-inode xattr: %s", err_str); ++ return err; + } + + static inline int + __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, + const char *function, unsigned int line) + { +- int error = -EFSCORRUPTED; +- +- if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || +- BHDR(bh)->h_blocks != cpu_to_le32(1)) +- goto errout; +- if (buffer_verified(bh)) +- return 0; +- +- error = -EFSBADCRC; +- if (!ext4_xattr_block_csum_verify(inode, bh)) +- goto errout; +- error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, +- bh->b_data); +-errout: +- if (error) +- __ext4_error_inode(inode, function, line, 0, -error, +- "corrupted xattr block %llu", +- (unsigned long long) bh->b_blocknr); +- else +- set_buffer_verified(bh); +- return error; ++ return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, ++ bh->b_data, function, line); + } + + #define ext4_xattr_check_block(inode, bh) \ + __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) + + +-static int ++static inline int + __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, + void *end, const char *function, unsigned int line) + { +- int error = -EFSCORRUPTED; +- +- if (end - (void *)header < sizeof(*header) + sizeof(u32) || +- (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) +- goto errout; +- error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); +-errout: +- if (error) +- __ext4_error_inode(inode, function, line, 0, -error, +- "corrupted in-inode xattr"); +- return error; ++ return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), ++ function, line); + } + + #define xattr_check_inode(inode, header, end) \ +@@ -388,6 +422,17 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + struct inode *inode; + int err; + ++ /* ++ * We have to check for this corruption early as otherwise ++ * iget_locked() could wait indefinitely for the state of our ++ * parent inode. 
++ */ ++ if (parent->i_ino == ea_ino) { ++ ext4_error(parent->i_sb, ++ "Parent and EA inode have the same ino %lu", ea_ino); ++ return -EFSCORRUPTED; ++ } ++ + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e7537fd305dd..e191ecfb1fde 100644 --- a/fs/gfs2/bmap.c @@ -23847,6 +25585,18 @@ index d5130d1fcfae..011b50469301 100644 if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); +diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c +index 88a88506ffff..92ca2017eded 100644 +--- a/fs/xfs/xfs_fsmap.c ++++ b/fs/xfs/xfs_fsmap.c +@@ -761,6 +761,7 @@ xfs_getfsmap_datadev_bnobt( + { + struct xfs_alloc_rec_incore akeys[2]; + ++ memset(akeys, 0, sizeof(akeys)); + info->missing_owner = XFS_FMR_OWN_UNKNOWN; + return __xfs_getfsmap_datadev(tp, keys, info, + xfs_getfsmap_datadev_bnobt_query, &akeys[0]); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 4d0a98f920ca..9edc1f2bc939 100644 --- a/fs/xfs/xfs_globals.c @@ -24891,13 +26641,31 @@ index 6548b5b5aa60..75d7d22c3a27 100644 ); TRACE_EVENT(btrfs_find_cluster, +diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h +index 77b426ae0064..ebccf6a6aa1b 100644 +--- a/include/trace/events/ext4.h ++++ b/include/trace/events/ext4.h +@@ -584,13 +584,6 @@ DECLARE_EVENT_CLASS(ext4__page_op, + (unsigned long) __entry->index) + ); + +-DEFINE_EVENT(ext4__page_op, ext4_writepage, +- +- TP_PROTO(struct page *page), +- +- TP_ARGS(page) +-); +- + DEFINE_EVENT(ext4__page_op, ext4_readpage, + + TP_PROTO(struct page *page), -- -2.39.2 +2.40.0.rc2 -From dd48d0cbb7162c029af11d861336a07195a7f331 Mon Sep 17 00:00:00 2001 +From 31bc464783789781c2a6885b36f63fcb3751a5bb Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 17 Feb 2023 15:35:46 +0100 -Subject: [PATCH 08/15] Implement amd-pstate-epp and amd-pstate-guided driver +Date: Fri, 10 Mar 2023 18:05:48 +0100 +Subject: [PATCH 08/16] Implement amd-pstate-epp and amd-pstate-guided driver Signed-off-by: Peter Jung --- @@ -24907,14 +26675,13 @@ Signed-off-by: Peter Jung drivers/cpufreq/amd-pstate.c | 794 +++++++++++++++++- drivers/cpufreq/brcmstb-avs-cpufreq.c | 5 +- drivers/cpufreq/cpufreq.c | 8 +- - drivers/cpufreq/davinci-cpufreq.c | 4 +- drivers/cpufreq/mediatek-cpufreq-hw.c | 4 +- drivers/cpufreq/omap-cpufreq.c | 4 +- drivers/cpufreq/qcom-cpufreq-hw.c | 4 +- include/acpi/cppc_acpi.h | 23 + include/linux/amd-pstate.h | 34 + include/linux/cpufreq.h | 2 +- - 13 files changed, 1139 insertions(+), 59 deletions(-) + 12 files changed, 1136 insertions(+), 58 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9595abf34974..f39b8f05392c 100644 @@ -25335,7 +27102,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 /* Update only if there are pending write commands */ if (pcc_ss_data->pending_pcc_write_cmd) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index c17bd845f5fc..d4e60da7a544 100644 +index c17bd845f5fc..f4f96baae500 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -59,8 +59,173 @@ @@ -26189,7 +27956,7 @@ index c17bd845f5fc..d4e60da7a544 100644 - if (!cppc_load) { - pr_debug("driver load is disabled, boot with amd_pstate=passive to enable this\n"); + if (cppc_state == AMD_PSTATE_DISABLE) { -+ pr_debug("driver load is disabled, boot with specific mode to enable this\n"); ++ pr_info("driver load is disabled, boot with 
specific mode to enable this\n"); return -ENODEV; } @@ -26322,21 +28089,6 @@ index 7e56a42750ea..85a0bea2dbf1 100644 } EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); -diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c -index 9e97f60f8199..2d23015e2abd 100644 ---- a/drivers/cpufreq/davinci-cpufreq.c -+++ b/drivers/cpufreq/davinci-cpufreq.c -@@ -138,7 +138,9 @@ static int __exit davinci_cpufreq_remove(struct platform_device *pdev) - if (cpufreq.asyncclk) - clk_put(cpufreq.asyncclk); - -- return cpufreq_unregister_driver(&davinci_driver); -+ cpufreq_unregister_driver(&davinci_driver); -+ -+ return 0; - } - - static struct platform_driver davinci_cpufreq_driver = { diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index f80339779084..f21a9e3df53d 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -26509,12 +28261,12 @@ index 6a94a6eaad27..65623233ab2f 100644 bool cpufreq_driver_test_flags(u16 flags); const char *cpufreq_get_current_driver(void); -- -2.39.2 +2.40.0.rc2 -From 952e829ec925dcded44f080eefbef8078de089c8 Mon Sep 17 00:00:00 2001 +From 501028b1bc1da95eeb61b26a0ee82ef93873d5d7 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sun, 22 Jan 2023 13:41:50 +0100 -Subject: [PATCH 09/15] ksm +Subject: [PATCH 09/16] ksm Signed-off-by: Peter Jung --- @@ -26774,10 +28526,10 @@ index 860b2dcf3ac4..810e1fcaff94 100644 COND_SYSCALL(mbind); COND_SYSCALL(get_mempolicy); diff --git a/mm/ksm.c b/mm/ksm.c -index c267b92b837b..4474b7ac0cd6 100644 +index ee60890cf9b1..bc920121bce9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c -@@ -2575,54 +2575,78 @@ static int ksm_scan_thread(void *nothing) +@@ -2582,54 +2582,78 @@ static int ksm_scan_thread(void *nothing) return 0; } @@ -27010,37 +28762,37 @@ index b6ea204d4e23..0064dcafb812 100644 +subsys_initcall(pmadv_sysfs_init); +#endif /* CONFIG_KSM */ -- -2.39.2 +2.40.0.rc2 -From 4146b9df71595a233386acaed0dc699b27eb7e8a Mon Sep 17 00:00:00 2001 +From abf71738a315ea5ad029cd3976ec7b2d9456c432 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 13 Feb 2023 09:25:47 +0100 -Subject: [PATCH 10/15] maple-lru +Date: Fri, 10 Mar 2023 18:06:12 +0100 +Subject: [PATCH 10/16] maple-lru Signed-off-by: Peter Jung --- - Documentation/mm/multigen_lru.rst | 86 ++- + Documentation/mm/multigen_lru.rst | 128 +++- include/linux/fs.h | 2 + include/linux/maple_tree.h | 6 - include/linux/memcontrol.h | 10 + include/linux/mm_inline.h | 19 +- - include/linux/mmzone.h | 122 +++- - lib/maple_tree.c | 113 ++- + include/linux/mmzone.h | 124 +++- + lib/maple_tree.c | 149 ++-- mm/fadvise.c | 5 +- mm/memcontrol.c | 12 + mm/memory.c | 7 +- mm/page_alloc.c | 1 + mm/rmap.c | 42 +- - mm/vmscan.c | 1059 ++++++++++++++++++----------- + mm/vmscan.c | 1083 ++++++++++++++++++----------- mm/workingset.c | 4 +- tools/testing/radix-tree/maple.c | 18 +- - 15 files changed, 1002 insertions(+), 504 deletions(-) + 15 files changed, 1066 insertions(+), 544 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst -index d7062c6a8946..5f1f6ecbb79b 100644 +index d7062c6a8946..52ed5092022f 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst -@@ -89,15 +89,15 @@ variables are monotonically increasing. +@@ -89,21 +89,22 @@ variables are monotonically increasing. Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` bits in order to fit into the gen counter in ``folio->flags``. 
Each @@ -27059,7 +28811,15 @@ index d7062c6a8946..5f1f6ecbb79b 100644 contrast to moving across generations, which requires the LRU lock, moving across tiers only involves atomic operations on ``folio->flags`` and therefore has a negligible cost. A feedback loop -@@ -127,7 +127,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. + modeled after the PID controller monitors refaults over all the tiers + from anon and file types and decides which tiers from which types to +-evict or protect. ++evict or protect. The desired effect is to balance refault percentages ++between anon and file types proportional to the swappiness level. + + There are two conceptually independent procedures: the aging and the + eviction. They form a closed-loop system, i.e., the page reclaim. +@@ -127,7 +128,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. Eviction -------- The eviction consumes old generations. Given an ``lruvec``, it @@ -27068,7 +28828,7 @@ index d7062c6a8946..5f1f6ecbb79b 100644 ``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to evict from, it first compares ``min_seq[]`` to select the older type. If both types are equally old, it selects the one whose first tier has -@@ -141,9 +141,85 @@ loop has detected outlying refaults from the tier this page is in. To +@@ -141,15 +142,124 @@ loop has detected outlying refaults from the tier this page is in. To this end, the feedback loop uses the first tier as the baseline, for the reason stated earlier. @@ -27087,6 +28847,27 @@ index d7062c6a8946..5f1f6ecbb79b 100644 + and memory sizes. +2. It is more reliable because it is directly wired to the OOM killer. + ++``mm_struct`` list ++------------------ ++An ``mm_struct`` list is maintained for each memcg, and an ++``mm_struct`` follows its owner task to the new memcg when this task ++is migrated. ++ ++A page table walker iterates ``lruvec_memcg()->mm_list`` and calls ++``walk_page_range()`` with each ``mm_struct`` on this list to scan ++PTEs. When multiple page table walkers iterate the same list, each of ++them gets a unique ``mm_struct``, and therefore they can run in ++parallel. ++ ++Page table walkers ignore any misplaced pages, e.g., if an ++``mm_struct`` was migrated, pages left in the previous memcg will be ++ignored when the current memcg is under reclaim. Similarly, page table ++walkers will ignore pages from nodes other than the one under reclaim. ++ ++This infrastructure also tracks the usage of ``mm_struct`` between ++context switches so that page table walkers can skip processes that ++have been sleeping since the last iteration. ++ +Rmap/PT walk feedback +--------------------- +Searching the rmap for PTEs mapping each page on an LRU list (to test @@ -27101,7 +28882,7 @@ index d7062c6a8946..5f1f6ecbb79b 100644 +adds the PMD entry pointing to the PTE table to the Bloom filter. This +forms a feedback loop between the eviction and the aging. + -+Bloom Filters ++Bloom filters +------------- +Bloom filters are a space and memory efficient data structure for set +membership test, i.e., test if an element is not in the set or may be @@ -27117,6 +28898,18 @@ index d7062c6a8946..5f1f6ecbb79b 100644 +which may yield hot pages anyway. Parameters of the filter itself can +control the false positive rate in the limit. 
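(Illustrative aside, not part of the patch: the sketch below only demonstrates the set-membership behaviour the paragraph above relies on — a Bloom filter can answer "definitely not added" or "possibly added", and its false-positive rate is governed by the bitmap size and the number of hash functions. Every name and constant here is hypothetical; the kernel's actual MGLRU filters live in mm/vmscan.c and are sized, hashed and synchronised differently.)

/* Illustrative sketch only -- not the kernel's MGLRU code. */
#include <stdbool.h>
#include <stdint.h>

#define BLOOM_BITS (1u << 15)               /* bitmap size, a power of two */

static uint8_t bloom_map[BLOOM_BITS / 8];   /* 4 KiB bitmap */

/* cheap multiplicative hash of a key (e.g. a PMD entry's address) */
static uint32_t bloom_hash(uint64_t key, uint64_t seed)
{
	return (uint32_t)(((key ^ seed) * 0x9E3779B97F4A7C15ull) >> 40) & (BLOOM_BITS - 1);
}

static void bloom_set(uint64_t key)
{
	uint32_t h1 = bloom_hash(key, 0x1111), h2 = bloom_hash(key, 0x2222);

	bloom_map[h1 / 8] |= 1u << (h1 % 8);
	bloom_map[h2 / 8] |= 1u << (h2 % 8);
}

/* false: definitely never added; true: possibly added (may be a false positive) */
static bool bloom_test(uint64_t key)
{
	uint32_t h1 = bloom_hash(key, 0x1111), h2 = bloom_hash(key, 0x2222);

	return (bloom_map[h1 / 8] & (1u << (h1 % 8))) &&
	       (bloom_map[h2 / 8] & (1u << (h2 % 8)));
}

Recording a PMD entry with bloom_set() and later checking it with bloom_test() mirrors how a positive lookup lets a page table walk prioritise a densely populated PTE table, at the cost of an occasional false positive that merely triggers a wasted scan.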
+ ++PID controller ++-------------- ++A feedback loop modeled after the Proportional-Integral-Derivative ++(PID) controller monitors refaults over anon and file types and ++decides which type to evict when both types are available from the ++same generation. ++ ++The PID controller uses generations rather than the wall clock as the ++time domain because a CPU can scan pages at different rates under ++varying memory pressure. It calculates a moving average for each new ++generation to avoid being permanently locked in a suboptimal state. ++ +Memcg LRU +--------- +An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, @@ -27155,6 +28948,15 @@ index d7062c6a8946..5f1f6ecbb79b 100644 * Generations * Rmap walks +-* Page table walks +-* Bloom filters +-* PID controller ++* Page table walks via ``mm_struct`` list ++* Bloom filters for rmap/PT walk feedback ++* PID controller for refault feedback + + The aging and the eviction form a producer-consumer model; + specifically, the latter drives the former by the sliding window over diff --git a/include/linux/fs.h b/include/linux/fs.h index c1769a2c5d70..d353c262d669 100644 --- a/include/linux/fs.h @@ -27285,7 +29087,7 @@ index ff3f3f23f649..de1e622dd366 100644 + #endif diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index cd28a100d9e4..977be526c939 100644 +index cd28a100d9e4..70bd7f55bdd2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -7,6 +7,7 @@ @@ -27490,20 +29292,32 @@ index cd28a100d9e4..977be526c939 100644 /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif -@@ -1243,6 +1345,8 @@ typedef struct pglist_data { +@@ -1242,7 +1344,9 @@ typedef struct pglist_data { + #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ - struct lru_gen_mm_walk mm_walk; +- struct lru_gen_mm_walk mm_walk; ++ struct lru_gen_mm_walk mm_walk; + /* lru_gen_folio list */ + struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); diff --git a/lib/maple_tree.c b/lib/maple_tree.c -index 5a976393c9ae..b95652b79b55 100644 +index 5a976393c9ae..a73f83d0eb0e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c -@@ -149,13 +149,12 @@ struct maple_subtree_state { +@@ -146,16 +146,22 @@ struct maple_subtree_state { + struct maple_big_node *bn; + }; + ++#ifdef CONFIG_KASAN_STACK ++/* Prevent mas_wr_bnode() from exceeding the stack frame limit */ ++#define noinline_for_kasan noinline_for_stack ++#else ++#define noinline_for_kasan inline ++#endif ++ /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { @@ -27519,7 +29333,7 @@ index 5a976393c9ae..b95652b79b55 100644 } static inline void mt_free_bulk(size_t size, void __rcu **nodes) -@@ -183,7 +182,6 @@ static void ma_free_rcu(struct maple_node *node) +@@ -183,7 +189,6 @@ static void ma_free_rcu(struct maple_node *node) call_rcu(&node->rcu, mt_free_rcu); } @@ -27527,7 +29341,7 @@ index 5a976393c9ae..b95652b79b55 100644 static void mas_set_height(struct ma_state *mas) { unsigned int new_flags = mas->tree->ma_flags; -@@ -468,7 +466,7 @@ static inline +@@ -468,7 +473,7 @@ static inline void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { @@ -27536,7 +29350,7 @@ index 5a976393c9ae..b95652b79b55 100644 unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); -@@ -502,10 +500,9 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, +@@ -502,10 +507,9 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode 
*parent, */ static inline unsigned int mte_parent_slot(const struct maple_enode *enode) { @@ -27549,7 +29363,7 @@ index 5a976393c9ae..b95652b79b55 100644 return 0; /* -@@ -1128,9 +1125,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) +@@ -1128,9 +1132,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); @@ -27561,7 +29375,7 @@ index 5a976393c9ae..b95652b79b55 100644 return NULL; if (total == 1) { -@@ -1140,27 +1138,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) +@@ -1140,27 +1145,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) goto single_node; } @@ -27597,7 +29411,7 @@ index 5a976393c9ae..b95652b79b55 100644 return (struct maple_node *)ret; } -@@ -1179,21 +1175,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) +@@ -1179,21 +1182,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) unsigned long count; unsigned int requested = mas_alloc_req(mas); @@ -27624,7 +29438,7 @@ index 5a976393c9ae..b95652b79b55 100644 reuse->total += head->total; } -@@ -1212,7 +1207,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) +@@ -1212,7 +1214,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); @@ -27632,7 +29446,7 @@ index 5a976393c9ae..b95652b79b55 100644 unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; -@@ -1228,24 +1222,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) +@@ -1228,24 +1229,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) WARN_ON(!allocated); } @@ -27667,7 +29481,7 @@ index 5a976393c9ae..b95652b79b55 100644 slots = (void **)&node->slot[offset]; max_req -= offset; -@@ -1259,15 +1258,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) +@@ -1259,15 +1265,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) goto nomem_bulk; node->node_count += count; @@ -27687,7 +29501,7 @@ index 5a976393c9ae..b95652b79b55 100644 return; nomem_bulk: -@@ -1276,10 +1273,8 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) +@@ -1276,10 +1280,8 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) nomem_one: mas_set_alloc_req(mas, requested); if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) @@ -27699,7 +29513,7 @@ index 5a976393c9ae..b95652b79b55 100644 } /* -@@ -1887,10 +1882,9 @@ static inline int mab_calc_split(struct ma_state *mas, +@@ -1887,10 +1889,9 @@ static inline int mab_calc_split(struct ma_state *mas, /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); @@ -27712,7 +29526,16 @@ index 5a976393c9ae..b95652b79b55 100644 return split; } -@@ -2947,7 +2941,7 @@ static inline void *mtree_range_walk(struct ma_state *mas) +@@ -2113,7 +2114,7 @@ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, + * + * Return: The actual end of the data stored in @b_node + */ +-static inline void mas_store_b_node(struct ma_wr_state *wr_mas, ++static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, + struct maple_big_node *b_node, unsigned char offset_end) + { + unsigned char slot; +@@ -2947,7 +2948,7 @@ static inline void *mtree_range_walk(struct ma_state *mas) mas->min = prev_min; mas->max = prev_max; 
mas->node = last; @@ -27721,7 +29544,7 @@ index 5a976393c9ae..b95652b79b55 100644 dead_node: mas_reset(mas); -@@ -3467,7 +3461,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, +@@ -3467,7 +3468,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, */ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) { @@ -27729,7 +29552,16 @@ index 5a976393c9ae..b95652b79b55 100644 struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; -@@ -3893,7 +3886,7 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) +@@ -3586,7 +3586,7 @@ static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, + * @b_node: The maple big node + * @end: The end of the data. + */ +-static inline int mas_commit_b_node(struct ma_wr_state *wr_mas, ++static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, + struct maple_big_node *b_node, unsigned char end) + { + struct maple_node *node; +@@ -3893,7 +3893,7 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) goto dead_node; } while (!ma_is_leaf(type)); @@ -27738,7 +29570,7 @@ index 5a976393c9ae..b95652b79b55 100644 dead_node: mas_reset(mas); -@@ -4711,15 +4704,11 @@ static inline void *mas_next_nentry(struct ma_state *mas, +@@ -4711,15 +4711,11 @@ static inline void *mas_next_nentry(struct ma_state *mas, static inline void mas_rewalk(struct ma_state *mas, unsigned long index) { @@ -27754,7 +29586,48 @@ index 5a976393c9ae..b95652b79b55 100644 } /* -@@ -5590,8 +5579,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, +@@ -5093,35 +5089,21 @@ static inline bool mas_rewind_node(struct ma_state *mas) + */ + static inline bool mas_skip_node(struct ma_state *mas) + { +- unsigned char slot, slot_count; +- unsigned long *pivots; +- enum maple_type mt; ++ if (mas_is_err(mas)) ++ return false; + +- mt = mte_node_type(mas->node); +- slot_count = mt_slots[mt] - 1; + do { + if (mte_is_root(mas->node)) { +- slot = mas->offset; +- if (slot > slot_count) { ++ if (mas->offset >= mas_data_end(mas)) { + mas_set_err(mas, -EBUSY); + return false; + } + } else { + mas_ascend(mas); +- slot = mas->offset; +- mt = mte_node_type(mas->node); +- slot_count = mt_slots[mt] - 1; + } +- } while (slot > slot_count); +- +- mas->offset = ++slot; +- pivots = ma_pivots(mas_mn(mas), mt); +- if (slot > 0) +- mas->min = pivots[slot - 1] + 1; +- +- if (slot <= slot_count) +- mas->max = pivots[slot]; ++ } while (mas->offset >= mas_data_end(mas)); + ++ mas->offset++; + return true; + } + +@@ -5590,8 +5572,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, /* * mte_destroy_walk() - Free a tree or sub-tree. @@ -27765,7 +29638,7 @@ index 5a976393c9ae..b95652b79b55 100644 * * Must hold the write lock. */ -@@ -5620,7 +5609,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) +@@ -5620,7 +5602,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) mas_reset(wr_mas->mas); } } @@ -27773,7 +29646,15 @@ index 5a976393c9ae..b95652b79b55 100644 } /* Interface */ -@@ -5745,6 +5733,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) +@@ -5733,6 +5714,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) + mas_reset(mas); + return ret; + } ++EXPORT_SYMBOL_GPL(mas_preallocate); + + /* + * mas_destroy() - destroy a maple state. 
+@@ -5745,6 +5727,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; @@ -27781,7 +29662,7 @@ index 5a976393c9ae..b95652b79b55 100644 /* * When using mas_for_each() to insert an expected number of elements, -@@ -5767,14 +5756,20 @@ void mas_destroy(struct ma_state *mas) +@@ -5767,14 +5750,20 @@ void mas_destroy(struct ma_state *mas) } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); @@ -27806,7 +29687,7 @@ index 5a976393c9ae..b95652b79b55 100644 mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); -@@ -6734,7 +6729,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, +@@ -6734,7 +6723,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; @@ -27815,7 +29696,7 @@ index 5a976393c9ae..b95652b79b55 100644 break; if (last == 0 && i > 0) break; -@@ -6841,7 +6836,7 @@ void mt_dump(const struct maple_tree *mt) +@@ -6841,7 +6830,7 @@ void mt_dump(const struct maple_tree *mt) if (!xa_is_node(entry)) mt_dump_entry(entry, 0, 0, 0); else if (entry) @@ -27848,7 +29729,7 @@ index bf04fec87f35..fb7c5f43fd2a 100644 case POSIX_FADV_DONTNEED: __filemap_fdatawrite_range(mapping, offset, endbyte, diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 73afff8062f9..7fe2f4f36cf4 100644 +index 2eee092f8f11..802d3868d097 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -477,6 +477,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) @@ -27874,7 +29755,7 @@ index 73afff8062f9..7fe2f4f36cf4 100644 if (order > 0) return 0; -@@ -5382,6 +5391,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) +@@ -5386,6 +5395,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (unlikely(mem_cgroup_is_root(memcg))) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); @@ -27882,7 +29763,7 @@ index 73afff8062f9..7fe2f4f36cf4 100644 return 0; offline_kmem: memcg_offline_kmem(memcg); -@@ -5413,6 +5423,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) +@@ -5417,6 +5427,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); @@ -27890,7 +29771,7 @@ index 73afff8062f9..7fe2f4f36cf4 100644 drain_all_stock(memcg); -@@ -5424,6 +5435,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) +@@ -5428,6 +5439,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); invalidate_reclaim_iterators(memcg); @@ -27924,10 +29805,10 @@ index f526b9152bef..4ad62eba3cb7 100644 static void lru_gen_exit_fault(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 3bb3484563ed..d8c7750c5c92 100644 +index 3aec9a6a9cb7..6658cbf43f5d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c -@@ -7929,6 +7929,7 @@ static void __init free_area_init_node(int nid) +@@ -7943,6 +7943,7 @@ static void __init free_area_init_node(int nid) pgdat_set_deferred_range(pgdat); free_area_init_core(pgdat); @@ -27936,7 +29817,7 @@ index 3bb3484563ed..d8c7750c5c92 100644 static void __init free_area_init_memoryless_node(int nid) diff --git a/mm/rmap.c b/mm/rmap.c -index b616870a09be..7b9205cb7d87 100644 +index 3b45d049069e..c8701608bb0d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -823,25 +823,14 @@ static bool folio_referenced_one(struct folio *folio, @@ -28015,7 +29896,7 @@ index b616870a09be..7b9205cb7d87 
100644 *vm_flags = pra.vm_flags; diff --git a/mm/vmscan.c b/mm/vmscan.c -index 160acbbdf111..04a54656b6b7 100644 +index 160acbbdf111..1a8f3b1c0bad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,6 +55,8 @@ @@ -28285,6 +30166,15 @@ index 160acbbdf111..04a54656b6b7 100644 static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) { int i; +@@ -3592,7 +3608,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) + } + + /****************************************************************************** +- * refault feedback loop ++ * PID controller + ******************************************************************************/ + + /* @@ -3623,7 +3639,7 @@ struct ctrl_pos { static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) @@ -29558,7 +31448,69 @@ index 160acbbdf111..04a54656b6b7 100644 while (!list_empty(head)) { bool success; -@@ -5545,7 +5814,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, +@@ -5402,14 +5671,14 @@ static void lru_gen_change_state(bool enabled) + * sysfs interface + ******************************************************************************/ + +-static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) + { +- return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); ++ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); + } + + /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +-static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, +- const char *buf, size_t len) ++static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) + { + unsigned int msecs; + +@@ -5421,11 +5690,9 @@ static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, + return len; + } + +-static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( +- min_ttl_ms, 0644, show_min_ttl, store_min_ttl +-); ++static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms); + +-static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) + { + unsigned int caps = 0; + +@@ -5442,7 +5709,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c + } + + /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +-static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, ++static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) + { + int i; +@@ -5469,9 +5736,7 @@ static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, + return len; + } + +-static struct kobj_attribute lru_gen_enabled_attr = __ATTR( +- enabled, 0644, show_enabled, store_enabled +-); ++static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled); + + static struct attribute *lru_gen_attrs[] = { + &lru_gen_min_ttl_attr.attr, +@@ -5479,7 +5744,7 @@ static struct attribute *lru_gen_attrs[] = { + NULL + }; + +-static struct attribute_group lru_gen_attr_group = { ++static const struct attribute_group lru_gen_attr_group = { + .name = "lru_gen", + .attrs = lru_gen_attrs, + }; +@@ -5545,7 +5810,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct 
lruvec *lruvec, int i; int type, tier; int hist = lru_hist_from_seq(seq); @@ -29567,7 +31519,7 @@ index 160acbbdf111..04a54656b6b7 100644 for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); -@@ -5595,7 +5864,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) +@@ -5595,7 +5860,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) unsigned long seq; bool full = !debugfs_real_fops(m->file)->write; struct lruvec *lruvec = v; @@ -29576,7 +31528,7 @@ index 160acbbdf111..04a54656b6b7 100644 int nid = lruvec_pgdat(lruvec)->node_id; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); -@@ -5692,7 +5961,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co +@@ -5692,7 +5957,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co if (sc->nr_reclaimed >= nr_to_reclaim) return 0; @@ -29585,7 +31537,7 @@ index 160acbbdf111..04a54656b6b7 100644 return 0; cond_resched(); -@@ -5713,11 +5982,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, +@@ -5713,11 +5978,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (!mem_cgroup_disabled()) { rcu_read_lock(); @@ -29600,7 +31552,7 @@ index 160acbbdf111..04a54656b6b7 100644 rcu_read_unlock(); if (!memcg) -@@ -5777,7 +6046,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, +@@ -5777,7 +6042,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, set_task_reclaim_state(current, &sc.reclaim_state); flags = memalloc_noreclaim_save(); blk_start_plug(&plug); @@ -29609,7 +31561,7 @@ index 160acbbdf111..04a54656b6b7 100644 err = -ENOMEM; goto done; } -@@ -5849,7 +6118,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) +@@ -5849,7 +6114,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) { int i; int gen, type, zone; @@ -29618,7 +31570,7 @@ index 160acbbdf111..04a54656b6b7 100644 lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); -@@ -5858,13 +6127,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) +@@ -5858,13 +6123,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) lrugen->timestamps[i] = jiffies; for_each_gen_type_zone(gen, type, zone) @@ -29646,7 +31598,7 @@ index 160acbbdf111..04a54656b6b7 100644 void lru_gen_init_memcg(struct mem_cgroup *memcg) { INIT_LIST_HEAD(&memcg->mm_list.fifo); -@@ -5876,19 +6158,25 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) +@@ -5876,19 +6154,25 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) int i; int nid; @@ -29673,7 +31625,7 @@ index 160acbbdf111..04a54656b6b7 100644 static int __init init_lru_gen(void) { -@@ -5915,6 +6203,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc +@@ -5915,6 +6199,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc { } @@ -29684,7 +31636,7 @@ index 160acbbdf111..04a54656b6b7 100644 #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -@@ -5928,7 +6220,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -5928,7 +6216,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) bool proportional_reclaim; struct blk_plug plug; @@ -29693,7 +31645,7 @@ index 160acbbdf111..04a54656b6b7 100644 lru_gen_shrink_lruvec(lruvec, sc); return; } -@@ -6171,6 +6463,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) +@@ -6171,6 +6459,11 @@ static void 
shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; @@ -29786,12 +31738,12 @@ index 81fa7ec2e66a..1f36bc1c5d36 100644 mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); -- -2.39.2 +2.40.0.rc2 -From bdbf1daa5eee87e0879e18f3a427259ff1840c98 Mon Sep 17 00:00:00 2001 +From d9e434e1093f450c71f9a327b2201f7bdcc75743 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Fri, 17 Feb 2023 13:41:20 +0100 -Subject: [PATCH 11/15] mm/kvm: lockless accessed bit harvest +Subject: [PATCH 11/16] mm/kvm: lockless accessed bit harvest TLDR ==== @@ -30687,7 +32639,7 @@ index d6c06e140277..521f71ad0467 100644 unsigned long address) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 977be526c939..beece92ce62e 100644 +index 70bd7f55bdd2..0ddbf712708d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -379,6 +379,7 @@ enum { @@ -30756,7 +32708,7 @@ index f45ff1b7626a..324799848fed 100644 unsigned long address) { diff --git a/mm/rmap.c b/mm/rmap.c -index 7b9205cb7d87..82e3a0be1ada 100644 +index c8701608bb0d..8ecbbadab752 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -822,12 +822,10 @@ static bool folio_referenced_one(struct folio *folio, @@ -30776,7 +32728,7 @@ index 7b9205cb7d87..82e3a0be1ada 100644 pvmw.pte)) referenced++; diff --git a/mm/vmscan.c b/mm/vmscan.c -index 04a54656b6b7..2fc436638dfe 100644 +index 1a8f3b1c0bad..ec0142165ce7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -57,6 +57,8 @@ @@ -31011,7 +32963,7 @@ index 04a54656b6b7..2fc436638dfe 100644 } /****************************************************************************** -@@ -5707,6 +5805,9 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c +@@ -5705,6 +5803,9 @@ static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, c if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) caps |= BIT(LRU_GEN_NONLEAF_YOUNG); @@ -31022,7 +32974,7 @@ index 04a54656b6b7..2fc436638dfe 100644 } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c -index 9c60384b5ae0..1b465df4a93d 100644 +index 07aae60288f9..a115a27b375e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -875,6 +875,63 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, @@ -31098,12 +33050,12 @@ index 9c60384b5ae0..1b465df4a93d 100644 .release = kvm_mmu_notifier_release, }; -- -2.39.2 +2.40.0.rc2 -From df63c6ca5ad19cda15524ce1f5fce0eed3dc9932 Mon Sep 17 00:00:00 2001 +From c63e61e48ac0d492af1918ba84350e07a5c95d17 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 13 Feb 2023 09:26:09 +0100 -Subject: [PATCH 12/15] objtool +Subject: [PATCH 12/16] objtool Signed-off-by: Peter Jung --- @@ -31112,13 +33064,13 @@ Signed-off-by: Peter Jung tools/objtool/Documentation/objtool.txt | 8 +++ tools/objtool/Makefile | 66 +++++++++++++++++-------- tools/objtool/builtin-check.c | 2 +- - tools/objtool/check.c | 9 ++++ + tools/objtool/check.c | 7 +++ tools/objtool/elf.c | 42 ++++++++-------- tools/objtool/include/objtool/builtin.h | 2 - tools/objtool/include/objtool/elf.h | 9 ++-- tools/objtool/include/objtool/special.h | 2 +- tools/objtool/special.c | 6 +-- - 11 files changed, 95 insertions(+), 54 deletions(-) + 11 files changed, 93 insertions(+), 54 deletions(-) diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore index 14236db3677f..4faa4dd72f35 100644 @@ -31283,26 +33235,10 @@ index a4f39407bf59..7c175198d09f 100644 OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain 
bugs/limitations", parse_hacks), OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), diff --git a/tools/objtool/check.c b/tools/objtool/check.c -index 4b7c8b33069e..0678ba04fe22 100644 +index ea1e7cdeb1b3..384b7df3fbb2 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c -@@ -688,6 +688,7 @@ static int create_static_call_sections(struct objtool_file *file) - if (strncmp(key_name, STATIC_CALL_TRAMP_PREFIX_STR, - STATIC_CALL_TRAMP_PREFIX_LEN)) { - WARN("static_call: trampoline name malformed: %s", key_name); -+ free(key_name); - return -1; - } - tmp = key_name + STATIC_CALL_TRAMP_PREFIX_LEN - STATIC_CALL_KEY_PREFIX_LEN; -@@ -697,6 +698,7 @@ static int create_static_call_sections(struct objtool_file *file) - if (!key_sym) { - if (!opts.module) { - WARN("static_call: can't find static_call_key symbol: %s", tmp); -+ free(key_name); - return -1; - } - -@@ -854,8 +856,15 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) +@@ -856,8 +856,15 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) list_for_each_entry(insn, &file->endbr_list, call_node) { int *site = (int *)sec->data->d_buf + idx; @@ -31504,12 +33440,12 @@ index 9c8d827f69af..baa85c31526b 100644 unsigned int nr_entries; struct special_alt *alt; -- -2.39.2 +2.40.0.rc2 -From ebd62c969d7faaafed390dca325a64c1b7cbd982 Mon Sep 17 00:00:00 2001 +From 56bbff019101b84507c1e796512b1be6840c6eda Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 25 Feb 2023 14:41:53 +0100 -Subject: [PATCH 13/15] sched +Date: Fri, 3 Mar 2023 17:02:07 +0100 +Subject: [PATCH 13/16] sched Signed-off-by: Peter Jung --- @@ -33156,12 +35092,12 @@ index 771f8ddb7053..9e8bb6278604 100644 * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency -- -2.39.2 +2.40.0.rc2 -From 8b6571aa2c4ecca1ed8686c872fb37696788a043 Mon Sep 17 00:00:00 2001 +From e0cfd01287f19367a61351b05d43cf4471156ffd Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 6 Feb 2023 09:53:13 +0100 -Subject: [PATCH 14/15] zram +Subject: [PATCH 14/16] zram Signed-off-by: Peter Jung --- @@ -33649,12 +35585,12 @@ index c5254626f051..2afdbf76a1aa 100644 }; #endif -- -2.39.2 +2.40.0.rc2 -From 33909ccdec1819a9090548ad25426b2ba315de15 Mon Sep 17 00:00:00 2001 +From 02b507dfef3f09d3de2785ed80164e15c8ed7844 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 14 Feb 2023 22:02:09 +0100 -Subject: [PATCH 15/15] zstd import v1.5.4 +Subject: [PATCH 15/16] zstd import v1.5.4 Signed-off-by: Peter Jung --- @@ -45767,4 +47703,3518 @@ index f4ed952ed485..7d31518e9d5a 100644 EXPORT_SYMBOL(zstd_reset_dstream); -- -2.39.2 +2.40.0.rc2 + +From 16b77e5461b5cc96bf4476bde0fee2ecc25aca83 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 10 Mar 2023 19:28:54 +0100 +Subject: [PATCH 16/16] v4l2-core: add v4l2loopback + +Signed-off-by: Peter Jung +--- + drivers/media/v4l2-core/Kconfig | 5 + + drivers/media/v4l2-core/Makefile | 2 + + drivers/media/v4l2-core/v4l2loopback.c | 2906 +++++++++++++++++ + drivers/media/v4l2-core/v4l2loopback.h | 96 + + .../media/v4l2-core/v4l2loopback_formats.h | 445 +++ + 5 files changed, 3454 insertions(+) + create mode 100644 drivers/media/v4l2-core/v4l2loopback.c + create mode 100644 drivers/media/v4l2-core/v4l2loopback.h + create mode 100644 drivers/media/v4l2-core/v4l2loopback_formats.h + +diff --git a/drivers/media/v4l2-core/Kconfig b/drivers/media/v4l2-core/Kconfig +index 348559bc2468..32a46fcc751f 100644 +--- a/drivers/media/v4l2-core/Kconfig ++++ b/drivers/media/v4l2-core/Kconfig +@@ -40,6 
+40,11 @@ config VIDEO_TUNER + config V4L2_JPEG_HELPER + tristate + ++config V4L2_LOOPBACK ++ tristate "V4L2 loopback device" ++ help ++ V4L2 loopback device ++ + # Used by drivers that need v4l2-h264.ko + config V4L2_H264 + tristate +diff --git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile +index 41d91bd10cf2..4de37a844f95 100644 +--- a/drivers/media/v4l2-core/Makefile ++++ b/drivers/media/v4l2-core/Makefile +@@ -32,6 +32,8 @@ obj-$(CONFIG_V4L2_JPEG_HELPER) += v4l2-jpeg.o + obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o + obj-$(CONFIG_V4L2_VP9) += v4l2-vp9.o + ++obj-$(CONFIG_V4L2_LOOPBACK) += v4l2loopback.o ++ + obj-$(CONFIG_VIDEOBUF_DMA_CONTIG) += videobuf-dma-contig.o + obj-$(CONFIG_VIDEOBUF_DMA_SG) += videobuf-dma-sg.o + obj-$(CONFIG_VIDEOBUF_GEN) += videobuf-core.o +diff --git a/drivers/media/v4l2-core/v4l2loopback.c b/drivers/media/v4l2-core/v4l2loopback.c +new file mode 100644 +index 000000000000..2ab1f760cfb5 +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback.c +@@ -0,0 +1,2906 @@ ++/* -*- c-file-style: "linux" -*- */ ++/* ++ * v4l2loopback.c -- video4linux2 loopback driver ++ * ++ * Copyright (C) 2005-2009 Vasily Levin (vasaka@gmail.com) ++ * Copyright (C) 2010-2019 IOhannes m zmoelnig (zmoelnig@iem.at) ++ * Copyright (C) 2011 Stefan Diewald (stefan.diewald@mytum.de) ++ * Copyright (C) 2012 Anton Novikov (random.plant@gmail.com) ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "v4l2loopback.h" ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 6, 1) ++#define kstrtoul strict_strtoul ++#endif ++ ++#if defined(timer_setup) && defined(from_timer) ++#define HAVE_TIMER_SETUP ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) ++#define VFL_TYPE_VIDEO VFL_TYPE_GRABBER ++#endif ++ ++#define V4L2LOOPBACK_VERSION_CODE \ ++ KERNEL_VERSION(V4L2LOOPBACK_VERSION_MAJOR, V4L2LOOPBACK_VERSION_MINOR, \ ++ V4L2LOOPBACK_VERSION_BUGFIX) ++ ++MODULE_DESCRIPTION("V4L2 loopback video device"); ++MODULE_AUTHOR("Vasily Levin, " ++ "IOhannes m zmoelnig ," ++ "Stefan Diewald," ++ "Anton Novikov" ++ "et al."); ++#ifdef SNAPSHOT_VERSION ++MODULE_VERSION(__stringify(SNAPSHOT_VERSION)); ++#else ++MODULE_VERSION("" __stringify(V4L2LOOPBACK_VERSION_MAJOR) "." __stringify( ++ V4L2LOOPBACK_VERSION_MINOR) "." __stringify(V4L2LOOPBACK_VERSION_BUGFIX)); ++#endif ++MODULE_LICENSE("GPL"); ++ ++/* ++ * helpers ++ */ ++#define dprintk(fmt, args...) \ ++ do { \ ++ if (debug > 0) { \ ++ printk(KERN_INFO "v4l2-loopback[" __stringify( \ ++ __LINE__) "], pid(%d): " fmt, \ ++ task_pid_nr(current), ##args); \ ++ } \ ++ } while (0) ++ ++#define MARK() \ ++ do { \ ++ if (debug > 1) { \ ++ printk(KERN_INFO "%s:%d[%s], pid(%d)\n", __FILE__, \ ++ __LINE__, __func__, task_pid_nr(current)); \ ++ } \ ++ } while (0) ++ ++#define dprintkrw(fmt, args...) \ ++ do { \ ++ if (debug > 2) { \ ++ printk(KERN_INFO "v4l2-loopback[" __stringify( \ ++ __LINE__) "], pid(%d): " fmt, \ ++ task_pid_nr(current), ##args); \ ++ } \ ++ } while (0) ++ ++/* TODO: Make sure that function is never interrupted. 
*/ ++static inline int mod_inc(int *number, int mod) ++{ ++ int result; ++ result = (*number + 1) % mod; ++ if (unlikely(result < 0)) ++ result += mod; ++ *number = result; ++ return result; ++} ++ ++static inline void v4l2l_get_timestamp(struct v4l2_buffer *b) ++{ ++ /* ktime_get_ts is considered deprecated, so use ktime_get_ts64 if possible */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0) ++ struct timespec ts; ++ ktime_get_ts(&ts); ++#else ++ struct timespec64 ts; ++ ktime_get_ts64(&ts); ++#endif ++ ++ b->timestamp.tv_sec = ts.tv_sec; ++ b->timestamp.tv_usec = (ts.tv_nsec / NSEC_PER_USEC); ++} ++ ++#if !defined(__poll_t) ++typedef unsigned __poll_t; ++#endif ++ ++/* module constants ++ * can be overridden during he build process using something like ++ * make KCPPFLAGS="-DMAX_DEVICES=100" ++ */ ++ ++/* maximum number of v4l2loopback devices that can be created */ ++#ifndef MAX_DEVICES ++#define MAX_DEVICES 8 ++#endif ++ ++/* whether the default is to announce capabilities exclusively or not */ ++#ifndef V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS ++#define V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS 0 ++#endif ++ ++/* when a producer is considered to have gone stale */ ++#ifndef MAX_TIMEOUT ++#define MAX_TIMEOUT (100 * 1000) /* in msecs */ ++#endif ++ ++/* max buffers that can be mapped, actually they ++ * are all mapped to max_buffers buffers */ ++#ifndef MAX_BUFFERS ++#define MAX_BUFFERS 32 ++#endif ++ ++/* module parameters */ ++static int debug = 0; ++module_param(debug, int, S_IRUGO | S_IWUSR); ++MODULE_PARM_DESC(debug, "debugging level (higher values == more verbose)"); ++ ++#define V4L2LOOPBACK_DEFAULT_MAX_BUFFERS 2 ++static int max_buffers = V4L2LOOPBACK_DEFAULT_MAX_BUFFERS; ++module_param(max_buffers, int, S_IRUGO); ++MODULE_PARM_DESC(max_buffers, ++ "how many buffers should be allocated [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_MAX_BUFFERS) "]"); ++ ++/* how many times a device can be opened ++ * the per-module default value can be overridden on a per-device basis using ++ * the /sys/devices interface ++ * ++ * note that max_openers should be at least 2 in order to get a working system: ++ * one opener for the producer and one opener for the consumer ++ * however, we leave that to the user ++ */ ++#define V4L2LOOPBACK_DEFAULT_MAX_OPENERS 10 ++static int max_openers = V4L2LOOPBACK_DEFAULT_MAX_OPENERS; ++module_param(max_openers, int, S_IRUGO | S_IWUSR); ++MODULE_PARM_DESC( ++ max_openers, ++ "how many users can open the loopback device [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_MAX_OPENERS) "]"); ++ ++static int devices = -1; ++module_param(devices, int, 0); ++MODULE_PARM_DESC(devices, "how many devices should be created"); ++ ++static int video_nr[MAX_DEVICES] = { [0 ...(MAX_DEVICES - 1)] = -1 }; ++module_param_array(video_nr, int, NULL, 0444); ++MODULE_PARM_DESC(video_nr, ++ "video device numbers (-1=auto, 0=/dev/video0, etc.)"); ++ ++static char *card_label[MAX_DEVICES]; ++module_param_array(card_label, charp, NULL, 0000); ++MODULE_PARM_DESC(card_label, "card labels for each device"); ++ ++static bool exclusive_caps[MAX_DEVICES] = { ++ [0 ...(MAX_DEVICES - 1)] = V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS ++}; ++module_param_array(exclusive_caps, bool, NULL, 0444); ++/* FIXXME: wording */ ++MODULE_PARM_DESC( ++ exclusive_caps, ++ "whether to announce OUTPUT/CAPTURE capabilities exclusively or not [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS) "]"); ++ ++/* format specifications */ ++#define V4L2LOOPBACK_SIZE_MIN_WIDTH 48 ++#define V4L2LOOPBACK_SIZE_MIN_HEIGHT 32 
++#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH 8192 ++#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT 8192 ++ ++#define V4L2LOOPBACK_SIZE_DEFAULT_WIDTH 640 ++#define V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT 480 ++ ++static int max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; ++module_param(max_width, int, S_IRUGO); ++MODULE_PARM_DESC(max_width, ++ "maximum allowed frame width [DEFAULT: " __stringify( ++ V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH) "]"); ++static int max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; ++module_param(max_height, int, S_IRUGO); ++MODULE_PARM_DESC(max_height, ++ "maximum allowed frame height [DEFAULT: " __stringify( ++ V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT) "]"); ++ ++static DEFINE_IDR(v4l2loopback_index_idr); ++static DEFINE_MUTEX(v4l2loopback_ctl_mutex); ++ ++/* frame intervals */ ++#define V4L2LOOPBACK_FPS_MIN 0 ++#define V4L2LOOPBACK_FPS_MAX 1000 ++ ++/* control IDs */ ++#define V4L2LOOPBACK_CID_BASE (V4L2_CID_USER_BASE | 0xf000) ++#define CID_KEEP_FORMAT (V4L2LOOPBACK_CID_BASE + 0) ++#define CID_SUSTAIN_FRAMERATE (V4L2LOOPBACK_CID_BASE + 1) ++#define CID_TIMEOUT (V4L2LOOPBACK_CID_BASE + 2) ++#define CID_TIMEOUT_IMAGE_IO (V4L2LOOPBACK_CID_BASE + 3) ++ ++static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl); ++static const struct v4l2_ctrl_ops v4l2loopback_ctrl_ops = { ++ .s_ctrl = v4l2loopback_s_ctrl, ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_keepformat = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_KEEP_FORMAT, ++ .name = "keep_format", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_sustainframerate = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_SUSTAIN_FRAMERATE, ++ .name = "sustain_framerate", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeout = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_TIMEOUT, ++ .name = "timeout", ++ .type = V4L2_CTRL_TYPE_INTEGER, ++ .min = 0, ++ .max = MAX_TIMEOUT, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeoutimageio = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_TIMEOUT_IMAGE_IO, ++ .name = "timeout_image_io", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++ ++/* module structures */ ++struct v4l2loopback_private { ++ int device_nr; ++}; ++ ++/* TODO(vasaka) use typenames which are common to kernel, but first find out if ++ * it is needed */ ++/* struct keeping state and settings of loopback device */ ++ ++struct v4l2l_buffer { ++ struct v4l2_buffer buffer; ++ struct list_head list_head; ++ int use_count; ++}; ++ ++struct v4l2_loopback_device { ++ struct v4l2_device v4l2_dev; ++ struct v4l2_ctrl_handler ctrl_handler; ++ struct video_device *vdev; ++ /* pixel and stream format */ ++ struct v4l2_pix_format pix_format; ++ struct v4l2_captureparm capture_param; ++ unsigned long frame_jiffies; ++ ++ /* ctrls */ ++ int keep_format; /* CID_KEEP_FORMAT; stay ready_for_capture even when all ++ openers close() the device */ ++ int sustain_framerate; /* CID_SUSTAIN_FRAMERATE; duplicate frames to maintain ++ (close to) nominal framerate */ ++ ++ /* buffers stuff */ ++ u8 *image; /* pointer to actual buffers data */ ++ unsigned long int imagesize; 
/* size of buffers data */ ++ int buffers_number; /* should not be big, 4 is a good choice */ ++ struct v4l2l_buffer buffers[MAX_BUFFERS]; /* inner driver buffers */ ++ int used_buffers; /* number of the actually used buffers */ ++ int max_openers; /* how many times can this device be opened */ ++ ++ int write_position; /* number of last written frame + 1 */ ++ struct list_head outbufs_list; /* buffers in output DQBUF order */ ++ int bufpos2index ++ [MAX_BUFFERS]; /* mapping of (read/write_position % used_buffers) ++ * to inner buffer index */ ++ long buffer_size; ++ ++ /* sustain_framerate stuff */ ++ struct timer_list sustain_timer; ++ unsigned int reread_count; ++ ++ /* timeout stuff */ ++ unsigned long timeout_jiffies; /* CID_TIMEOUT; 0 means disabled */ ++ int timeout_image_io; /* CID_TIMEOUT_IMAGE_IO; next opener will ++ * read/write to timeout_image */ ++ u8 *timeout_image; /* copy of it will be captured when timeout passes */ ++ struct v4l2l_buffer timeout_image_buffer; ++ struct timer_list timeout_timer; ++ int timeout_happened; ++ ++ /* sync stuff */ ++ atomic_t open_count; ++ ++ int ready_for_capture; /* set to the number of writers that opened the ++ * device and negotiated format. */ ++ int ready_for_output; /* set to true when no writer is currently attached ++ * this differs slightly from !ready_for_capture, ++ * e.g. when using fallback images */ ++ int active_readers; /* increase if any reader starts streaming */ ++ int announce_all_caps; /* set to false, if device caps (OUTPUT/CAPTURE) ++ * should only be announced if the resp. "ready" ++ * flag is set; default=TRUE */ ++ ++ int max_width; ++ int max_height; ++ ++ char card_label[32]; ++ ++ wait_queue_head_t read_event; ++ spinlock_t lock; ++}; ++ ++/* types of opener shows what opener wants to do with loopback */ ++enum opener_type { ++ // clang-format off ++ UNNEGOTIATED = 0, ++ READER = 1, ++ WRITER = 2, ++ // clang-format on ++}; ++ ++/* struct keeping state and type of opener */ ++struct v4l2_loopback_opener { ++ enum opener_type type; ++ int read_position; /* number of last processed frame + 1 or ++ * write_position - 1 if reader went out of sync */ ++ unsigned int reread_count; ++ struct v4l2_buffer *buffers; ++ int buffers_number; /* should not be big, 4 is a good choice */ ++ int timeout_image_io; ++ ++ struct v4l2_fh fh; ++}; ++ ++#define fh_to_opener(ptr) container_of((ptr), struct v4l2_loopback_opener, fh) ++ ++/* this is heavily inspired by the bttv driver found in the linux kernel */ ++struct v4l2l_format { ++ char *name; ++ int fourcc; /* video4linux 2 */ ++ int depth; /* bit/pixel */ ++ int flags; ++}; ++/* set the v4l2l_format.flags to PLANAR for non-packed formats */ ++#define FORMAT_FLAGS_PLANAR 0x01 ++#define FORMAT_FLAGS_COMPRESSED 0x02 ++ ++#include "v4l2loopback_formats.h" ++ ++static const unsigned int FORMATS = ARRAY_SIZE(formats); ++ ++static char *fourcc2str(unsigned int fourcc, char buf[4]) ++{ ++ buf[0] = (fourcc >> 0) & 0xFF; ++ buf[1] = (fourcc >> 8) & 0xFF; ++ buf[2] = (fourcc >> 16) & 0xFF; ++ buf[3] = (fourcc >> 24) & 0xFF; ++ ++ return buf; ++} ++ ++static const struct v4l2l_format *format_by_fourcc(int fourcc) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < FORMATS; i++) { ++ if (formats[i].fourcc == fourcc) ++ return formats + i; ++ } ++ ++ dprintk("unsupported format '%c%c%c%c'\n", (fourcc >> 0) & 0xFF, ++ (fourcc >> 8) & 0xFF, (fourcc >> 16) & 0xFF, ++ (fourcc >> 24) & 0xFF); ++ return NULL; ++} ++ ++static void pix_format_set_size(struct v4l2_pix_format *f, ++ const struct v4l2l_format 
*fmt, ++ unsigned int width, unsigned int height) ++{ ++ f->width = width; ++ f->height = height; ++ ++ if (fmt->flags & FORMAT_FLAGS_PLANAR) { ++ f->bytesperline = width; /* Y plane */ ++ f->sizeimage = (width * height * fmt->depth) >> 3; ++ } else if (fmt->flags & FORMAT_FLAGS_COMPRESSED) { ++ /* doesn't make sense for compressed formats */ ++ f->bytesperline = 0; ++ f->sizeimage = (width * height * fmt->depth) >> 3; ++ } else { ++ f->bytesperline = (width * fmt->depth) >> 3; ++ f->sizeimage = height * f->bytesperline; ++ } ++} ++ ++static int set_timeperframe(struct v4l2_loopback_device *dev, ++ struct v4l2_fract *tpf) ++{ ++ if ((tpf->denominator < 1) || (tpf->numerator < 1)) { ++ return -EINVAL; ++ } ++ dev->capture_param.timeperframe = *tpf; ++ dev->frame_jiffies = max(1UL, msecs_to_jiffies(1000) * tpf->numerator / ++ tpf->denominator); ++ return 0; ++} ++ ++static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd); ++ ++/* device attributes */ ++/* available via sysfs: /sys/devices/virtual/video4linux/video* */ ++ ++static ssize_t attr_show_format(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ /* gets the current format as "FOURCC:WxH@f/s", e.g. "YUYV:320x240@1000/30" */ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ const struct v4l2_fract *tpf; ++ char buf4cc[5], buf_fps[32]; ++ ++ if (!dev || !dev->ready_for_capture) ++ return 0; ++ tpf = &dev->capture_param.timeperframe; ++ ++ fourcc2str(dev->pix_format.pixelformat, buf4cc); ++ buf4cc[4] = 0; ++ if (tpf->numerator == 1) ++ snprintf(buf_fps, sizeof(buf_fps), "%d", tpf->denominator); ++ else ++ snprintf(buf_fps, sizeof(buf_fps), "%d/%d", tpf->denominator, ++ tpf->numerator); ++ return sprintf(buf, "%4s:%dx%d@%s\n", buf4cc, dev->pix_format.width, ++ dev->pix_format.height, buf_fps); ++} ++ ++static ssize_t attr_store_format(struct device *cd, ++ struct device_attribute *attr, const char *buf, ++ size_t len) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ int fps_num = 0, fps_den = 1; ++ ++ if (!dev) ++ return -ENODEV; ++ ++ /* only fps changing is supported */ ++ if (sscanf(buf, "@%d/%d", &fps_num, &fps_den) > 0) { ++ struct v4l2_fract f = { .numerator = fps_den, ++ .denominator = fps_num }; ++ int err = 0; ++ if ((err = set_timeperframe(dev, &f)) < 0) ++ return err; ++ return len; ++ } ++ return -EINVAL; ++} ++ ++static DEVICE_ATTR(format, S_IRUGO | S_IWUSR, attr_show_format, ++ attr_store_format); ++ ++static ssize_t attr_show_buffers(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ return sprintf(buf, "%d\n", dev->used_buffers); ++} ++ ++static DEVICE_ATTR(buffers, S_IRUGO, attr_show_buffers, NULL); ++ ++static ssize_t attr_show_maxopeners(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ return sprintf(buf, "%d\n", dev->max_openers); ++} ++ ++static ssize_t attr_store_maxopeners(struct device *cd, ++ struct device_attribute *attr, ++ const char *buf, size_t len) ++{ ++ struct v4l2_loopback_device *dev = NULL; ++ unsigned long curr = 0; ++ ++ if (kstrtoul(buf, 0, &curr)) ++ return -EINVAL; ++ ++ dev = v4l2loopback_cd2dev(cd); ++ if (!dev) ++ return -ENODEV; ++ ++ if (dev->max_openers == curr) ++ return len; ++ ++ if (curr > __INT_MAX__ || dev->open_count.counter > curr) { ++ /* request to limit to less 
openers as are currently attached to us */ ++ return -EINVAL; ++ } ++ ++ dev->max_openers = (int)curr; ++ ++ return len; ++} ++ ++static DEVICE_ATTR(max_openers, S_IRUGO | S_IWUSR, attr_show_maxopeners, ++ attr_store_maxopeners); ++ ++static void v4l2loopback_remove_sysfs(struct video_device *vdev) ++{ ++#define V4L2_SYSFS_DESTROY(x) device_remove_file(&vdev->dev, &dev_attr_##x) ++ ++ if (vdev) { ++ V4L2_SYSFS_DESTROY(format); ++ V4L2_SYSFS_DESTROY(buffers); ++ V4L2_SYSFS_DESTROY(max_openers); ++ /* ... */ ++ } ++} ++ ++static void v4l2loopback_create_sysfs(struct video_device *vdev) ++{ ++ int res = 0; ++ ++#define V4L2_SYSFS_CREATE(x) \ ++ res = device_create_file(&vdev->dev, &dev_attr_##x); \ ++ if (res < 0) \ ++ break ++ if (!vdev) ++ return; ++ do { ++ V4L2_SYSFS_CREATE(format); ++ V4L2_SYSFS_CREATE(buffers); ++ V4L2_SYSFS_CREATE(max_openers); ++ /* ... */ ++ } while (0); ++ ++ if (res >= 0) ++ return; ++ dev_err(&vdev->dev, "%s error: %d\n", __func__, res); ++} ++ ++/* Event APIs */ ++ ++#define V4L2LOOPBACK_EVENT_BASE (V4L2_EVENT_PRIVATE_START) ++#define V4L2LOOPBACK_EVENT_OFFSET 0x08E00000 ++#define V4L2_EVENT_PRI_CLIENT_USAGE \ ++ (V4L2LOOPBACK_EVENT_BASE + V4L2LOOPBACK_EVENT_OFFSET + 1) ++ ++struct v4l2_event_client_usage { ++ __u32 count; ++}; ++ ++/* global module data */ ++/* find a device based on it's device-number (e.g. '3' for /dev/video3) */ ++struct v4l2loopback_lookup_cb_data { ++ int device_nr; ++ struct v4l2_loopback_device *device; ++}; ++static int v4l2loopback_lookup_cb(int id, void *ptr, void *data) ++{ ++ struct v4l2_loopback_device *device = ptr; ++ struct v4l2loopback_lookup_cb_data *cbdata = data; ++ if (cbdata && device && device->vdev) { ++ if (device->vdev->num == cbdata->device_nr) { ++ cbdata->device = device; ++ cbdata->device_nr = id; ++ return 1; ++ } ++ } ++ return 0; ++} ++static int v4l2loopback_lookup(int device_nr, ++ struct v4l2_loopback_device **device) ++{ ++ struct v4l2loopback_lookup_cb_data data = { ++ .device_nr = device_nr, ++ .device = NULL, ++ }; ++ int err = idr_for_each(&v4l2loopback_index_idr, &v4l2loopback_lookup_cb, ++ &data); ++ if (1 == err) { ++ if (device) ++ *device = data.device; ++ return data.device_nr; ++ } ++ return -ENODEV; ++} ++static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd) ++{ ++ struct video_device *loopdev = to_video_device(cd); ++ struct v4l2loopback_private *ptr = ++ (struct v4l2loopback_private *)video_get_drvdata(loopdev); ++ int nr = ptr->device_nr; ++ ++ return idr_find(&v4l2loopback_index_idr, nr); ++} ++ ++static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f) ++{ ++ struct v4l2loopback_private *ptr = video_drvdata(f); ++ int nr = ptr->device_nr; ++ ++ return idr_find(&v4l2loopback_index_idr, nr); ++} ++ ++/* forward declarations */ ++static void client_usage_queue_event(struct video_device *vdev); ++static void init_buffers(struct v4l2_loopback_device *dev); ++static int allocate_buffers(struct v4l2_loopback_device *dev); ++static void free_buffers(struct v4l2_loopback_device *dev); ++static void try_free_buffers(struct v4l2_loopback_device *dev); ++static int allocate_timeout_image(struct v4l2_loopback_device *dev); ++static void check_timers(struct v4l2_loopback_device *dev); ++static const struct v4l2_file_operations v4l2_loopback_fops; ++static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops; ++ ++/* Queue helpers */ ++/* next functions sets buffer flags and adjusts counters accordingly */ ++static inline void set_done(struct v4l2l_buffer *buffer) ++{ 
++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; ++ buffer->buffer.flags |= V4L2_BUF_FLAG_DONE; ++} ++ ++static inline void set_queued(struct v4l2l_buffer *buffer) ++{ ++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; ++ buffer->buffer.flags |= V4L2_BUF_FLAG_QUEUED; ++} ++ ++static inline void unset_flags(struct v4l2l_buffer *buffer) ++{ ++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; ++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; ++} ++ ++/* V4L2 ioctl caps and params calls */ ++/* returns device capabilities ++ * called on VIDIOC_QUERYCAP ++ */ ++static int vidioc_querycap(struct file *file, void *priv, ++ struct v4l2_capability *cap) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ int device_nr = ++ ((struct v4l2loopback_private *)video_get_drvdata(dev->vdev)) ++ ->device_nr; ++ __u32 capabilities = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE; ++ ++ strlcpy(cap->driver, "v4l2 loopback", sizeof(cap->driver)); ++ snprintf(cap->card, sizeof(cap->card), "%s", dev->card_label); ++ snprintf(cap->bus_info, sizeof(cap->bus_info), ++ "platform:v4l2loopback-%03d", device_nr); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 1, 0) ++ /* since 3.1.0, the v4l2-core system is supposed to set the version */ ++ cap->version = V4L2LOOPBACK_VERSION_CODE; ++#endif ++ ++ if (dev->announce_all_caps) { ++ capabilities |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT; ++ } else { ++ if (dev->ready_for_capture) { ++ capabilities |= V4L2_CAP_VIDEO_CAPTURE; ++ } ++ if (dev->ready_for_output) { ++ capabilities |= V4L2_CAP_VIDEO_OUTPUT; ++ } ++ } ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) ++ dev->vdev->device_caps = ++#endif /* >=linux-4.7.0 */ ++ cap->device_caps = cap->capabilities = capabilities; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0) ++ cap->capabilities |= V4L2_CAP_DEVICE_CAPS; ++#endif ++ ++ memset(cap->reserved, 0, sizeof(cap->reserved)); ++ return 0; ++} ++ ++static int vidioc_enum_framesizes(struct file *file, void *fh, ++ struct v4l2_frmsizeenum *argp) ++{ ++ struct v4l2_loopback_device *dev; ++ ++ /* there can be only one... */ ++ if (argp->index) ++ return -EINVAL; ++ ++ dev = v4l2loopback_getdevice(file); ++ if (dev->ready_for_capture) { ++ /* format has already been negotiated ++ * cannot change during runtime ++ */ ++ if (argp->pixel_format != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; ++ ++ argp->discrete.width = dev->pix_format.width; ++ argp->discrete.height = dev->pix_format.height; ++ } else { ++ /* if the format has not been negotiated yet, we accept anything ++ */ ++ if (NULL == format_by_fourcc(argp->pixel_format)) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMSIZE_TYPE_CONTINUOUS; ++ ++ argp->stepwise.min_width = V4L2LOOPBACK_SIZE_MIN_WIDTH; ++ argp->stepwise.min_height = V4L2LOOPBACK_SIZE_MIN_HEIGHT; ++ ++ argp->stepwise.max_width = dev->max_width; ++ argp->stepwise.max_height = dev->max_height; ++ ++ argp->stepwise.step_width = 1; ++ argp->stepwise.step_height = 1; ++ } ++ return 0; ++} ++ ++/* returns frameinterval (fps) for the set resolution ++ * called on VIDIOC_ENUM_FRAMEINTERVALS ++ */ ++static int vidioc_enum_frameintervals(struct file *file, void *fh, ++ struct v4l2_frmivalenum *argp) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ ++ /* there can be only one... 
*/ ++ if (argp->index) ++ return -EINVAL; ++ ++ if (dev->ready_for_capture) { ++ if (argp->width != dev->pix_format.width || ++ argp->height != dev->pix_format.height || ++ argp->pixel_format != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMIVAL_TYPE_DISCRETE; ++ argp->discrete = dev->capture_param.timeperframe; ++ } else { ++ if (argp->width < V4L2LOOPBACK_SIZE_MIN_WIDTH || ++ argp->width > max_width || ++ argp->height < V4L2LOOPBACK_SIZE_MIN_HEIGHT || ++ argp->height > max_height || ++ NULL == format_by_fourcc(argp->pixel_format)) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMIVAL_TYPE_CONTINUOUS; ++ argp->stepwise.min.numerator = 1; ++ argp->stepwise.min.denominator = V4L2LOOPBACK_FPS_MAX; ++ argp->stepwise.max.numerator = 1; ++ argp->stepwise.max.denominator = V4L2LOOPBACK_FPS_MIN; ++ argp->stepwise.step.numerator = 1; ++ argp->stepwise.step.denominator = 1; ++ } ++ ++ return 0; ++} ++ ++/* ------------------ CAPTURE ----------------------- */ ++ ++/* returns device formats ++ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_enum_fmt_cap(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ if (f->index) ++ return -EINVAL; ++ if (dev->ready_for_capture) { ++ const __u32 format = dev->pix_format.pixelformat; ++ ++ snprintf(f->description, sizeof(f->description), "[%c%c%c%c]", ++ (format >> 0) & 0xFF, (format >> 8) & 0xFF, ++ (format >> 16) & 0xFF, (format >> 24) & 0xFF); ++ ++ f->pixelformat = dev->pix_format.pixelformat; ++ } else { ++ return -EINVAL; ++ } ++ f->flags = 0; ++ MARK(); ++ return 0; ++} ++ ++/* returns current video format ++ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_g_fmt_cap(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ if (!dev->ready_for_capture) ++ return -EINVAL; ++ ++ fmt->fmt.pix = dev->pix_format; ++ MARK(); ++ return 0; ++} ++ ++/* checks if it is OK to change to format fmt; ++ * actual check is done by inner_try_fmt_cap ++ * just checking that pixelformat is OK and set other parameters, app should ++ * obey this decision ++ * called on VIDIOC_TRY_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_try_fmt_cap(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ char buf[5]; ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ if (0 == dev->ready_for_capture) { ++ dprintk("setting fmt_cap not possible yet\n"); ++ return -EBUSY; ++ } ++ ++ if (fmt->fmt.pix.pixelformat != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ fmt->fmt.pix = dev->pix_format; ++ ++ buf[4] = 0; ++ dprintk("capFOURCC=%s\n", fourcc2str(dev->pix_format.pixelformat, buf)); ++ return 0; ++} ++ ++/* sets new output format, if possible ++ * actually format is set by input and we even do not check it, just return ++ * current one, but it is possible to set subregions of input TODO(vasaka) ++ * called on VIDIOC_S_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_s_fmt_cap(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return vidioc_try_fmt_cap(file, priv, fmt); ++} ++ ++/* ------------------ OUTPUT ----------------------- */ ++ ++/* returns device formats; ++ * LATER: allow all formats ++ * called on 
VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_enum_fmt_out(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ struct v4l2_loopback_device *dev; ++ const struct v4l2l_format *fmt; ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ if (dev->ready_for_capture) { ++ const __u32 format = dev->pix_format.pixelformat; ++ ++ /* format has been fixed by the writer, so only one single format is supported */ ++ if (f->index) ++ return -EINVAL; ++ ++ fmt = format_by_fourcc(format); ++ if (NULL == fmt) ++ return -EINVAL; ++ ++ /* f->flags = ??; */ ++ snprintf(f->description, sizeof(f->description), "%s", ++ fmt->name); ++ ++ f->pixelformat = dev->pix_format.pixelformat; ++ } else { ++ /* fill in a dummy format */ ++ /* coverity[unsigned_compare] */ ++ if (f->index < 0 || f->index >= FORMATS) ++ return -EINVAL; ++ ++ fmt = &formats[f->index]; ++ ++ f->pixelformat = fmt->fourcc; ++ snprintf(f->description, sizeof(f->description), "%s", ++ fmt->name); ++ } ++ f->flags = 0; ++ ++ return 0; ++} ++ ++/* returns current video format format fmt */ ++/* NOTE: this is called from the producer ++ * so if format has not been negotiated yet, ++ * it should return ALL of available formats, ++ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_g_fmt_out(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ /* ++ * LATER: this should return the currently valid format ++ * gstreamer doesn't like it, if this returns -EINVAL, as it ++ * then concludes that there is _no_ valid format ++ * CHECK whether this assumption is wrong, ++ * or whether we have to always provide a valid format ++ */ ++ ++ fmt->fmt.pix = dev->pix_format; ++ return 0; ++} ++ ++/* checks if it is OK to change to format fmt; ++ * if format is negotiated do not change it ++ * called on VIDIOC_TRY_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_try_fmt_out(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ /* TODO(vasaka) loopback does not care about formats writer want to set, ++ * maybe it is a good idea to restrict format somehow */ ++ if (dev->ready_for_capture) { ++ fmt->fmt.pix = dev->pix_format; ++ } else { ++ __u32 w = fmt->fmt.pix.width; ++ __u32 h = fmt->fmt.pix.height; ++ __u32 pixfmt = fmt->fmt.pix.pixelformat; ++ const struct v4l2l_format *format = format_by_fourcc(pixfmt); ++ ++ if (w > dev->max_width) ++ w = dev->max_width; ++ if (h > dev->max_height) ++ h = dev->max_height; ++ ++ dprintk("trying image %dx%d\n", w, h); ++ ++ if (w < 1) ++ w = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; ++ ++ if (h < 1) ++ h = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; ++ ++ if (NULL == format) ++ format = &formats[0]; ++ ++ pix_format_set_size(&fmt->fmt.pix, format, w, h); ++ ++ fmt->fmt.pix.pixelformat = format->fourcc; ++ ++ if ((fmt->fmt.pix.colorspace == V4L2_COLORSPACE_DEFAULT) || ++ (fmt->fmt.pix.colorspace > V4L2_COLORSPACE_DCI_P3)) ++ fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB; ++ ++ if (V4L2_FIELD_ANY == fmt->fmt.pix.field) ++ fmt->fmt.pix.field = V4L2_FIELD_NONE; ++ ++ /* FIXXME: try_fmt should never modify the device-state */ ++ dev->pix_format = fmt->fmt.pix; ++ } ++ return 0; ++} ++ ++/* sets new output format, if possible; ++ * allocate data here because we do not know if it will be streaming or ++ * 
read/write IO ++ * called on VIDIOC_S_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_s_fmt_out(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ char buf[5]; ++ int ret; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ret = vidioc_try_fmt_out(file, priv, fmt); ++ ++ dprintk("s_fmt_out(%d) %d...%d\n", ret, dev->ready_for_capture, ++ dev->pix_format.sizeimage); ++ ++ buf[4] = 0; ++ dprintk("outFOURCC=%s\n", fourcc2str(dev->pix_format.pixelformat, buf)); ++ ++ if (ret < 0) ++ return ret; ++ ++ if (!dev->ready_for_capture) { ++ dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage); ++ fmt->fmt.pix.sizeimage = dev->buffer_size; ++ ret = allocate_buffers(dev); ++ } ++ return ret; ++} ++ ++// #define V4L2L_OVERLAY ++#ifdef V4L2L_OVERLAY ++/* ------------------ OVERLAY ----------------------- */ ++/* currently unsupported */ ++/* GSTreamer's v4l2sink is buggy, as it requires the overlay to work ++ * while it should only require it, if overlay is requested ++ * once the gstreamer element is fixed, remove the overlay dummies ++ */ ++#warning OVERLAY dummies ++static int vidioc_g_fmt_overlay(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return 0; ++} ++ ++static int vidioc_s_fmt_overlay(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return 0; ++} ++#endif /* V4L2L_OVERLAY */ ++ ++/* ------------------ PARAMs ----------------------- */ ++ ++/* get some data flow parameters, only capability, fps and readbuffers has ++ * effect on this driver ++ * called on VIDIOC_G_PARM ++ */ ++static int vidioc_g_parm(struct file *file, void *priv, ++ struct v4l2_streamparm *parm) ++{ ++ /* do not care about type of opener, hope these enums would always be ++ * compatible */ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ parm->parm.capture = dev->capture_param; ++ return 0; ++} ++ ++/* get some data flow parameters, only capability, fps and readbuffers has ++ * effect on this driver ++ * called on VIDIOC_S_PARM ++ */ ++static int vidioc_s_parm(struct file *file, void *priv, ++ struct v4l2_streamparm *parm) ++{ ++ struct v4l2_loopback_device *dev; ++ int err = 0; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ dprintk("vidioc_s_parm called frate=%d/%d\n", ++ parm->parm.capture.timeperframe.numerator, ++ parm->parm.capture.timeperframe.denominator); ++ ++ switch (parm->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if ((err = set_timeperframe( ++ dev, &parm->parm.capture.timeperframe)) < 0) ++ return err; ++ break; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if ((err = set_timeperframe( ++ dev, &parm->parm.capture.timeperframe)) < 0) ++ return err; ++ break; ++ default: ++ return -1; ++ } ++ ++ parm->parm.capture = dev->capture_param; ++ return 0; ++} ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++/* sets a tv standard, actually we do not need to handle this any special way ++ * added to support effecttv ++ * called on VIDIOC_S_STD ++ */ ++static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id *_std) ++{ ++ v4l2_std_id req_std = 0, supported_std = 0; ++ const v4l2_std_id all_std = V4L2_STD_ALL, no_std = 0; ++ ++ if (_std) { ++ req_std = *_std; ++ *_std = all_std; ++ } ++ ++ /* we support everything in V4L2_STD_ALL, but not more... 
*/ ++ supported_std = (all_std & req_std); ++ if (no_std == supported_std) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* gets a fake video standard ++ * called on VIDIOC_G_STD ++ */ ++static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm) ++{ ++ if (norm) ++ *norm = V4L2_STD_ALL; ++ return 0; ++} ++/* gets a fake video standard ++ * called on VIDIOC_QUERYSTD ++ */ ++static int vidioc_querystd(struct file *file, void *fh, v4l2_std_id *norm) ++{ ++ if (norm) ++ *norm = V4L2_STD_ALL; ++ return 0; ++} ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++static int v4l2loopback_set_ctrl(struct v4l2_loopback_device *dev, u32 id, ++ s64 val) ++{ ++ switch (id) { ++ case CID_KEEP_FORMAT: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ dev->keep_format = val; ++ try_free_buffers( ++ dev); /* will only free buffers if !keep_format */ ++ break; ++ case CID_SUSTAIN_FRAMERATE: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ spin_lock_bh(&dev->lock); ++ dev->sustain_framerate = val; ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++ break; ++ case CID_TIMEOUT: ++ if (val < 0 || val > MAX_TIMEOUT) ++ return -EINVAL; ++ spin_lock_bh(&dev->lock); ++ dev->timeout_jiffies = msecs_to_jiffies(val); ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++ allocate_timeout_image(dev); ++ break; ++ case CID_TIMEOUT_IMAGE_IO: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ dev->timeout_image_io = val; ++ break; ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl) ++{ ++ struct v4l2_loopback_device *dev = container_of( ++ ctrl->handler, struct v4l2_loopback_device, ctrl_handler); ++ return v4l2loopback_set_ctrl(dev, ctrl->id, ctrl->val); ++} ++ ++/* returns set of device outputs, in our case there is only one ++ * called on VIDIOC_ENUMOUTPUT ++ */ ++static int vidioc_enum_output(struct file *file, void *fh, ++ struct v4l2_output *outp) ++{ ++ __u32 index = outp->index; ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ MARK(); ++ ++ if (!dev->announce_all_caps && !dev->ready_for_output) ++ return -ENOTTY; ++ ++ if (0 != index) ++ return -EINVAL; ++ ++ /* clear all data (including the reserved fields) */ ++ memset(outp, 0, sizeof(*outp)); ++ ++ outp->index = index; ++ strlcpy(outp->name, "loopback in", sizeof(outp->name)); ++ outp->type = V4L2_OUTPUT_TYPE_ANALOG; ++ outp->audioset = 0; ++ outp->modulator = 0; ++#ifdef V4L2LOOPBACK_WITH_STD ++ outp->std = V4L2_STD_ALL; ++#ifdef V4L2_OUT_CAP_STD ++ outp->capabilities |= V4L2_OUT_CAP_STD; ++#endif /* V4L2_OUT_CAP_STD */ ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ return 0; ++} ++ ++/* which output is currently active, ++ * called on VIDIOC_G_OUTPUT ++ */ ++static int vidioc_g_output(struct file *file, void *fh, unsigned int *i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_output) ++ return -ENOTTY; ++ if (i) ++ *i = 0; ++ return 0; ++} ++ ++/* set output, can make sense if we have more than one video src, ++ * called on VIDIOC_S_OUTPUT ++ */ ++static int vidioc_s_output(struct file *file, void *fh, unsigned int i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_output) ++ return -ENOTTY; ++ ++ if (i) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* returns set of device inputs, in our case there is only one, ++ * but later I may add more ++ * called on VIDIOC_ENUMINPUT ++ */ ++static int vidioc_enum_input(struct file *file, void 
*fh, ++ struct v4l2_input *inp) ++{ ++ __u32 index = inp->index; ++ MARK(); ++ ++ if (0 != index) ++ return -EINVAL; ++ ++ /* clear all data (including the reserved fields) */ ++ memset(inp, 0, sizeof(*inp)); ++ ++ inp->index = index; ++ strlcpy(inp->name, "loopback", sizeof(inp->name)); ++ inp->type = V4L2_INPUT_TYPE_CAMERA; ++ inp->audioset = 0; ++ inp->tuner = 0; ++ inp->status = 0; ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ inp->std = V4L2_STD_ALL; ++#ifdef V4L2_IN_CAP_STD ++ inp->capabilities |= V4L2_IN_CAP_STD; ++#endif ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ return 0; ++} ++ ++/* which input is currently active, ++ * called on VIDIOC_G_INPUT ++ */ ++static int vidioc_g_input(struct file *file, void *fh, unsigned int *i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_capture) ++ return -ENOTTY; ++ if (i) ++ *i = 0; ++ return 0; ++} ++ ++/* set input, can make sense if we have more than one video src, ++ * called on VIDIOC_S_INPUT ++ */ ++static int vidioc_s_input(struct file *file, void *fh, unsigned int i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_capture) ++ return -ENOTTY; ++ if (i == 0) ++ return 0; ++ return -EINVAL; ++} ++ ++/* --------------- V4L2 ioctl buffer related calls ----------------- */ ++ ++/* negotiate buffer type ++ * only mmap streaming supported ++ * called on VIDIOC_REQBUFS ++ */ ++static int vidioc_reqbufs(struct file *file, void *fh, ++ struct v4l2_requestbuffers *b) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ int i; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ dprintk("reqbufs: %d\t%d=%d\n", b->memory, b->count, ++ dev->buffers_number); ++ if (opener->timeout_image_io) { ++ if (b->memory != V4L2_MEMORY_MMAP) ++ return -EINVAL; ++ b->count = 1; ++ return 0; ++ } ++ ++ init_buffers(dev); ++ switch (b->memory) { ++ case V4L2_MEMORY_MMAP: ++ /* do nothing here, buffers are always allocated */ ++ if (b->count < 1 || dev->buffers_number < 1) ++ return 0; ++ ++ if (b->count > dev->buffers_number) ++ b->count = dev->buffers_number; ++ ++ /* make sure that outbufs_list contains buffers from 0 to used_buffers-1 ++ * actually, it will have been already populated via v4l2_loopback_init() ++ * at this point */ ++ if (list_empty(&dev->outbufs_list)) { ++ for (i = 0; i < dev->used_buffers; ++i) ++ list_add_tail(&dev->buffers[i].list_head, ++ &dev->outbufs_list); ++ } ++ ++ /* also, if dev->used_buffers is going to be decreased, we should remove ++ * out-of-range buffers from outbufs_list, and fix bufpos2index mapping */ ++ if (b->count < dev->used_buffers) { ++ struct v4l2l_buffer *pos, *n; ++ ++ list_for_each_entry_safe(pos, n, &dev->outbufs_list, ++ list_head) { ++ if (pos->buffer.index >= b->count) ++ list_del(&pos->list_head); ++ } ++ ++ /* after we update dev->used_buffers, buffers in outbufs_list will ++ * correspond to dev->write_position + [0;b->count-1] range */ ++ i = dev->write_position; ++ list_for_each_entry(pos, &dev->outbufs_list, ++ list_head) { ++ dev->bufpos2index[mod_inc(&i, b->count)] = ++ pos->buffer.index; ++ } ++ } ++ ++ opener->buffers_number = b->count; ++ if (opener->buffers_number < dev->used_buffers) ++ dev->used_buffers = opener->buffers_number; ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++/* returns buffer asked for; ++ * give app as many buffers as it wants, if it less than MAX, ++ * but map them in 
our inner buffers ++ * called on VIDIOC_QUERYBUF ++ */ ++static int vidioc_querybuf(struct file *file, void *fh, struct v4l2_buffer *b) ++{ ++ enum v4l2_buf_type type; ++ int index; ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ ++ MARK(); ++ ++ type = b->type; ++ index = b->index; ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ if ((b->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) && ++ (b->type != V4L2_BUF_TYPE_VIDEO_OUTPUT)) { ++ return -EINVAL; ++ } ++ if (b->index > max_buffers) ++ return -EINVAL; ++ ++ if (opener->timeout_image_io) ++ *b = dev->timeout_image_buffer.buffer; ++ else ++ *b = dev->buffers[b->index % dev->used_buffers].buffer; ++ ++ b->type = type; ++ b->index = index; ++ dprintkrw("buffer type: %d (of %d with size=%ld)\n", b->memory, ++ dev->buffers_number, dev->buffer_size); ++ ++ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' ++ https://github.com/umlaeute/v4l2loopback/issues/60 */ ++ b->flags &= ~V4L2_BUF_FLAG_DONE; ++ b->flags |= V4L2_BUF_FLAG_QUEUED; ++ ++ return 0; ++} ++ ++static void buffer_written(struct v4l2_loopback_device *dev, ++ struct v4l2l_buffer *buf) ++{ ++ del_timer_sync(&dev->sustain_timer); ++ del_timer_sync(&dev->timeout_timer); ++ spin_lock_bh(&dev->lock); ++ ++ dev->bufpos2index[mod_inc(&dev->write_position, dev->used_buffers)] = ++ buf->buffer.index; ++ list_move_tail(&buf->list_head, &dev->outbufs_list); ++ dev->reread_count = 0; ++ ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++} ++ ++/* put buffer to queue ++ * called on VIDIOC_QBUF ++ */ ++static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ struct v4l2l_buffer *b; ++ int index; ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ if (buf->index > max_buffers) ++ return -EINVAL; ++ if (opener->timeout_image_io) ++ return 0; ++ ++ index = buf->index % dev->used_buffers; ++ b = &dev->buffers[index]; ++ ++ switch (buf->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ dprintkrw("capture QBUF index: %d\n", index); ++ set_queued(b); ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ dprintkrw("output QBUF pos: %d index: %d\n", ++ dev->write_position, index); ++ if (buf->timestamp.tv_sec == 0 && buf->timestamp.tv_usec == 0) ++ v4l2l_get_timestamp(&b->buffer); ++ else ++ b->buffer.timestamp = buf->timestamp; ++ b->buffer.bytesused = buf->bytesused; ++ set_done(b); ++ buffer_written(dev, b); ++ ++ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' ++ https://github.com/umlaeute/v4l2loopback/issues/60 */ ++ buf->flags &= ~V4L2_BUF_FLAG_DONE; ++ buf->flags |= V4L2_BUF_FLAG_QUEUED; ++ ++ wake_up_all(&dev->read_event); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++static int can_read(struct v4l2_loopback_device *dev, ++ struct v4l2_loopback_opener *opener) ++{ ++ int ret; ++ ++ spin_lock_bh(&dev->lock); ++ check_timers(dev); ++ ret = dev->write_position > opener->read_position || ++ dev->reread_count > opener->reread_count || dev->timeout_happened; ++ spin_unlock_bh(&dev->lock); ++ return ret; ++} ++ ++static int get_capture_buffer(struct file *file) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); ++ int pos, ret; ++ int timeout_happened; ++ ++ if ((file->f_flags & O_NONBLOCK) && ++ (dev->write_position <= opener->read_position && ++ dev->reread_count 
<= opener->reread_count && ++ !dev->timeout_happened)) ++ return -EAGAIN; ++ wait_event_interruptible(dev->read_event, can_read(dev, opener)); ++ ++ spin_lock_bh(&dev->lock); ++ if (dev->write_position == opener->read_position) { ++ if (dev->reread_count > opener->reread_count + 2) ++ opener->reread_count = dev->reread_count - 1; ++ ++opener->reread_count; ++ pos = (opener->read_position + dev->used_buffers - 1) % ++ dev->used_buffers; ++ } else { ++ opener->reread_count = 0; ++ if (dev->write_position > ++ opener->read_position + dev->used_buffers) ++ opener->read_position = dev->write_position - 1; ++ pos = mod_inc(&opener->read_position, dev->used_buffers); ++ } ++ timeout_happened = dev->timeout_happened; ++ dev->timeout_happened = 0; ++ spin_unlock_bh(&dev->lock); ++ ++ ret = dev->bufpos2index[pos]; ++ if (timeout_happened) { ++ if (ret < 0) { ++ dprintk("trying to return not mapped buf[%d]\n", ret); ++ return -EFAULT; ++ } ++ /* although allocated on-demand, timeout_image is freed only ++ * in free_buffers(), so we don't need to worry about it being ++ * deallocated suddenly */ ++ memcpy(dev->image + dev->buffers[ret].buffer.m.offset, ++ dev->timeout_image, dev->buffer_size); ++ } ++ return ret; ++} ++ ++/* put buffer to dequeue ++ * called on VIDIOC_DQBUF ++ */ ++static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ int index; ++ struct v4l2l_buffer *b; ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ if (opener->timeout_image_io) { ++ *buf = dev->timeout_image_buffer.buffer; ++ return 0; ++ } ++ ++ switch (buf->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ index = get_capture_buffer(file); ++ if (index < 0) ++ return index; ++ dprintkrw("capture DQBUF pos: %d index: %d\n", ++ opener->read_position - 1, index); ++ if (!(dev->buffers[index].buffer.flags & ++ V4L2_BUF_FLAG_MAPPED)) { ++ dprintk("trying to return not mapped buf[%d]\n", index); ++ return -EINVAL; ++ } ++ unset_flags(&dev->buffers[index]); ++ *buf = dev->buffers[index].buffer; ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ b = list_entry(dev->outbufs_list.prev, struct v4l2l_buffer, ++ list_head); ++ list_move_tail(&b->list_head, &dev->outbufs_list); ++ dprintkrw("output DQBUF index: %d\n", b->buffer.index); ++ unset_flags(b); ++ *buf = b->buffer; ++ buf->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++/* ------------- STREAMING ------------------- */ ++ ++/* start streaming ++ * called on VIDIOC_STREAMON ++ */ ++static int vidioc_streamon(struct file *file, void *fh, enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (!dev->ready_for_capture) { ++ int ret = allocate_buffers(dev); ++ if (ret < 0) ++ return ret; ++ } ++ opener->type = WRITER; ++ dev->ready_for_output = 0; ++ dev->ready_for_capture++; ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (!dev->ready_for_capture) ++ return -EIO; ++ opener->type = READER; ++ dev->active_readers++; ++ client_usage_queue_event(dev->vdev); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ return -EINVAL; ++} ++ ++/* stop streaming ++ * called on VIDIOC_STREAMOFF ++ */ ++static int vidioc_streamoff(struct file *file, void *fh, ++ enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev; 
++ struct v4l2_loopback_opener *opener; ++ ++ MARK(); ++ dprintk("%d\n", type); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (dev->ready_for_capture > 0) ++ dev->ready_for_capture--; ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (opener->type == READER) { ++ opener->type = 0; ++ dev->active_readers--; ++ client_usage_queue_event(dev->vdev); ++ } ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ return -EINVAL; ++} ++ ++#ifdef CONFIG_VIDEO_V4L1_COMPAT ++static int vidiocgmbuf(struct file *file, void *fh, struct video_mbuf *p) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ p->frames = dev->buffers_number; ++ p->offsets[0] = 0; ++ p->offsets[1] = 0; ++ p->size = dev->buffer_size; ++ return 0; ++} ++#endif ++ ++static void client_usage_queue_event(struct video_device *vdev) ++{ ++ struct v4l2_event ev; ++ struct v4l2_loopback_device *dev; ++ ++ dev = container_of(vdev->v4l2_dev, struct v4l2_loopback_device, ++ v4l2_dev); ++ ++ memset(&ev, 0, sizeof(ev)); ++ ev.type = V4L2_EVENT_PRI_CLIENT_USAGE; ++ ((struct v4l2_event_client_usage *)&ev.u)->count = dev->active_readers; ++ ++ v4l2_event_queue(vdev, &ev); ++} ++ ++static int client_usage_ops_add(struct v4l2_subscribed_event *sev, ++ unsigned elems) ++{ ++ if (!(sev->flags & V4L2_EVENT_SUB_FL_SEND_INITIAL)) ++ return 0; ++ ++ client_usage_queue_event(sev->fh->vdev); ++ return 0; ++} ++ ++static void client_usage_ops_replace(struct v4l2_event *old, ++ const struct v4l2_event *new) ++{ ++ *((struct v4l2_event_client_usage *)&old->u) = ++ *((struct v4l2_event_client_usage *)&new->u); ++} ++ ++static void client_usage_ops_merge(const struct v4l2_event *old, ++ struct v4l2_event *new) ++{ ++ *((struct v4l2_event_client_usage *)&new->u) = ++ *((struct v4l2_event_client_usage *)&old->u); ++} ++ ++const struct v4l2_subscribed_event_ops client_usage_ops = { ++ .add = client_usage_ops_add, ++ .replace = client_usage_ops_replace, ++ .merge = client_usage_ops_merge, ++}; ++ ++static int vidioc_subscribe_event(struct v4l2_fh *fh, ++ const struct v4l2_event_subscription *sub) ++{ ++ switch (sub->type) { ++ case V4L2_EVENT_CTRL: ++ return v4l2_ctrl_subscribe_event(fh, sub); ++ case V4L2_EVENT_PRI_CLIENT_USAGE: ++ return v4l2_event_subscribe(fh, sub, 0, &client_usage_ops); ++ } ++ ++ return -EINVAL; ++} ++ ++/* file operations */ ++static void vm_open(struct vm_area_struct *vma) ++{ ++ struct v4l2l_buffer *buf; ++ MARK(); ++ ++ buf = vma->vm_private_data; ++ buf->use_count++; ++ ++ buf->buffer.flags |= V4L2_BUF_FLAG_MAPPED; ++} ++ ++static void vm_close(struct vm_area_struct *vma) ++{ ++ struct v4l2l_buffer *buf; ++ MARK(); ++ ++ buf = vma->vm_private_data; ++ buf->use_count--; ++ ++ if (buf->use_count <= 0) ++ buf->buffer.flags &= ~V4L2_BUF_FLAG_MAPPED; ++} ++ ++static struct vm_operations_struct vm_ops = { ++ .open = vm_open, ++ .close = vm_close, ++}; ++ ++static int v4l2_loopback_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ u8 *addr; ++ unsigned long start; ++ unsigned long size; ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ struct v4l2l_buffer *buffer = NULL; ++ MARK(); ++ ++ start = (unsigned long)vma->vm_start; ++ size = (unsigned long)(vma->vm_end - vma->vm_start); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(file->private_data); ++ ++ if (size > dev->buffer_size) { ++ dprintk("userspace tries to mmap too much, fail\n"); ++ return 
-EINVAL; ++ } ++ if (opener->timeout_image_io) { ++ /* we are going to map the timeout_image_buffer */ ++ if ((vma->vm_pgoff << PAGE_SHIFT) != ++ dev->buffer_size * MAX_BUFFERS) { ++ dprintk("invalid mmap offset for timeout_image_io mode\n"); ++ return -EINVAL; ++ } ++ } else if ((vma->vm_pgoff << PAGE_SHIFT) > ++ dev->buffer_size * (dev->buffers_number - 1)) { ++ dprintk("userspace tries to mmap too far, fail\n"); ++ return -EINVAL; ++ } ++ ++ /* FIXXXXXME: allocation should not happen here! */ ++ if (NULL == dev->image) ++ if (allocate_buffers(dev) < 0) ++ return -EINVAL; ++ ++ if (opener->timeout_image_io) { ++ buffer = &dev->timeout_image_buffer; ++ addr = dev->timeout_image; ++ } else { ++ int i; ++ for (i = 0; i < dev->buffers_number; ++i) { ++ buffer = &dev->buffers[i]; ++ if ((buffer->buffer.m.offset >> PAGE_SHIFT) == ++ vma->vm_pgoff) ++ break; ++ } ++ ++ if (i >= dev->buffers_number) ++ return -EINVAL; ++ ++ addr = dev->image + (vma->vm_pgoff << PAGE_SHIFT); ++ } ++ ++ while (size > 0) { ++ struct page *page; ++ ++ page = vmalloc_to_page(addr); ++ ++ if (vm_insert_page(vma, start, page) < 0) ++ return -EAGAIN; ++ ++ start += PAGE_SIZE; ++ addr += PAGE_SIZE; ++ size -= PAGE_SIZE; ++ } ++ ++ vma->vm_ops = &vm_ops; ++ vma->vm_private_data = buffer; ++ ++ vm_open(vma); ++ ++ MARK(); ++ return 0; ++} ++ ++static unsigned int v4l2_loopback_poll(struct file *file, ++ struct poll_table_struct *pts) ++{ ++ struct v4l2_loopback_opener *opener; ++ struct v4l2_loopback_device *dev; ++ __poll_t req_events = poll_requested_events(pts); ++ int ret_mask = 0; ++ MARK(); ++ ++ opener = fh_to_opener(file->private_data); ++ dev = v4l2loopback_getdevice(file); ++ ++ if (req_events & POLLPRI) { ++ if (!v4l2_event_pending(&opener->fh)) ++ poll_wait(file, &opener->fh.wait, pts); ++ if (v4l2_event_pending(&opener->fh)) { ++ ret_mask |= POLLPRI; ++ if (!(req_events & DEFAULT_POLLMASK)) ++ return ret_mask; ++ } ++ } ++ ++ switch (opener->type) { ++ case WRITER: ++ ret_mask |= POLLOUT | POLLWRNORM; ++ break; ++ case READER: ++ if (!can_read(dev, opener)) { ++ if (ret_mask) ++ return ret_mask; ++ poll_wait(file, &dev->read_event, pts); ++ } ++ if (can_read(dev, opener)) ++ ret_mask |= POLLIN | POLLRDNORM; ++ if (v4l2_event_pending(&opener->fh)) ++ ret_mask |= POLLPRI; ++ break; ++ default: ++ break; ++ } ++ ++ MARK(); ++ return ret_mask; ++} ++ ++/* do not want to limit device opens, it can be as many readers as user want, ++ * writers are limited by means of setting writer field */ ++static int v4l2_loopback_open(struct file *file) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ MARK(); ++ dev = v4l2loopback_getdevice(file); ++ if (dev->open_count.counter >= dev->max_openers) ++ return -EBUSY; ++ /* kfree on close */ ++ opener = kzalloc(sizeof(*opener), GFP_KERNEL); ++ if (opener == NULL) ++ return -ENOMEM; ++ ++ atomic_inc(&dev->open_count); ++ ++ opener->timeout_image_io = dev->timeout_image_io; ++ if (opener->timeout_image_io) { ++ int r = allocate_timeout_image(dev); ++ ++ if (r < 0) { ++ dprintk("timeout image allocation failed\n"); ++ ++ atomic_dec(&dev->open_count); ++ ++ kfree(opener); ++ return r; ++ } ++ } ++ ++ dev->timeout_image_io = 0; ++ ++ v4l2_fh_init(&opener->fh, video_devdata(file)); ++ file->private_data = &opener->fh; ++ ++ v4l2_fh_add(&opener->fh); ++ dprintk("opened dev:%p with image:%p\n", dev, dev ? 
dev->image : NULL); ++ MARK(); ++ return 0; ++} ++ ++static int v4l2_loopback_close(struct file *file) ++{ ++ struct v4l2_loopback_opener *opener; ++ struct v4l2_loopback_device *dev; ++ int is_writer = 0, is_reader = 0; ++ MARK(); ++ ++ opener = fh_to_opener(file->private_data); ++ dev = v4l2loopback_getdevice(file); ++ ++ if (WRITER == opener->type) ++ is_writer = 1; ++ if (READER == opener->type) ++ is_reader = 1; ++ ++ atomic_dec(&dev->open_count); ++ if (dev->open_count.counter == 0) { ++ del_timer_sync(&dev->sustain_timer); ++ del_timer_sync(&dev->timeout_timer); ++ } ++ try_free_buffers(dev); ++ ++ v4l2_fh_del(&opener->fh); ++ v4l2_fh_exit(&opener->fh); ++ ++ kfree(opener); ++ if (is_writer) ++ dev->ready_for_output = 1; ++ if (is_reader) { ++ dev->active_readers--; ++ client_usage_queue_event(dev->vdev); ++ } ++ MARK(); ++ return 0; ++} ++ ++static ssize_t v4l2_loopback_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ int read_index; ++ struct v4l2_loopback_device *dev; ++ struct v4l2_buffer *b; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ read_index = get_capture_buffer(file); ++ if (read_index < 0) ++ return read_index; ++ if (count > dev->buffer_size) ++ count = dev->buffer_size; ++ b = &dev->buffers[read_index].buffer; ++ if (count > b->bytesused) ++ count = b->bytesused; ++ if (copy_to_user((void *)buf, (void *)(dev->image + b->m.offset), ++ count)) { ++ printk(KERN_ERR ++ "v4l2-loopback: failed copy_to_user() in read buf\n"); ++ return -EFAULT; ++ } ++ dprintkrw("leave v4l2_loopback_read()\n"); ++ return count; ++} ++ ++static ssize_t v4l2_loopback_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct v4l2_loopback_opener *opener; ++ struct v4l2_loopback_device *dev; ++ int write_index; ++ struct v4l2_buffer *b; ++ int err = 0; ++ ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(file->private_data); ++ ++ if (UNNEGOTIATED == opener->type) { ++ spin_lock(&dev->lock); ++ ++ if (dev->ready_for_output) { ++ err = vidioc_streamon(file, file->private_data, ++ V4L2_BUF_TYPE_VIDEO_OUTPUT); ++ } ++ ++ spin_unlock(&dev->lock); ++ ++ if (err < 0) ++ return err; ++ } ++ ++ if (WRITER != opener->type) ++ return -EINVAL; ++ ++ if (!dev->ready_for_capture) { ++ int ret = allocate_buffers(dev); ++ if (ret < 0) ++ return ret; ++ dev->ready_for_capture = 1; ++ } ++ dprintkrw("v4l2_loopback_write() trying to write %zu bytes\n", count); ++ if (count > dev->buffer_size) ++ count = dev->buffer_size; ++ ++ write_index = dev->write_position % dev->used_buffers; ++ b = &dev->buffers[write_index].buffer; ++ ++ if (copy_from_user((void *)(dev->image + b->m.offset), (void *)buf, ++ count)) { ++ printk(KERN_ERR ++ "v4l2-loopback: failed copy_from_user() in write buf, could not write %zu\n", ++ count); ++ return -EFAULT; ++ } ++ v4l2l_get_timestamp(b); ++ b->bytesused = count; ++ b->sequence = dev->write_position; ++ buffer_written(dev, &dev->buffers[write_index]); ++ wake_up_all(&dev->read_event); ++ dprintkrw("leave v4l2_loopback_write()\n"); ++ return count; ++} ++ ++/* init functions */ ++/* frees buffers, if already allocated */ ++static void free_buffers(struct v4l2_loopback_device *dev) ++{ ++ MARK(); ++ dprintk("freeing image@%p for dev:%p\n", dev ? 
dev->image : NULL, dev); ++ if (!dev) ++ return; ++ if (dev->image) { ++ vfree(dev->image); ++ dev->image = NULL; ++ } ++ if (dev->timeout_image) { ++ vfree(dev->timeout_image); ++ dev->timeout_image = NULL; ++ } ++ dev->imagesize = 0; ++} ++/* frees buffers, if they are no longer needed */ ++static void try_free_buffers(struct v4l2_loopback_device *dev) ++{ ++ MARK(); ++ if (0 == dev->open_count.counter && !dev->keep_format) { ++ free_buffers(dev); ++ dev->ready_for_capture = 0; ++ dev->buffer_size = 0; ++ dev->write_position = 0; ++ } ++} ++/* allocates buffers, if buffer_size is set */ ++static int allocate_buffers(struct v4l2_loopback_device *dev) ++{ ++ int err; ++ ++ MARK(); ++ /* vfree on close file operation in case no open handles left */ ++ ++ if (dev->buffer_size < 1 || dev->buffers_number < 1) ++ return -EINVAL; ++ ++ if ((__LONG_MAX__ / dev->buffer_size) < dev->buffers_number) ++ return -ENOSPC; ++ ++ if (dev->image) { ++ dprintk("allocating buffers again: %ld %ld\n", ++ dev->buffer_size * dev->buffers_number, dev->imagesize); ++ /* FIXME: prevent double allocation more intelligently! */ ++ if (dev->buffer_size * dev->buffers_number == dev->imagesize) ++ return 0; ++ ++ /* if there is only one writer, no problem should occur */ ++ if (dev->open_count.counter == 1) ++ free_buffers(dev); ++ else ++ return -EINVAL; ++ } ++ ++ dev->imagesize = (unsigned long)dev->buffer_size * ++ (unsigned long)dev->buffers_number; ++ ++ dprintk("allocating %ld = %ldx%d\n", dev->imagesize, dev->buffer_size, ++ dev->buffers_number); ++ err = -ENOMEM; ++ ++ if (dev->timeout_jiffies > 0) { ++ err = allocate_timeout_image(dev); ++ if (err < 0) ++ goto error; ++ } ++ ++ dev->image = vmalloc(dev->imagesize); ++ if (dev->image == NULL) ++ goto error; ++ ++ dprintk("vmallocated %ld bytes\n", dev->imagesize); ++ MARK(); ++ ++ init_buffers(dev); ++ return 0; ++ ++error: ++ free_buffers(dev); ++ return err; ++} ++ ++/* init inner buffers, they are capture mode and flags are set as ++ * for capture mod buffers */ ++static void init_buffers(struct v4l2_loopback_device *dev) ++{ ++ int i; ++ int buffer_size; ++ int bytesused; ++ MARK(); ++ ++ buffer_size = dev->buffer_size; ++ bytesused = dev->pix_format.sizeimage; ++ ++ for (i = 0; i < dev->buffers_number; ++i) { ++ struct v4l2_buffer *b = &dev->buffers[i].buffer; ++ b->index = i; ++ b->bytesused = bytesused; ++ b->length = buffer_size; ++ b->field = V4L2_FIELD_NONE; ++ b->flags = 0; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 6, 1) ++ b->input = 0; ++#endif ++ b->m.offset = i * buffer_size; ++ b->memory = V4L2_MEMORY_MMAP; ++ b->sequence = 0; ++ b->timestamp.tv_sec = 0; ++ b->timestamp.tv_usec = 0; ++ b->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ ++ v4l2l_get_timestamp(b); ++ } ++ dev->timeout_image_buffer = dev->buffers[0]; ++ dev->timeout_image_buffer.buffer.m.offset = MAX_BUFFERS * buffer_size; ++ MARK(); ++} ++ ++static int allocate_timeout_image(struct v4l2_loopback_device *dev) ++{ ++ MARK(); ++ if (dev->buffer_size <= 0) ++ return -EINVAL; ++ ++ if (dev->timeout_image == NULL) { ++ dev->timeout_image = vzalloc(dev->buffer_size); ++ if (dev->timeout_image == NULL) ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++/* fills and register video device */ ++static void init_vdev(struct video_device *vdev, int nr) ++{ ++ MARK(); ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ vdev->tvnorms = V4L2_STD_ALL; ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ vdev->vfl_type = VFL_TYPE_VIDEO; ++ vdev->fops = &v4l2_loopback_fops; ++ vdev->ioctl_ops = &v4l2_loopback_ioctl_ops; ++ 
vdev->release = &video_device_release; ++ vdev->minor = -1; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) ++ vdev->device_caps = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_VIDEO_CAPTURE | ++ V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_READWRITE | ++ V4L2_CAP_STREAMING; ++#endif ++ ++ if (debug > 1) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 20, 0) ++ vdev->debug = V4L2_DEBUG_IOCTL | V4L2_DEBUG_IOCTL_ARG; ++#else ++ vdev->dev_debug = V4L2_DEV_DEBUG_IOCTL | ++ V4L2_DEV_DEBUG_IOCTL_ARG; ++#endif ++ ++ /* since kernel-3.7, there is a new field 'vfl_dir' that has to be ++ * set to VFL_DIR_M2M for bidirectional devices */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) ++ vdev->vfl_dir = VFL_DIR_M2M; ++#endif ++ ++ MARK(); ++} ++ ++/* init default capture parameters, only fps may be changed in future */ ++static void init_capture_param(struct v4l2_captureparm *capture_param) ++{ ++ MARK(); ++ capture_param->capability = 0; ++ capture_param->capturemode = 0; ++ capture_param->extendedmode = 0; ++ capture_param->readbuffers = max_buffers; ++ capture_param->timeperframe.numerator = 1; ++ capture_param->timeperframe.denominator = 30; ++} ++ ++static void check_timers(struct v4l2_loopback_device *dev) ++{ ++ if (!dev->ready_for_capture) ++ return; ++ ++ if (dev->timeout_jiffies > 0 && !timer_pending(&dev->timeout_timer)) ++ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); ++ if (dev->sustain_framerate && !timer_pending(&dev->sustain_timer)) ++ mod_timer(&dev->sustain_timer, ++ jiffies + dev->frame_jiffies * 3 / 2); ++} ++#ifdef HAVE_TIMER_SETUP ++static void sustain_timer_clb(struct timer_list *t) ++{ ++ struct v4l2_loopback_device *dev = from_timer(dev, t, sustain_timer); ++#else ++static void sustain_timer_clb(unsigned long nr) ++{ ++ struct v4l2_loopback_device *dev = ++ idr_find(&v4l2loopback_index_idr, nr); ++#endif ++ spin_lock(&dev->lock); ++ if (dev->sustain_framerate) { ++ dev->reread_count++; ++ dprintkrw("reread: %d %d\n", dev->write_position, ++ dev->reread_count); ++ if (dev->reread_count == 1) ++ mod_timer(&dev->sustain_timer, ++ jiffies + max(1UL, dev->frame_jiffies / 2)); ++ else ++ mod_timer(&dev->sustain_timer, ++ jiffies + dev->frame_jiffies); ++ wake_up_all(&dev->read_event); ++ } ++ spin_unlock(&dev->lock); ++} ++#ifdef HAVE_TIMER_SETUP ++static void timeout_timer_clb(struct timer_list *t) ++{ ++ struct v4l2_loopback_device *dev = from_timer(dev, t, timeout_timer); ++#else ++static void timeout_timer_clb(unsigned long nr) ++{ ++ struct v4l2_loopback_device *dev = ++ idr_find(&v4l2loopback_index_idr, nr); ++#endif ++ spin_lock(&dev->lock); ++ if (dev->timeout_jiffies > 0) { ++ dev->timeout_happened = 1; ++ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); ++ wake_up_all(&dev->read_event); ++ } ++ spin_unlock(&dev->lock); ++} ++ ++/* init loopback main structure */ ++#define DEFAULT_FROM_CONF(confmember, default_condition, default_value) \ ++ ((conf) ? \ ++ ((conf->confmember default_condition) ? (default_value) : \ ++ (conf->confmember)) : \ ++ default_value) ++ ++static int v4l2_loopback_add(struct v4l2_loopback_config *conf, int *ret_nr) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_ctrl_handler *hdl; ++ struct v4l2loopback_private *vdev_priv = NULL; ++ ++ int err = -ENOMEM; ++ ++ int _max_width = DEFAULT_FROM_CONF( ++ max_width, < V4L2LOOPBACK_SIZE_MIN_WIDTH, max_width); ++ int _max_height = DEFAULT_FROM_CONF( ++ max_height, < V4L2LOOPBACK_SIZE_MIN_HEIGHT, max_height); ++ bool _announce_all_caps = (conf && conf->announce_all_caps >= 0) ? 
++ (conf->announce_all_caps) : ++ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS; ++ int _max_buffers = DEFAULT_FROM_CONF(max_buffers, <= 0, max_buffers); ++ int _max_openers = DEFAULT_FROM_CONF(max_openers, <= 0, max_openers); ++ ++ int nr = -1; ++ ++ _announce_all_caps = (!!_announce_all_caps); ++ ++ if (conf) { ++ if (conf->capture_nr >= 0 && ++ conf->output_nr == conf->capture_nr) { ++ nr = conf->capture_nr; ++ } else if (conf->capture_nr < 0 && conf->output_nr < 0) { ++ nr = -1; ++ } else if (conf->capture_nr < 0) { ++ nr = conf->output_nr; ++ } else if (conf->output_nr < 0) { ++ nr = conf->capture_nr; ++ } else { ++ printk(KERN_ERR ++ "split OUTPUT and CAPTURE devices not yet supported."); ++ printk(KERN_INFO ++ "both devices must have the same number (%d != %d).", ++ conf->output_nr, conf->capture_nr); ++ return -EINVAL; ++ } ++ } ++ ++ if (idr_find(&v4l2loopback_index_idr, nr)) ++ return -EEXIST; ++ ++ dprintk("creating v4l2loopback-device #%d\n", nr); ++ dev = kzalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) ++ return -ENOMEM; ++ ++ /* allocate id, if @id >= 0, we're requesting that specific id */ ++ if (nr >= 0) { ++ err = idr_alloc(&v4l2loopback_index_idr, dev, nr, nr + 1, ++ GFP_KERNEL); ++ if (err == -ENOSPC) ++ err = -EEXIST; ++ } else { ++ err = idr_alloc(&v4l2loopback_index_idr, dev, 0, 0, GFP_KERNEL); ++ } ++ if (err < 0) ++ goto out_free_dev; ++ nr = err; ++ err = -ENOMEM; ++ ++ if (conf && conf->card_label[0]) { ++ snprintf(dev->card_label, sizeof(dev->card_label), "%s", ++ conf->card_label); ++ } else { ++ snprintf(dev->card_label, sizeof(dev->card_label), ++ "Dummy video device (0x%04X)", nr); ++ } ++ snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), ++ "v4l2loopback-%03d", nr); ++ ++ err = v4l2_device_register(NULL, &dev->v4l2_dev); ++ if (err) ++ goto out_free_idr; ++ MARK(); ++ ++ dev->vdev = video_device_alloc(); ++ if (dev->vdev == NULL) { ++ err = -ENOMEM; ++ goto out_unregister; ++ } ++ ++ vdev_priv = kzalloc(sizeof(struct v4l2loopback_private), GFP_KERNEL); ++ if (vdev_priv == NULL) { ++ err = -ENOMEM; ++ goto out_unregister; ++ } ++ ++ video_set_drvdata(dev->vdev, vdev_priv); ++ if (video_get_drvdata(dev->vdev) == NULL) { ++ err = -ENOMEM; ++ goto out_unregister; ++ } ++ ++ MARK(); ++ snprintf(dev->vdev->name, sizeof(dev->vdev->name), "%s", ++ dev->card_label); ++ ++ vdev_priv->device_nr = nr; ++ ++ init_vdev(dev->vdev, nr); ++ dev->vdev->v4l2_dev = &dev->v4l2_dev; ++ init_capture_param(&dev->capture_param); ++ err = set_timeperframe(dev, &dev->capture_param.timeperframe); ++ if (err) ++ goto out_unregister; ++ dev->keep_format = 0; ++ dev->sustain_framerate = 0; ++ ++ dev->announce_all_caps = _announce_all_caps; ++ dev->max_width = _max_width; ++ dev->max_height = _max_height; ++ dev->max_openers = _max_openers; ++ dev->buffers_number = dev->used_buffers = _max_buffers; ++ ++ dev->write_position = 0; ++ ++ MARK(); ++ spin_lock_init(&dev->lock); ++ INIT_LIST_HEAD(&dev->outbufs_list); ++ if (list_empty(&dev->outbufs_list)) { ++ int i; ++ ++ for (i = 0; i < dev->used_buffers; ++i) ++ list_add_tail(&dev->buffers[i].list_head, ++ &dev->outbufs_list); ++ } ++ memset(dev->bufpos2index, 0, sizeof(dev->bufpos2index)); ++ atomic_set(&dev->open_count, 0); ++ dev->ready_for_capture = 0; ++ dev->ready_for_output = 1; ++ ++ dev->buffer_size = 0; ++ dev->image = NULL; ++ dev->imagesize = 0; ++#ifdef HAVE_TIMER_SETUP ++ timer_setup(&dev->sustain_timer, sustain_timer_clb, 0); ++ timer_setup(&dev->timeout_timer, timeout_timer_clb, 0); ++#else ++ setup_timer(&dev->sustain_timer, 
sustain_timer_clb, nr); ++ setup_timer(&dev->timeout_timer, timeout_timer_clb, nr); ++#endif ++ dev->reread_count = 0; ++ dev->timeout_jiffies = 0; ++ dev->timeout_image = NULL; ++ dev->timeout_happened = 0; ++ ++ hdl = &dev->ctrl_handler; ++ err = v4l2_ctrl_handler_init(hdl, 4); ++ if (err) ++ goto out_unregister; ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_keepformat, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_sustainframerate, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeout, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeoutimageio, NULL); ++ if (hdl->error) { ++ err = hdl->error; ++ goto out_free_handler; ++ } ++ dev->v4l2_dev.ctrl_handler = hdl; ++ ++ err = v4l2_ctrl_handler_setup(hdl); ++ if (err) ++ goto out_free_handler; ++ ++ /* FIXME set buffers to 0 */ ++ ++ /* Set initial format */ ++ dev->pix_format.width = 0; /* V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; */ ++ dev->pix_format.height = 0; /* V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; */ ++ dev->pix_format.pixelformat = formats[0].fourcc; ++ dev->pix_format.colorspace = ++ V4L2_COLORSPACE_SRGB; /* do we need to set this ? */ ++ dev->pix_format.field = V4L2_FIELD_NONE; ++ ++ dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage); ++ dprintk("buffer_size = %ld (=%d)\n", dev->buffer_size, ++ dev->pix_format.sizeimage); ++ ++ if (dev->buffer_size && ((err = allocate_buffers(dev)) < 0)) ++ goto out_free_handler; ++ ++ init_waitqueue_head(&dev->read_event); ++ ++ /* register the device -> it creates /dev/video* */ ++ if (video_register_device(dev->vdev, VFL_TYPE_VIDEO, nr) < 0) { ++ printk(KERN_ERR ++ "v4l2loopback: failed video_register_device()\n"); ++ err = -EFAULT; ++ goto out_free_device; ++ } ++ v4l2loopback_create_sysfs(dev->vdev); ++ ++ MARK(); ++ if (ret_nr) ++ *ret_nr = dev->vdev->num; ++ return 0; ++ ++out_free_device: ++ video_device_release(dev->vdev); ++out_free_handler: ++ v4l2_ctrl_handler_free(&dev->ctrl_handler); ++out_unregister: ++ video_set_drvdata(dev->vdev, NULL); ++ if (vdev_priv != NULL) ++ kfree(vdev_priv); ++ v4l2_device_unregister(&dev->v4l2_dev); ++out_free_idr: ++ idr_remove(&v4l2loopback_index_idr, nr); ++out_free_dev: ++ kfree(dev); ++ return err; ++} ++ ++static void v4l2_loopback_remove(struct v4l2_loopback_device *dev) ++{ ++ free_buffers(dev); ++ v4l2loopback_remove_sysfs(dev->vdev); ++ kfree(video_get_drvdata(dev->vdev)); ++ video_unregister_device(dev->vdev); ++ v4l2_device_unregister(&dev->v4l2_dev); ++ v4l2_ctrl_handler_free(&dev->ctrl_handler); ++ kfree(dev); ++} ++ ++static long v4l2loopback_control_ioctl(struct file *file, unsigned int cmd, ++ unsigned long parm) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_config conf; ++ struct v4l2_loopback_config *confptr = &conf; ++ int device_nr; ++ int ret; ++ ++ ret = mutex_lock_killable(&v4l2loopback_ctl_mutex); ++ if (ret) ++ return ret; ++ ++ ret = -EINVAL; ++ switch (cmd) { ++ default: ++ ret = -ENOSYS; ++ break; ++ /* add a v4l2loopback device (pair), based on the user-provided specs */ ++ case V4L2LOOPBACK_CTL_ADD: ++ if (parm) { ++ if ((ret = copy_from_user(&conf, (void *)parm, ++ sizeof(conf))) < 0) ++ break; ++ } else ++ confptr = NULL; ++ ret = v4l2_loopback_add(confptr, &device_nr); ++ if (ret >= 0) ++ ret = device_nr; ++ break; ++ /* remove a v4l2loopback device (both capture and output) */ ++ case V4L2LOOPBACK_CTL_REMOVE: ++ ret = v4l2loopback_lookup((int)parm, &dev); ++ if (ret >= 0 && dev) { ++ int nr = ret; ++ ret = -EBUSY; ++ if (dev->open_count.counter > 0) ++ break; ++ 
idr_remove(&v4l2loopback_index_idr, nr); ++ v4l2_loopback_remove(dev); ++ ret = 0; ++ }; ++ break; ++ /* get information for a loopback device. ++ * this is mostly about limits (which cannot be queried directly with VIDIOC_G_FMT and friends ++ */ ++ case V4L2LOOPBACK_CTL_QUERY: ++ if (!parm) ++ break; ++ if ((ret = copy_from_user(&conf, (void *)parm, sizeof(conf))) < ++ 0) ++ break; ++ device_nr = (conf.output_nr < 0) ? conf.capture_nr : ++ conf.output_nr; ++ MARK(); ++ /* get the device from either capture_nr or output_nr (whatever is valid) */ ++ if ((ret = v4l2loopback_lookup(device_nr, &dev)) < 0) ++ break; ++ MARK(); ++ /* if we got the device from output_nr and there is a valid capture_nr, ++ * make sure that both refer to the same device (or bail out) ++ */ ++ if ((device_nr != conf.capture_nr) && (conf.capture_nr >= 0) && ++ (ret != v4l2loopback_lookup(conf.capture_nr, 0))) ++ break; ++ MARK(); ++ /* if otoh, we got the device from capture_nr and there is a valid output_nr, ++ * make sure that both refer to the same device (or bail out) ++ */ ++ if ((device_nr != conf.output_nr) && (conf.output_nr >= 0) && ++ (ret != v4l2loopback_lookup(conf.output_nr, 0))) ++ break; ++ MARK(); ++ ++ /* v4l2_loopback_config identified a single device, so fetch the data */ ++ snprintf(conf.card_label, sizeof(conf.card_label), "%s", ++ dev->card_label); ++ MARK(); ++ conf.output_nr = conf.capture_nr = dev->vdev->num; ++ conf.max_width = dev->max_width; ++ conf.max_height = dev->max_height; ++ conf.announce_all_caps = dev->announce_all_caps; ++ conf.max_buffers = dev->buffers_number; ++ conf.max_openers = dev->max_openers; ++ conf.debug = debug; ++ MARK(); ++ if (copy_to_user((void *)parm, &conf, sizeof(conf))) { ++ ret = -EFAULT; ++ break; ++ } ++ MARK(); ++ ret = 0; ++ ; ++ break; ++ } ++ ++ MARK(); ++ mutex_unlock(&v4l2loopback_ctl_mutex); ++ MARK(); ++ return ret; ++} ++ ++/* LINUX KERNEL */ ++ ++static const struct file_operations v4l2loopback_ctl_fops = { ++ // clang-format off ++ .owner = THIS_MODULE, ++ .open = nonseekable_open, ++ .unlocked_ioctl = v4l2loopback_control_ioctl, ++ .compat_ioctl = v4l2loopback_control_ioctl, ++ .llseek = noop_llseek, ++ // clang-format on ++}; ++ ++static struct miscdevice v4l2loopback_misc = { ++ // clang-format off ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "v4l2loopback", ++ .fops = &v4l2loopback_ctl_fops, ++ // clang-format on ++}; ++ ++static const struct v4l2_file_operations v4l2_loopback_fops = { ++ // clang-format off ++ .owner = THIS_MODULE, ++ .open = v4l2_loopback_open, ++ .release = v4l2_loopback_close, ++ .read = v4l2_loopback_read, ++ .write = v4l2_loopback_write, ++ .poll = v4l2_loopback_poll, ++ .mmap = v4l2_loopback_mmap, ++ .unlocked_ioctl = video_ioctl2, ++ // clang-format on ++}; ++ ++static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops = { ++ // clang-format off ++ .vidioc_querycap = &vidioc_querycap, ++ .vidioc_enum_framesizes = &vidioc_enum_framesizes, ++ .vidioc_enum_frameintervals = &vidioc_enum_frameintervals, ++ ++ .vidioc_enum_output = &vidioc_enum_output, ++ .vidioc_g_output = &vidioc_g_output, ++ .vidioc_s_output = &vidioc_s_output, ++ ++ .vidioc_enum_input = &vidioc_enum_input, ++ .vidioc_g_input = &vidioc_g_input, ++ .vidioc_s_input = &vidioc_s_input, ++ ++ .vidioc_enum_fmt_vid_cap = &vidioc_enum_fmt_cap, ++ .vidioc_g_fmt_vid_cap = &vidioc_g_fmt_cap, ++ .vidioc_s_fmt_vid_cap = &vidioc_s_fmt_cap, ++ .vidioc_try_fmt_vid_cap = &vidioc_try_fmt_cap, ++ ++ .vidioc_enum_fmt_vid_out = &vidioc_enum_fmt_out, ++ .vidioc_s_fmt_vid_out = 
&vidioc_s_fmt_out, ++ .vidioc_g_fmt_vid_out = &vidioc_g_fmt_out, ++ .vidioc_try_fmt_vid_out = &vidioc_try_fmt_out, ++ ++#ifdef V4L2L_OVERLAY ++ .vidioc_s_fmt_vid_overlay = &vidioc_s_fmt_overlay, ++ .vidioc_g_fmt_vid_overlay = &vidioc_g_fmt_overlay, ++#endif ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ .vidioc_s_std = &vidioc_s_std, ++ .vidioc_g_std = &vidioc_g_std, ++ .vidioc_querystd = &vidioc_querystd, ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ .vidioc_g_parm = &vidioc_g_parm, ++ .vidioc_s_parm = &vidioc_s_parm, ++ ++ .vidioc_reqbufs = &vidioc_reqbufs, ++ .vidioc_querybuf = &vidioc_querybuf, ++ .vidioc_qbuf = &vidioc_qbuf, ++ .vidioc_dqbuf = &vidioc_dqbuf, ++ ++ .vidioc_streamon = &vidioc_streamon, ++ .vidioc_streamoff = &vidioc_streamoff, ++ ++#ifdef CONFIG_VIDEO_V4L1_COMPAT ++ .vidiocgmbuf = &vidiocgmbuf, ++#endif ++ ++ .vidioc_subscribe_event = &vidioc_subscribe_event, ++ .vidioc_unsubscribe_event = &v4l2_event_unsubscribe, ++ // clang-format on ++}; ++ ++static int free_device_cb(int id, void *ptr, void *data) ++{ ++ struct v4l2_loopback_device *dev = ptr; ++ v4l2_loopback_remove(dev); ++ return 0; ++} ++static void free_devices(void) ++{ ++ idr_for_each(&v4l2loopback_index_idr, &free_device_cb, NULL); ++ idr_destroy(&v4l2loopback_index_idr); ++} ++ ++static int __init v4l2loopback_init_module(void) ++{ ++ int err; ++ int i; ++ MARK(); ++ ++ err = misc_register(&v4l2loopback_misc); ++ if (err < 0) ++ return err; ++ ++ if (devices < 0) { ++ devices = 1; ++ ++ /* try guessing the devices from the "video_nr" parameter */ ++ for (i = MAX_DEVICES - 1; i >= 0; i--) { ++ if (video_nr[i] >= 0) { ++ devices = i + 1; ++ break; ++ } ++ } ++ } ++ ++ if (devices > MAX_DEVICES) { ++ devices = MAX_DEVICES; ++ printk(KERN_INFO ++ "v4l2loopback: number of initial devices is limited to: %d\n", ++ MAX_DEVICES); ++ } ++ ++ if (max_buffers > MAX_BUFFERS) { ++ max_buffers = MAX_BUFFERS; ++ printk(KERN_INFO ++ "v4l2loopback: number of buffers is limited to: %d\n", ++ MAX_BUFFERS); ++ } ++ ++ if (max_openers < 0) { ++ printk(KERN_INFO ++ "v4l2loopback: allowing %d openers rather than %d\n", ++ 2, max_openers); ++ max_openers = 2; ++ } ++ ++ if (max_width < V4L2LOOPBACK_SIZE_MIN_WIDTH) { ++ max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; ++ printk(KERN_INFO "v4l2loopback: using max_width %d\n", ++ max_width); ++ } ++ if (max_height < V4L2LOOPBACK_SIZE_MIN_HEIGHT) { ++ max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; ++ printk(KERN_INFO "v4l2loopback: using max_height %d\n", ++ max_height); ++ } ++ ++ /* kfree on module release */ ++ for (i = 0; i < devices; i++) { ++ struct v4l2_loopback_config cfg = { ++ // clang-format off ++ .output_nr = video_nr[i], ++ .capture_nr = video_nr[i], ++ .max_width = max_width, ++ .max_height = max_height, ++ .announce_all_caps = (!exclusive_caps[i]), ++ .max_buffers = max_buffers, ++ .max_openers = max_openers, ++ .debug = debug, ++ // clang-format on ++ }; ++ cfg.card_label[0] = 0; ++ if (card_label[i]) ++ snprintf(cfg.card_label, sizeof(cfg.card_label), "%s", ++ card_label[i]); ++ err = v4l2_loopback_add(&cfg, 0); ++ if (err) { ++ free_devices(); ++ goto error; ++ } ++ } ++ ++ dprintk("module installed\n"); ++ ++ printk(KERN_INFO "v4l2loopback driver version %d.%d.%d%s loaded\n", ++ // clang-format off ++ (V4L2LOOPBACK_VERSION_CODE >> 16) & 0xff, ++ (V4L2LOOPBACK_VERSION_CODE >> 8) & 0xff, ++ (V4L2LOOPBACK_VERSION_CODE ) & 0xff, ++#ifdef SNAPSHOT_VERSION ++ " (" __stringify(SNAPSHOT_VERSION) ")" ++#else ++ "" ++#endif ++ ); ++ // clang-format on ++ ++ return 0; ++error: ++ 
misc_deregister(&v4l2loopback_misc); ++ return err; ++} ++ ++static void v4l2loopback_cleanup_module(void) ++{ ++ MARK(); ++ /* unregister the device -> it deletes /dev/video* */ ++ free_devices(); ++ /* and get rid of /dev/v4l2loopback */ ++ misc_deregister(&v4l2loopback_misc); ++ dprintk("module removed\n"); ++} ++ ++MODULE_ALIAS_MISCDEV(MISC_DYNAMIC_MINOR); ++ ++module_init(v4l2loopback_init_module); ++module_exit(v4l2loopback_cleanup_module); +diff --git a/drivers/media/v4l2-core/v4l2loopback.h b/drivers/media/v4l2-core/v4l2loopback.h +new file mode 100644 +index 000000000000..10f8e662d37a +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback.h +@@ -0,0 +1,96 @@ ++/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ ++/* ++ * v4l2loopback.h ++ * ++ * Written by IOhannes m zmölnig, 7/1/20. ++ * ++ * Copyright 2020 by IOhannes m zmölnig. Redistribution of this file is ++ * permitted under the GNU General Public License. ++ */ ++#ifndef _V4L2LOOPBACK_H ++#define _V4L2LOOPBACK_H ++ ++#define V4L2LOOPBACK_VERSION_MAJOR 0 ++#define V4L2LOOPBACK_VERSION_MINOR 12 ++#define V4L2LOOPBACK_VERSION_BUGFIX 7 ++ ++/* /dev/v4l2loopback interface */ ++ ++struct v4l2_loopback_config { ++ /** ++ * the device-number (/dev/video) ++ * V4L2LOOPBACK_CTL_ADD: ++ * setting this to a value<0, will allocate an available one ++ * if nr>=0 and the device already exists, the ioctl will EEXIST ++ * if output_nr and capture_nr are the same, only a single device will be created ++ * NOTE: currently split-devices (where output_nr and capture_nr differ) ++ * are not implemented yet. ++ * until then, requesting different device-IDs will result in EINVAL. ++ * ++ * V4L2LOOPBACK_CTL_QUERY: ++ * either both output_nr and capture_nr must refer to the same loopback, ++ * or one (and only one) of them must be -1 ++ * ++ */ ++ int output_nr; ++ int capture_nr; ++ ++ /** ++ * a nice name for your device ++ * if (*card_label)==0, an automatic name is assigned ++ */ ++ char card_label[32]; ++ ++ /** ++ * maximum allowed frame size ++ * if too low, default values are used ++ */ ++ int max_width; ++ int max_height; ++ ++ /** ++ * number of buffers to allocate for the queue ++ * if set to <=0, default values are used ++ */ ++ int max_buffers; ++ ++ /** ++ * how many consumers are allowed to open this device concurrently ++ * if set to <=0, default values are used ++ */ ++ int max_openers; ++ ++ /** ++ * set the debugging level for this device ++ */ ++ int debug; ++ ++ /** ++ * whether to announce OUTPUT/CAPTURE capabilities exclusively ++ * for this device or not ++ * (!exclusive_caps) ++ * NOTE: this is going to be removed once separate output/capture ++ * devices are implemented ++ */ ++ int announce_all_caps; ++}; ++ ++/* a pointer to a (struct v4l2_loopback_config) that has all values you wish to impose on the ++ * to-be-created device set. ++ * if the ptr is NULL, a new device is created with default values at the driver's discretion. 
++ * ++ * returns the device_nr of the OUTPUT device (which can be used with V4L2LOOPBACK_CTL_QUERY, ++ * to get more information on the device) ++ */ ++#define V4L2LOOPBACK_CTL_ADD 0x4C80 ++ ++/* a pointer to a (struct v4l2_loopback_config) that has output_nr and/or capture_nr set ++ * (the two values must either refer to video-devices associated with the same loopback device ++ * or exactly one of them must be <0 ++ */ ++#define V4L2LOOPBACK_CTL_QUERY 0x4C82 ++ ++/* the device-number (either CAPTURE or OUTPUT) associated with the loopback-device */ ++#define V4L2LOOPBACK_CTL_REMOVE 0x4C81 ++ ++#endif /* _V4L2LOOPBACK_H */ +diff --git a/drivers/media/v4l2-core/v4l2loopback_formats.h b/drivers/media/v4l2-core/v4l2loopback_formats.h +new file mode 100644 +index 000000000000..d855a3796554 +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback_formats.h +@@ -0,0 +1,445 @@ ++static const struct v4l2l_format formats[] = { ++#ifndef V4L2_PIX_FMT_VP9 ++#define V4L2_PIX_FMT_VP9 v4l2_fourcc('V', 'P', '9', '0') ++#endif ++#ifndef V4L2_PIX_FMT_HEVC ++#define V4L2_PIX_FMT_HEVC v4l2_fourcc('H', 'E', 'V', 'C') ++#endif ++ ++ /* here come the packed formats */ ++ { ++ .name = "32 bpp RGB, le", ++ .fourcc = V4L2_PIX_FMT_BGR32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "32 bpp RGB, be", ++ .fourcc = V4L2_PIX_FMT_RGB32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "24 bpp RGB, le", ++ .fourcc = V4L2_PIX_FMT_BGR24, ++ .depth = 24, ++ .flags = 0, ++ }, ++ { ++ .name = "24 bpp RGB, be", ++ .fourcc = V4L2_PIX_FMT_RGB24, ++ .depth = 24, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_ABGR32 ++ { ++ .name = "32 bpp RGBA, le", ++ .fourcc = V4L2_PIX_FMT_ABGR32, ++ .depth = 32, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_RGBA32 ++ { ++ .name = "32 bpp RGBA", ++ .fourcc = V4L2_PIX_FMT_RGBA32, ++ .depth = 32, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_RGB332 ++ { ++ .name = "8 bpp RGB-3-3-2", ++ .fourcc = V4L2_PIX_FMT_RGB332, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB332 */ ++#ifdef V4L2_PIX_FMT_RGB444 ++ { ++ .name = "16 bpp RGB (xxxxrrrr ggggbbbb)", ++ .fourcc = V4L2_PIX_FMT_RGB444, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB444 */ ++#ifdef V4L2_PIX_FMT_RGB555 ++ { ++ .name = "16 bpp RGB-5-5-5", ++ .fourcc = V4L2_PIX_FMT_RGB555, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB555 */ ++#ifdef V4L2_PIX_FMT_RGB565 ++ { ++ .name = "16 bpp RGB-5-6-5", ++ .fourcc = V4L2_PIX_FMT_RGB565, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB565 */ ++#ifdef V4L2_PIX_FMT_RGB555X ++ { ++ .name = "16 bpp RGB-5-5-5 BE", ++ .fourcc = V4L2_PIX_FMT_RGB555X, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB555X */ ++#ifdef V4L2_PIX_FMT_RGB565X ++ { ++ .name = "16 bpp RGB-5-6-5 BE", ++ .fourcc = V4L2_PIX_FMT_RGB565X, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB565X */ ++#ifdef V4L2_PIX_FMT_BGR666 ++ { ++ .name = "18 bpp BGR-6-6-6", ++ .fourcc = V4L2_PIX_FMT_BGR666, ++ .depth = 18, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_BGR666 */ ++ { ++ .name = "4:2:2, packed, YUYV", ++ .fourcc = V4L2_PIX_FMT_YUYV, ++ .depth = 16, ++ .flags = 0, ++ }, ++ { ++ .name = "4:2:2, packed, UYVY", ++ .fourcc = V4L2_PIX_FMT_UYVY, ++ .depth = 16, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_YVYU ++ { ++ .name = "4:2:2, packed YVYU", ++ .fourcc = V4L2_PIX_FMT_YVYU, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_VYUY ++ { ++ .name = "4:2:2, packed VYUY", ++ .fourcc = V4L2_PIX_FMT_VYUY, ++ .depth 
= 16, ++ .flags = 0, ++ }, ++#endif ++ { ++ .name = "4:2:2, packed YYUV", ++ .fourcc = V4L2_PIX_FMT_YYUV, ++ .depth = 16, ++ .flags = 0, ++ }, ++ { ++ .name = "YUV-8-8-8-8", ++ .fourcc = V4L2_PIX_FMT_YUV32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "8 bpp, Greyscale", ++ .fourcc = V4L2_PIX_FMT_GREY, ++ .depth = 8, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_Y4 ++ { ++ .name = "4 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y4, ++ .depth = 4, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y4 */ ++#ifdef V4L2_PIX_FMT_Y6 ++ { ++ .name = "6 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y6, ++ .depth = 6, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y6 */ ++#ifdef V4L2_PIX_FMT_Y10 ++ { ++ .name = "10 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y10, ++ .depth = 10, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y10 */ ++#ifdef V4L2_PIX_FMT_Y12 ++ { ++ .name = "12 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y12, ++ .depth = 12, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y12 */ ++ { ++ .name = "16 bpp, Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y16, ++ .depth = 16, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_YUV444 ++ { ++ .name = "16 bpp xxxxyyyy uuuuvvvv", ++ .fourcc = V4L2_PIX_FMT_YUV444, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV444 */ ++#ifdef V4L2_PIX_FMT_YUV555 ++ { ++ .name = "16 bpp YUV-5-5-5", ++ .fourcc = V4L2_PIX_FMT_YUV555, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV555 */ ++#ifdef V4L2_PIX_FMT_YUV565 ++ { ++ .name = "16 bpp YUV-5-6-5", ++ .fourcc = V4L2_PIX_FMT_YUV565, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV565 */ ++ ++/* bayer formats */ ++#ifdef V4L2_PIX_FMT_SRGGB8 ++ { ++ .name = "Bayer RGGB 8bit", ++ .fourcc = V4L2_PIX_FMT_SRGGB8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SRGGB8 */ ++#ifdef V4L2_PIX_FMT_SGRBG8 ++ { ++ .name = "Bayer GRBG 8bit", ++ .fourcc = V4L2_PIX_FMT_SGRBG8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SGRBG8 */ ++#ifdef V4L2_PIX_FMT_SGBRG8 ++ { ++ .name = "Bayer GBRG 8bit", ++ .fourcc = V4L2_PIX_FMT_SGBRG8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SGBRG8 */ ++#ifdef V4L2_PIX_FMT_SBGGR8 ++ { ++ .name = "Bayer BA81 8bit", ++ .fourcc = V4L2_PIX_FMT_SBGGR8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SBGGR8 */ ++ ++ /* here come the planar formats */ ++ { ++ .name = "4:1:0, planar, Y-Cr-Cb", ++ .fourcc = V4L2_PIX_FMT_YVU410, ++ .depth = 9, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:2:0, planar, Y-Cr-Cb", ++ .fourcc = V4L2_PIX_FMT_YVU420, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:1:0, planar, Y-Cb-Cr", ++ .fourcc = V4L2_PIX_FMT_YUV410, ++ .depth = 9, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:2:0, planar, Y-Cb-Cr", ++ .fourcc = V4L2_PIX_FMT_YUV420, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#ifdef V4L2_PIX_FMT_YUV422P ++ { ++ .name = "16 bpp YVU422 planar", ++ .fourcc = V4L2_PIX_FMT_YUV422P, ++ .depth = 16, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_YUV422P */ ++#ifdef V4L2_PIX_FMT_YUV411P ++ { ++ .name = "16 bpp YVU411 planar", ++ .fourcc = V4L2_PIX_FMT_YUV411P, ++ .depth = 16, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_YUV411P */ ++#ifdef V4L2_PIX_FMT_Y41P ++ { ++ .name = "12 bpp YUV 4:1:1", ++ .fourcc = V4L2_PIX_FMT_Y41P, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_Y41P */ ++#ifdef V4L2_PIX_FMT_NV12 ++ { ++ .name = "12 bpp Y/CbCr 4:2:0 ", ++ .fourcc = V4L2_PIX_FMT_NV12, ++ 
.depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_NV12 */ ++ ++/* here come the compressed formats */ ++ ++#ifdef V4L2_PIX_FMT_MJPEG ++ { ++ .name = "Motion-JPEG", ++ .fourcc = V4L2_PIX_FMT_MJPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MJPEG */ ++#ifdef V4L2_PIX_FMT_JPEG ++ { ++ .name = "JFIF JPEG", ++ .fourcc = V4L2_PIX_FMT_JPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_JPEG */ ++#ifdef V4L2_PIX_FMT_DV ++ { ++ .name = "DV1394", ++ .fourcc = V4L2_PIX_FMT_DV, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_DV */ ++#ifdef V4L2_PIX_FMT_MPEG ++ { ++ .name = "MPEG-1/2/4 Multiplexed", ++ .fourcc = V4L2_PIX_FMT_MPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG */ ++#ifdef V4L2_PIX_FMT_H264 ++ { ++ .name = "H264 with start codes", ++ .fourcc = V4L2_PIX_FMT_H264, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264 */ ++#ifdef V4L2_PIX_FMT_H264_NO_SC ++ { ++ .name = "H264 without start codes", ++ .fourcc = V4L2_PIX_FMT_H264_NO_SC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264_NO_SC */ ++#ifdef V4L2_PIX_FMT_H264_MVC ++ { ++ .name = "H264 MVC", ++ .fourcc = V4L2_PIX_FMT_H264_MVC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264_MVC */ ++#ifdef V4L2_PIX_FMT_H263 ++ { ++ .name = "H263", ++ .fourcc = V4L2_PIX_FMT_H263, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H263 */ ++#ifdef V4L2_PIX_FMT_MPEG1 ++ { ++ .name = "MPEG-1 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG1, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG1 */ ++#ifdef V4L2_PIX_FMT_MPEG2 ++ { ++ .name = "MPEG-2 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG2, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG2 */ ++#ifdef V4L2_PIX_FMT_MPEG4 ++ { ++ .name = "MPEG-4 part 2 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG4, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG4 */ ++#ifdef V4L2_PIX_FMT_XVID ++ { ++ .name = "Xvid", ++ .fourcc = V4L2_PIX_FMT_XVID, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_XVID */ ++#ifdef V4L2_PIX_FMT_VC1_ANNEX_G ++ { ++ .name = "SMPTE 421M Annex G compliant stream", ++ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_G, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VC1_ANNEX_G */ ++#ifdef V4L2_PIX_FMT_VC1_ANNEX_L ++ { ++ .name = "SMPTE 421M Annex L compliant stream", ++ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_L, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VC1_ANNEX_L */ ++#ifdef V4L2_PIX_FMT_VP8 ++ { ++ .name = "VP8", ++ .fourcc = V4L2_PIX_FMT_VP8, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VP8 */ ++#ifdef V4L2_PIX_FMT_VP9 ++ { ++ .name = "VP9", ++ .fourcc = V4L2_PIX_FMT_VP9, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VP9 */ ++#ifdef V4L2_PIX_FMT_HEVC ++ { ++ .name = "HEVC", ++ .fourcc = V4L2_PIX_FMT_HEVC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_HEVC */ ++}; +-- +2.40.0.rc2 diff --git a/patches/0003-bore.patch b/patches/0003-bore.patch index 9e5bc88..2ba8854 100644 --- a/patches/0003-bore.patch +++ b/patches/0003-bore.patch @@ -1,23 +1,22 @@ -From 
f169eabeb1ba8f339ab9bebec8d503c70c5f5879 Mon Sep 17 00:00:00 2001 +From e016cce088886f56617becc8fcc598a0114e4faa Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 17 Feb 2023 15:39:23 +0100 -Subject: [PATCH] bore-cachy +Date: Sat, 11 Mar 2023 18:44:19 +0100 +Subject: [PATCH] bore-eevdf Signed-off-by: Peter Jung --- - include/linux/sched.h | 5 ++ - init/Kconfig | 20 ++++++ - kernel/sched/core.c | 29 +++++++++ - kernel/sched/debug.c | 3 + - kernel/sched/fair.c | 132 +++++++++++++++++++++++++++++++++++++++- - kernel/sched/features.h | 4 ++ - 6 files changed, 190 insertions(+), 3 deletions(-) + include/linux/sched.h | 5 ++ + init/Kconfig | 20 +++++++ + kernel/sched/core.c | 29 ++++++++++ + kernel/sched/debug.c | 3 + + kernel/sched/fair.c | 124 +++++++++++++++++++++++++++++++++++++++++- + 5 files changed, 180 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h -index df219c7cd6aa..a3538eacb095 100644 +index 764df627c243..f912da35db34 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -556,6 +556,11 @@ struct sched_entity { +@@ -558,6 +558,11 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; @@ -26,11 +25,11 @@ index df219c7cd6aa..a3538eacb095 100644 + u64 burst_time; + u8 burst_score; +#endif // CONFIG_SCHED_BORE + s64 lag; + u64 slice; - u64 nr_migrations; - u64 prev_sleep_sum_runtime; diff --git a/init/Kconfig b/init/Kconfig -index 85a602dba878..bc69f062ca76 100644 +index 748a9491ca12..d10f1e6257cd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1318,6 +1318,26 @@ config CHECKPOINT_RESTORE @@ -61,10 +60,10 @@ index 85a602dba878..bc69f062ca76 100644 bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 919edb034108..fd52870a002f 100644 +index 9db5f9ec9022..1f1e1f586407 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4420,6 +4420,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) +@@ -4418,6 +4418,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } @@ -86,16 +85,16 @@ index 919edb034108..fd52870a002f 100644 /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. -@@ -4438,6 +4453,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4434,6 +4449,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.vruntime = 0; - p->se.dur_avg = 0; - p->se.prev_sleep_sum_runtime = 0; +#ifdef CONFIG_SCHED_BORE + p->se.burst_time = 0; +#endif // CONFIG_SCHED_BORE - INIT_LIST_HEAD(&p->se.group_node); - RB_CLEAR_NODE(&p->se.latency_node); - + p->se.dur_avg = 0; + p->se.prev_sleep_sum_runtime = 0; + p->se.lag = 0; @@ -4664,6 +4682,10 @@ late_initcall(sched_core_sysctl_init); int sched_fork(unsigned long clone_flags, struct task_struct *p) { @@ -107,7 +106,7 @@ index 919edb034108..fd52870a002f 100644 /* * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external -@@ -9154,6 +9176,9 @@ void __init init_idle(struct task_struct *idle, int cpu) +@@ -9153,6 +9175,9 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->__state = TASK_RUNNING; idle->se.exec_start = sched_clock(); @@ -117,22 +116,22 @@ index 919edb034108..fd52870a002f 100644 /* * PF_KTHREAD should already be set at this point; regardless, make it * look like a proper per-CPU kthread. 
-@@ -9821,6 +9846,10 @@ void __init sched_init(void) +@@ -9820,6 +9845,10 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_SCHED_BORE -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 1.7.10 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification for 1.7-eevdf2 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 177934290ec4..2f40a238cdad 100644 +index fe9edfa43f65..3672df7c1f6a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c -@@ -547,6 +547,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -551,6 +551,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); @@ -143,7 +142,7 @@ index 177934290ec4..2f40a238cdad 100644 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 5ef893ce5734..590adb9a3e37 100644 +index c40b775452bc..1e4ca5419a11 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -156,14 +155,14 @@ index 5ef893ce5734..590adb9a3e37 100644 */ #include #include -@@ -140,6 +143,16 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +@@ -141,6 +144,16 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SCHED_BORE +unsigned int __read_mostly sched_bore = 1; +unsigned int __read_mostly sched_burst_penalty_scale = 1280; -+unsigned int __read_mostly sched_burst_granularity = 12; ++unsigned int __read_mostly sched_burst_granularity = 6; +unsigned int __read_mostly sched_burst_smoothness = 2; +static int three = 3; +static int sixty_four = 64; @@ -173,7 +172,7 @@ index 5ef893ce5734..590adb9a3e37 100644 int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { -@@ -203,6 +216,44 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +@@ -204,6 +217,44 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { @@ -185,7 +184,7 @@ index 5ef893ce5734..590adb9a3e37 100644 + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, -+ .extra2 = &three, ++ .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_burst_penalty_scale", @@ -218,7 +217,7 @@ index 5ef893ce5734..590adb9a3e37 100644 { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, -@@ -978,6 +1029,39 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) +@@ -1182,6 +1233,39 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ @@ -258,21 +257,21 @@ index 5ef893ce5734..590adb9a3e37 100644 /* * Update the current task's runtime statistics. 
*/ -@@ -1007,6 +1091,13 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -1211,6 +1295,13 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); +#ifdef CONFIG_SCHED_BORE + curr->burst_time += delta_exec; + update_burst_score(curr); -+ if (sched_bore & 1) ++ if (sched_bore) + curr->vruntime += calc_delta_fair_bscale(delta_exec, curr); + else +#endif // CONFIG_SCHED_BORE curr->vruntime += calc_delta_fair(delta_exec, curr); - update_min_vruntime(cfs_rq); - -@@ -5057,6 +5148,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + /* + * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i +@@ -5283,6 +5374,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -284,7 +283,7 @@ index 5ef893ce5734..590adb9a3e37 100644 static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); -@@ -5101,7 +5197,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +@@ -5330,7 +5426,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) se = second; } @@ -299,7 +298,7 @@ index 5ef893ce5734..590adb9a3e37 100644 /* * Someone really wants this to run. If it's not unfair, run it. */ -@@ -6394,6 +6496,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6615,6 +6717,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); for_each_sched_entity(se) { @@ -309,7 +308,7 @@ index 5ef893ce5734..590adb9a3e37 100644 cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); -@@ -7856,7 +7961,12 @@ static unsigned long wakeup_gran(struct sched_entity *se) +@@ -8070,7 +8175,12 @@ static unsigned long wakeup_gran(struct sched_entity *se) * */ static int @@ -321,12 +320,11 @@ index 5ef893ce5734..590adb9a3e37 100644 +#endif // CONFIG_SCHED_BORE { s64 gran, vdiff = curr->vruntime - se->vruntime; - s64 offset = wakeup_latency_gran(curr, se); -@@ -7876,12 +7986,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) - * chance to preempt current. - */ - gran = min_t(s64, gran, get_latency_max()); -- + +@@ -8078,11 +8188,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) + return -1; + + gran = wakeup_gran(se); +#ifdef CONFIG_SCHED_BORE + if (do_scale) gran = burst_scale(gran, se); +#endif // CONFIG_SCHED_BORE @@ -344,21 +342,7 @@ index 5ef893ce5734..590adb9a3e37 100644 static void set_last_buddy(struct sched_entity *se) { -@@ -7981,7 +8099,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ - return; - - update_curr(cfs_rq_of(se)); -- if (wakeup_preempt_entity(se, pse) == 1) { -+#ifdef CONFIG_SCHED_BORE -+ if (wakeup_preempt_entity_bscale(se, pse, sched_bore & 2) == 1) -+#else // CONFIG_SCHED_BORE -+ if (wakeup_preempt_entity(se, pse) == 1) -+#endif // CONFIG_SCHED_BORE -+ { - /* - * Bias pick_next to pick the sched entity that is - * triggering this preemption. -@@ -8217,6 +8340,9 @@ static void yield_task_fair(struct rq *rq) +@@ -8430,6 +8549,9 @@ static void yield_task_fair(struct rq *rq) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -368,21 +352,5 @@ index 5ef893ce5734..590adb9a3e37 100644 /* * Are we the only task in the tree? 
-diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index efdc29c42161..0f28637ce1aa 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -4,7 +4,11 @@ - * them to run sooner, but does not allow tons of sleepers to - * rip the spread apart. - */ -+#ifdef CONFIG_SCHED_BORE -+SCHED_FEAT(GENTLE_FAIR_SLEEPERS, false) -+#else // CONFIG_SCHED_BORE - SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) -+#endif // CONFIG_SCHED_BORE - - /* - * Place new tasks ahead so that they do not starve already running -- -2.39.2 +2.40.0.rc2 diff --git a/patches/0004-eevdf.patch b/patches/0004-eevdf.patch new file mode 100644 index 0000000..11213cb --- /dev/null +++ b/patches/0004-eevdf.patch @@ -0,0 +1,1326 @@ +From b6d3ec3be2639fe928a09b558e979c36b41ea63b Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sat, 11 Mar 2023 18:42:39 +0100 +Subject: [PATCH] EEVDF + +Ever since looking at the latency-nice patches, I've wondered if EEVDF would +not make more sense, and I did point Vincent at some older patches I had for +that (which is here his augmented rbtree thing comes from). + +Also, since I really dislike the dual tree, I also figured we could dynamically +switch between an augmented tree and not (and while I have code for that, +that's not included in this posting because with the current results I don't +think we actually need this). + +Anyway, since I'm somewhat under the weather, I spend last week desperately +trying to connect a small cluster of neurons in defiance of the snot overlord +and bring back the EEVDF patches from the dark crypts where they'd been +gathering cobwebs for the past 13 odd years. + +By friday they worked well enough, and this morning (because obviously I forgot +the weekend is ideal to run benchmarks) I ran a bunch of hackbenck, netperf, +tbench and sysbench -- there's a bunch of wins and losses, but nothing that +indicates a total fail. + +( in fact, some of the schbench results seem to indicate EEVDF schedules a lot + more consistent than CFS and has a bunch of latency wins ) + +( hackbench also doesn't show the augmented tree and generally more expensive + pick to be a loss, in fact it shows a slight win here ) + + hackbech load + cyclictest --policy other results: + + EEVDF CFS + + # Min Latencies: 00053 + LNICE(19) # Avg Latencies: 04350 + # Max Latencies: 76019 + + # Min Latencies: 00052 00053 + LNICE(0) # Avg Latencies: 00690 00687 + # Max Latencies: 14145 13913 + + # Min Latencies: 00019 + LNICE(-19) # Avg Latencies: 00261 + # Max Latencies: 05642 + +The nice -19 numbers aren't as pretty as Vincent's, but at the end I was going +cross-eyed from staring at tree prints and I just couldn't figure out where it +was going side-ways. + +There's definitely more benchmarking/tweaking to be done (0-day already +reported a stress-ng loss), but if we can pull this off we can delete a whole +much of icky heuristics code. EEVDF is a much better defined policy than what +we currently have. 
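
The pick rule this patch implements can be illustrated without any kernel context. Below is a minimal user-space sketch under simplifying assumptions (a flat array instead of the augmented rbtree, floating point instead of fixed-point weights, made-up numbers); it only demonstrates the two criteria spelled out later in the patch: a task is eligible while its lag is non-negative (v_i <= V, with V the load-weighted average vruntime), and among eligible tasks the earliest virtual deadline v_i + r_i/w_i wins. It is not the kernel code.

/*
 * Toy model of the EEVDF pick (illustration only; the real patch uses an
 * augmented rbtree and integer arithmetic).
 */
#include <stddef.h>
#include <stdio.h>

struct toy_se {
	double vruntime;	/* v_i */
	double weight;		/* w_i */
	double slice;		/* requested service r_i */
};

/* V = \Sum v_i * w_i / \Sum w_i */
static double toy_avg_vruntime(const struct toy_se *q, size_t n)
{
	double num = 0.0, den = 0.0;
	size_t i;

	for (i = 0; i < n; i++) {
		num += q[i].vruntime * q[i].weight;
		den += q[i].weight;
	}
	return den ? num / den : 0.0;
}

static const struct toy_se *toy_pick_eevdf(const struct toy_se *q, size_t n)
{
	double V = toy_avg_vruntime(q, n);
	const struct toy_se *best = NULL;
	size_t i;

	for (i = 0; i < n; i++) {
		double deadline = q[i].vruntime + q[i].slice / q[i].weight;

		if (q[i].vruntime > V)	/* lag < 0: not eligible */
			continue;
		if (!best ||
		    deadline < best->vruntime + best->slice / best->weight)
			best = &q[i];
	}
	return best;
}

int main(void)
{
	/* three runnable tasks; the numbers are arbitrary */
	struct toy_se q[] = {
		{ .vruntime = 100.0, .weight = 1.0, .slice = 3.0 },
		{ .vruntime =  98.0, .weight = 2.0, .slice = 6.0 },
		{ .vruntime = 105.0, .weight = 1.0, .slice = 1.0 },
	};
	const struct toy_se *se = toy_pick_eevdf(q, sizeof(q) / sizeof(q[0]));

	printf("picked vruntime=%.1f\n", se ? se->vruntime : -1.0);
	return 0;
}

Here V = 401/4 = 100.25, so the task at vruntime 105 is skipped even though it has the shortest deadline (it is not yet owed service), and the pick falls to the eligible task with deadline 98 + 6/2 = 101.
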
+ +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/cgroup-v2.rst | 10 + + include/linux/rbtree_augmented.h | 26 ++ + include/linux/sched.h | 8 + + include/linux/sched/prio.h | 27 ++ + include/uapi/linux/sched.h | 4 +- + include/uapi/linux/sched/types.h | 19 + + init/init_task.c | 1 + + kernel/sched/core.c | 66 ++++ + kernel/sched/debug.c | 39 +- + kernel/sched/fair.c | 486 ++++++++++++++++++++---- + kernel/sched/features.h | 10 +- + kernel/sched/sched.h | 12 + + tools/include/uapi/linux/sched.h | 4 +- + 13 files changed, 614 insertions(+), 98 deletions(-) + +diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst +index 74cec76be9f2..2e511d4a4c6a 100644 +--- a/Documentation/admin-guide/cgroup-v2.rst ++++ b/Documentation/admin-guide/cgroup-v2.rst +@@ -1118,6 +1118,16 @@ All time durations are in microseconds. + values similar to the sched_setattr(2). This maximum utilization + value is used to clamp the task specific maximum utilization clamp. + ++ cpu.latency.nice ++ A read-write single value file which exists on non-root ++ cgroups. The default is "0". ++ ++ The nice value is in the range [-20, 19]. ++ ++ This interface file allows reading and setting latency using the ++ same values used by sched_setattr(2). The latency_nice of a group is ++ used to limit the impact of the latency_nice of a task outside the ++ group. + + + Memory +diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h +index d1c53e9d8c75..a78e692a9ff5 100644 +--- a/include/linux/rbtree_augmented.h ++++ b/include/linux/rbtree_augmented.h +@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node, + rb_insert_augmented(node, &root->rb_root, augment); + } + ++static __always_inline struct rb_node * ++rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, ++ bool (*less)(struct rb_node *, const struct rb_node *), ++ const struct rb_augment_callbacks *augment) ++{ ++ struct rb_node **link = &tree->rb_root.rb_node; ++ struct rb_node *parent = NULL; ++ bool leftmost = true; ++ ++ while (*link) { ++ parent = *link; ++ if (less(node, parent)) { ++ link = &parent->rb_left; ++ } else { ++ link = &parent->rb_right; ++ leftmost = false; ++ } ++ } ++ ++ rb_link_node(node, parent, link); ++ augment->propagate(parent, NULL); /* suboptimal */ ++ rb_insert_augmented_cached(node, tree, leftmost, augment); ++ ++ return leftmost ? 
node : NULL; ++} ++ + /* + * Template for declaring augmented rbtree callbacks (generic case) + * +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 28ce1be0ba47..764df627c243 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -548,6 +548,9 @@ struct sched_entity { + /* For load-balancing: */ + struct load_weight load; + struct rb_node run_node; ++ u64 deadline; ++ u64 min_deadline; ++ + struct list_head group_node; + unsigned int on_rq; + +@@ -555,6 +558,8 @@ struct sched_entity { + u64 sum_exec_runtime; + u64 vruntime; + u64 prev_sum_exec_runtime; ++ s64 lag; ++ u64 slice; + + u64 nr_migrations; + u64 prev_sleep_sum_runtime; +@@ -571,6 +576,8 @@ struct sched_entity { + /* cached value of my_q->h_nr_running */ + unsigned long runnable_weight; + #endif ++ /* preemption offset in ns */ ++ long latency_offset; + + #ifdef CONFIG_SMP + /* +@@ -787,6 +794,7 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++ int latency_prio; + + struct sched_entity se; + struct sched_rt_entity rt; +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index ab83d85e1183..be79503d86af 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -42,4 +42,31 @@ static inline long rlimit_to_nice(long prio) + return (MAX_NICE - prio + 1); + } + ++/* ++ * Latency nice is meant to provide scheduler hints about the relative ++ * latency requirements of a task with respect to other tasks. ++ * Thus a task with latency_nice == 19 can be hinted as the task with no ++ * latency requirements, in contrast to the task with latency_nice == -20 ++ * which should be given priority in terms of lower latency. ++ */ ++#define MAX_LATENCY_NICE 19 ++#define MIN_LATENCY_NICE -20 ++ ++#define LATENCY_NICE_WIDTH \ ++ (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1) ++ ++/* ++ * Default tasks should be treated as a task with latency_nice = 0. ++ */ ++#define DEFAULT_LATENCY_NICE 0 ++#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2) ++ ++/* ++ * Convert user-nice values [ -20 ... 0 ... 19 ] ++ * to static latency [ 0..39 ], ++ * and back. ++ */ ++#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO) ++#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO) ++ + #endif /* _LINUX_SCHED_PRIO_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..b2e932c25be6 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -132,6 +132,7 @@ struct clone_args { + #define SCHED_FLAG_KEEP_PARAMS 0x10 + #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 + #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 ++#define SCHED_FLAG_LATENCY_NICE 0x80 + + #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) +@@ -143,6 +144,7 @@ struct clone_args { + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ +- SCHED_FLAG_UTIL_CLAMP) ++ SCHED_FLAG_UTIL_CLAMP | \ ++ SCHED_FLAG_LATENCY_NICE) + + #endif /* _UAPI_LINUX_SCHED_H */ +diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h +index f2c4589d4dbf..db1e8199e8c8 100644 +--- a/include/uapi/linux/sched/types.h ++++ b/include/uapi/linux/sched/types.h +@@ -10,6 +10,7 @@ struct sched_param { + + #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ + #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ ++#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */ + + /* + * Extended scheduling parameters data structure. 
+@@ -98,6 +99,22 @@ struct sched_param { + * scheduled on a CPU with no more capacity than the specified value. + * + * A task utilization boundary can be reset by setting the attribute to -1. ++ * ++ * Latency Tolerance Attributes ++ * =========================== ++ * ++ * A subset of sched_attr attributes allows to specify the relative latency ++ * requirements of a task with respect to the other tasks running/queued in the ++ * system. ++ * ++ * @ sched_latency_nice task's latency_nice value ++ * ++ * The latency_nice of a task can have any value in a range of ++ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE]. ++ * ++ * A task with latency_nice with the value of LATENCY_NICE_MIN can be ++ * taken for a task requiring a lower latency as opposed to the task with ++ * higher latency_nice. + */ + struct sched_attr { + __u32 size; +@@ -120,6 +137,8 @@ struct sched_attr { + __u32 sched_util_min; + __u32 sched_util_max; + ++ /* latency requirement hints */ ++ __s32 sched_latency_nice; + }; + + #endif /* _UAPI_LINUX_SCHED_TYPES_H */ +diff --git a/init/init_task.c b/init/init_task.c +index ff6c4b9bfe6b..071deff8dbd1 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -78,6 +78,7 @@ struct task_struct init_task + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++ .latency_prio = DEFAULT_LATENCY_PRIO, + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .user_cpus_ptr = NULL, +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 5237639786b7..9db5f9ec9022 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1283,6 +1283,11 @@ static void set_load_weight(struct task_struct *p, bool update_load) + } + } + ++static void set_latency_offset(struct task_struct *p) ++{ ++ p->se.latency_offset = calc_latency_offset(p->latency_prio); ++} ++ + #ifdef CONFIG_UCLAMP_TASK + /* + * Serializes updates of utilization clamp values +@@ -4431,8 +4436,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.vruntime = 0; + p->se.dur_avg = 0; + p->se.prev_sleep_sum_runtime = 0; ++ p->se.lag = 0; + INIT_LIST_HEAD(&p->se.group_node); + ++ set_latency_offset(p); ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; + #endif +@@ -4684,6 +4692,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p, false); + ++ p->latency_prio = NICE_TO_LATENCY(0); ++ set_latency_offset(p); ++ + /* + * We don't need the reset flag anymore after the fork. 
It has + * fulfilled its duty: +@@ -7446,6 +7457,15 @@ static void __setscheduler_params(struct task_struct *p, + set_load_weight(p, true); + } + ++static void __setscheduler_latency(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { ++ p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice); ++ set_latency_offset(p); ++ } ++} ++ + /* + * Check the target process has a UID that matches the current process's: + */ +@@ -7586,6 +7606,13 @@ static int __sched_setscheduler(struct task_struct *p, + return retval; + } + ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { ++ if (attr->sched_latency_nice > MAX_LATENCY_NICE) ++ return -EINVAL; ++ if (attr->sched_latency_nice < MIN_LATENCY_NICE) ++ return -EINVAL; ++ } ++ + if (pi) + cpuset_read_lock(); + +@@ -7620,6 +7647,9 @@ static int __sched_setscheduler(struct task_struct *p, + goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && ++ attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio)) ++ goto change; + + p->sched_reset_on_fork = reset_on_fork; + retval = 0; +@@ -7708,6 +7738,7 @@ static int __sched_setscheduler(struct task_struct *p, + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } ++ __setscheduler_latency(p, attr); + __setscheduler_uclamp(p, attr); + + if (queued) { +@@ -7918,6 +7949,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + ++ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) && ++ size < SCHED_ATTR_SIZE_VER2) ++ return -EINVAL; + /* + * XXX: Do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? 
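
To show what the sched_attr plumbing above means for user space, here is a hedged sketch of setting the latency hint on the calling thread. It assumes the struct layout from the types.h hunk on top of the unchanged mainline sched_attr fields, uses the raw syscall because glibc ships no sched_setattr wrapper, and the value -15 is an arbitrary choice inside the documented [-20, 19] range.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* mirrors include/uapi/linux/sched/types.h with this patch applied */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE */
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;	/* utilization clamps */
	uint32_t sched_util_max;
	int32_t  sched_latency_nice;	/* new in SCHED_ATTR_SIZE_VER2 */
};

#define SCHED_FLAG_LATENCY_NICE	0x80
#define SCHED_ATTR_SIZE_VER2	60

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = SCHED_ATTR_SIZE_VER2;
	attr.sched_policy = 0;				/* SCHED_NORMAL */
	attr.sched_flags = SCHED_FLAG_LATENCY_NICE;
	attr.sched_latency_nice = -15;			/* range [-20, 19] */

	/* pid 0 targets the calling thread */
	if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0) {
		perror("sched_setattr");
		return 1;
	}
	printf("latency_nice set to %d\n", attr.sched_latency_nice);
	return 0;
}

Reading the value back goes through sched_getattr, which the next hunk extends to report sched_latency_nice.
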
+@@ -8155,6 +8189,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; + ++ kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio); ++ + #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine +@@ -11027,6 +11063,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + { + return sched_group_set_idle(css_tg(css), idle); + } ++ ++static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css, ++ struct cftype *cft) ++{ ++ return LATENCY_TO_NICE(css_tg(css)->latency_prio); ++} ++ ++static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css, ++ struct cftype *cft, s64 nice) ++{ ++ int prio; ++ ++ if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE) ++ return -ERANGE; ++ ++ prio = NICE_TO_LATENCY(nice); ++ ++ return sched_group_set_latency(css_tg(css), prio); ++} + #endif + + static struct cftype cpu_legacy_files[] = { +@@ -11041,6 +11096,11 @@ static struct cftype cpu_legacy_files[] = { + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, ++ { ++ .name = "latency.nice", ++ .read_s64 = cpu_latency_nice_read_s64, ++ .write_s64 = cpu_latency_nice_write_s64, ++ }, + #endif + #ifdef CONFIG_CFS_BANDWIDTH + { +@@ -11258,6 +11318,12 @@ static struct cftype cpu_files[] = { + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, ++ { ++ .name = "latency.nice", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_s64 = cpu_latency_nice_read_s64, ++ .write_s64 = cpu_latency_nice_write_s64, ++ }, + #endif + #ifdef CONFIG_CFS_BANDWIDTH + { +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 8d64fba16cfe..fe9edfa43f65 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -535,9 +535,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + else + SEQ_printf(m, " %c", task_state_to_char(p)); + +- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", ++ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", + p->comm, task_pid_nr(p), + SPLIT_NS(p->se.vruntime), ++ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 
'E' : 'N', ++ SPLIT_NS(p->se.deadline), ++ SPLIT_NS(p->se.slice), ++ SPLIT_NS(p->se.sum_exec_runtime), + (long long)(p->nvcsw + p->nivcsw), + p->prio); + +@@ -580,10 +584,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) + + void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + { +- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, +- spread, rq0_min_vruntime, spread0; ++ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread; ++ struct sched_entity *last, *first; + struct rq *rq = cpu_rq(cpu); +- struct sched_entity *last; + unsigned long flags; + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -597,26 +600,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + SPLIT_NS(cfs_rq->exec_clock)); + + raw_spin_rq_lock_irqsave(rq, flags); +- if (rb_first_cached(&cfs_rq->tasks_timeline)) +- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; ++ first = __pick_first_entity(cfs_rq); ++ if (first) ++ left_vruntime = first->vruntime; + last = __pick_last_entity(cfs_rq); + if (last) +- max_vruntime = last->vruntime; ++ right_vruntime = last->vruntime; + min_vruntime = cfs_rq->min_vruntime; +- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; + raw_spin_rq_unlock_irqrestore(rq, flags); +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", +- SPLIT_NS(MIN_vruntime)); ++ ++ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", ++ SPLIT_NS(left_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", +- SPLIT_NS(max_vruntime)); +- spread = max_vruntime - MIN_vruntime; +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", +- SPLIT_NS(spread)); +- spread0 = min_vruntime - rq0_min_vruntime; +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", +- SPLIT_NS(spread0)); ++ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", ++ SPLIT_NS(avg_vruntime(cfs_rq))); ++ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", ++ SPLIT_NS(right_vruntime)); ++ spread = right_vruntime - left_vruntime; ++ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); + SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", + cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); +@@ -1044,6 +1046,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + #endif + P(policy); + P(prio); ++ P(latency_prio); + if (task_has_dl_policy(p)) { + P(dl.runtime); + P(dl.deadline); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 84254f52c56a..c40b775452bc 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -47,6 +47,7 @@ + #include + #include + #include ++#include + + #include + +@@ -619,13 +620,134 @@ static inline bool entity_before(struct sched_entity *a, + return (s64)(a->vruntime - b->vruntime) < 0; + } + ++static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ return (s64)(se->vruntime - cfs_rq->min_vruntime); ++} ++ + #define __node_2_se(node) \ + rb_entry((node), struct sched_entity, run_node) + ++/* ++ * Compute virtual time from the per-task service numbers: ++ * ++ * Fair schedulers conserve lag: \Sum lag_i = 0 ++ * ++ * lag_i = S - s_i = w_i * (V - v_i) ++ * ++ * \Sum lag_i = 0 -> \Sum w_i * (V - v_i) = V * \Sum w_i - \Sum w_i * v_i = 0 ++ * ++ * From which we solve V: ++ * ++ * \Sum v_i * w_i ++ * V = -------------- ++ * \Sum w_i ++ * ++ * However, since v_i is u64, and the multiplcation could easily overflow ++ * transform it into a relative 
form that uses smaller quantities: ++ * ++ * Substitute: v_i == (v_i - v) + v ++ * ++ * \Sum ((v_i - v) + v) * w_i \Sum (v_i - v) * w_i ++ * V = -------------------------- = -------------------- + v ++ * \Sum w_i \Sum w_i ++ * ++ * min_vruntime = v ++ * avg_vruntime = \Sum (v_i - v) * w_i ++ * cfs_rq->load = \Sum w_i ++ * ++ * Since min_vruntime is a monotonic increasing variable that closely tracks ++ * the per-task service, these deltas: (v_i - v), will be in the order of the ++ * maximal (virtual) lag induced in the system due to quantisation. ++ */ ++static void ++avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ s64 key = entity_key(cfs_rq, se); ++ cfs_rq->avg_vruntime += key * se->load.weight; ++ cfs_rq->avg_load += se->load.weight; ++} ++ ++static void ++avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ s64 key = entity_key(cfs_rq, se); ++ cfs_rq->avg_vruntime -= key * se->load.weight; ++ cfs_rq->avg_load -= se->load.weight; ++} ++ ++static inline ++void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) ++{ ++ /* ++ * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load ++ */ ++ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; ++} ++ ++u64 avg_vruntime(struct cfs_rq *cfs_rq) ++{ ++ struct sched_entity *curr = cfs_rq->curr; ++ s64 lag = cfs_rq->avg_vruntime; ++ long load = cfs_rq->avg_load; ++ ++ if (curr && curr->on_rq) { ++ lag += entity_key(cfs_rq, curr) * curr->load.weight; ++ load += curr->load.weight; ++ } ++ ++ if (load) ++ lag = div_s64(lag, load); ++ ++ return cfs_rq->min_vruntime + lag; ++} ++ ++/* ++ * Entity is eligible once it received less service than it ought to have, ++ * eg. lag >= 0. ++ * ++ * lag_i = S - s_i = w_i*(V - w_i) ++ * ++ * lag_i >= 0 -> V >= v_i ++ * ++ * \Sum (v_i - v)*w_i ++ * V = ------------------ + v ++ * \Sum w_i ++ * ++ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) ++ */ ++int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ struct sched_entity *curr = cfs_rq->curr; ++ s64 avg_vruntime = cfs_rq->avg_vruntime; ++ long avg_load = cfs_rq->avg_load; ++ ++ if (curr && curr->on_rq) { ++ avg_vruntime += entity_key(cfs_rq, curr) * curr->load.weight; ++ avg_load += curr->load.weight; ++ } ++ ++ return avg_vruntime >= entity_key(cfs_rq, se) * avg_load; ++} ++ ++static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) ++{ ++ u64 min_vruntime = cfs_rq->min_vruntime; ++ /* ++ * open coded max_vruntime() to allow updating avg_vruntime ++ */ ++ s64 delta = (s64)(vruntime - min_vruntime); ++ if (delta > 0) { ++ avg_vruntime_update(cfs_rq, delta); ++ min_vruntime = vruntime; ++ } ++ return min_vruntime; ++} ++ + static void update_min_vruntime(struct cfs_rq *cfs_rq) + { ++ struct sched_entity *se = __pick_first_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; +- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); + + u64 vruntime = cfs_rq->min_vruntime; + +@@ -636,9 +758,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + curr = NULL; + } + +- if (leftmost) { /* non-empty tree */ +- struct sched_entity *se = __node_2_se(leftmost); +- ++ if (se) { + if (!curr) + vruntime = se->vruntime; + else +@@ -647,7 +767,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + + /* ensure we never gain time by being placed backwards. 
*/ + u64_u32_store(cfs_rq->min_vruntime, +- max_vruntime(cfs_rq->min_vruntime, vruntime)); ++ __update_min_vruntime(cfs_rq, vruntime)); + } + + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -655,17 +775,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) + return entity_before(__node_2_se(a), __node_2_se(b)); + } + ++#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) ++ ++static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node) ++{ ++ if (node) { ++ struct sched_entity *rse = __node_2_se(node); ++ if (deadline_gt(min_deadline, se, rse)) ++ se->min_deadline = rse->min_deadline; ++ } ++} ++ ++/* ++ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) ++ */ ++static inline bool min_deadline_update(struct sched_entity *se, bool exit) ++{ ++ u64 old_min_deadline = se->min_deadline; ++ struct rb_node *node = &se->run_node; ++ ++ se->min_deadline = se->deadline; ++ __update_min_deadline(se, node->rb_right); ++ __update_min_deadline(se, node->rb_left); ++ ++ return se->min_deadline == old_min_deadline; ++} ++ ++RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, ++ run_node, min_deadline, min_deadline_update); ++ + /* + * Enqueue an entity into the rb-tree: + */ + static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); ++ avg_vruntime_add(cfs_rq, se); ++ se->min_deadline = se->deadline; ++ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, ++ __entity_less, &min_deadline_cb); + } + + static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); ++ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, ++ &min_deadline_cb); ++ avg_vruntime_sub(cfs_rq, se); + } + + struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +@@ -688,6 +842,101 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) + return __node_2_se(next); + } + ++static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++{ ++ struct sched_entity *left = __pick_first_entity(cfs_rq); ++ ++ /* ++ * If curr is set we have to see if its left of the leftmost entity ++ * still in the tree, provided there was anything in the tree at all. ++ */ ++ if (!left || (curr && entity_before(curr, left))) ++ left = curr; ++ ++ return left; ++} ++ ++/* ++ * Earliest Eligible Virtual Deadline First ++ * ++ * In order to provide latency guarantees for different request sizes ++ * EEVDF selects the best runnable task from two criteria: ++ * ++ * 1) the task must be eligible (must be owed service) ++ * ++ * 2) from those tasks that meet 1), we select the one ++ * with the earliest virtual deadline. ++ * ++ * We can do this in O(log n) time due to an augmented RB-tree. The ++ * tree keeps the entries sorted on service, but also functions as a ++ * heap based on the deadline by keeping: ++ * ++ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline) ++ * ++ * Which allows an EDF like search on (sub)trees. 
++ */ ++static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) ++{ ++ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; ++ struct sched_entity *curr = cfs_rq->curr; ++ struct sched_entity *best = NULL; ++ ++ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) ++ curr = NULL; ++ ++ while (node) { ++ struct sched_entity *se = __node_2_se(node); ++ ++ /* ++ * If this entity is not eligible, try the left subtree. ++ * ++ * XXX: would it be worth it to do the single division for ++ * avg_vruntime() once, instead of the multiplication ++ * in entity_eligible() O(log n) times? ++ */ ++ if (!entity_eligible(cfs_rq, se)) { ++ node = node->rb_left; ++ continue; ++ } ++ ++ /* ++ * If this entity has an earlier deadline than the previous ++ * best, take this one. If it also has the earliest deadline ++ * of its subtree, we're done. ++ */ ++ if (!best || deadline_gt(deadline, best, se)) { ++ best = se; ++ if (best->deadline == best->min_deadline) ++ break; ++ } ++ ++ /* ++ * If the earlest deadline in this subtree is in the fully ++ * eligible left half of our space, go there. ++ */ ++ if (node->rb_left && ++ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { ++ node = node->rb_left; ++ continue; ++ } ++ ++ node = node->rb_right; ++ } ++ ++ if (!best || (curr && deadline_gt(deadline, best, curr))) ++ best = curr; ++ ++ if (unlikely(!best)) { ++ struct sched_entity *left = __pick_first_entity(cfs_rq); ++ if (left) { ++ pr_err("EEVDF scheduling fail, picking leftmost\n"); ++ return left; ++ } ++ } ++ ++ return best; ++} ++ + #ifdef CONFIG_SCHED_DEBUG + struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + { +@@ -721,6 +970,14 @@ int sched_update_scaling(void) + } + #endif + ++long calc_latency_offset(int prio) ++{ ++ u32 weight = sched_prio_to_weight[prio]; ++ u64 base = sysctl_sched_min_granularity; ++ ++ return div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight); ++} ++ + /* + * delta /= w + */ +@@ -797,14 +1054,30 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) + return slice; + } + +-/* +- * We calculate the vruntime slice of a to-be-inserted task. +- * +- * vs = s/w +- */ +-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) ++static void set_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- return calc_delta_fair(sched_slice(cfs_rq, se), se); ++ if (sched_feat(EEVDF)) { ++ /* ++ * For EEVDF the virtual time slope is determined by w_i (iow. ++ * nice) while the request time r_i is determined by ++ * latency-nice. ++ */ ++ se->slice = se->latency_offset; ++ } else { ++ /* ++ * When many tasks blow up the sched_period; it is possible ++ * that sched_slice() reports unusually large results (when ++ * many tasks are very light for example). Therefore impose a ++ * maximum. ++ */ ++ se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency); ++ } ++ ++ /* ++ * vd_i = ve_i + r_i / w_i ++ */ ++ se->deadline = se->vruntime + calc_delta_fair(se->slice, se); ++ se->min_deadline = se->deadline; + } + + #include "pelt.h" +@@ -939,6 +1212,13 @@ static void update_curr(struct cfs_rq *cfs_rq) + schedstat_add(cfs_rq->exec_clock, delta_exec); + + curr->vruntime += calc_delta_fair(delta_exec, curr); ++ /* ++ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i ++ * this is probably good enough. 
++ */ ++ if ((s64)(curr->vruntime - curr->deadline) > 0) ++ set_slice(cfs_rq, curr); ++ + update_min_vruntime(cfs_rq); + + if (entity_is_task(curr)) { +@@ -3340,6 +3620,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); ++ else ++ avg_vruntime_sub(cfs_rq, se); + update_load_sub(&cfs_rq->load, se->load.weight); + } + dequeue_load_avg(cfs_rq, se); +@@ -3355,9 +3637,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + #endif + + enqueue_load_avg(cfs_rq, se); +- if (se->on_rq) ++ if (se->on_rq) { + update_load_add(&cfs_rq->load, se->load.weight); +- ++ if (cfs_rq->curr != se) ++ avg_vruntime_add(cfs_rq, se); ++ } + } + + void reweight_task(struct task_struct *p, int prio) +@@ -4669,49 +4953,49 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) + static void + place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + { +- u64 vruntime = cfs_rq->min_vruntime; +- u64 sleep_time; ++ u64 vruntime = avg_vruntime(cfs_rq); + +- /* +- * The 'current' period is already promised to the current tasks, +- * however the extra weight of the new task will slow them down a +- * little, place the new task so that it fits in the slot that +- * stays open at the end. +- */ +- if (initial && sched_feat(START_DEBIT)) +- vruntime += sched_vslice(cfs_rq, se); ++ if (sched_feat(PRESERVE_LAG)) ++ vruntime -= se->lag; + +- /* sleeps up to a single latency don't count. */ +- if (!initial) { +- unsigned long thresh; ++ if (sched_feat(FAIR_SLEEPERS)) { ++// u64 sleep_time; + +- if (se_is_idle(se)) +- thresh = sysctl_sched_min_granularity; +- else +- thresh = sysctl_sched_latency; ++ /* sleeps up to a single latency don't count. */ ++ if (!initial) { ++ unsigned long thresh = TICK_NSEC; ++ ++ if (!sched_feat(EEVDF)) { ++ if (se_is_idle(se)) ++ thresh = sysctl_sched_min_granularity; ++ else ++ thresh = sysctl_sched_latency; ++ } ++ ++ /* ++ * Halve their sleep time's effect, to allow ++ * for a gentler effect of sleepers: ++ */ ++ if (sched_feat(GENTLE_FAIR_SLEEPERS)) ++ thresh >>= 1; ++ ++ vruntime -= calc_delta_fair(thresh, se); ++ } + + /* +- * Halve their sleep time's effect, to allow +- * for a gentler effect of sleepers: ++ * Pull vruntime of the entity being placed to the base level of ++ * cfs_rq, to prevent boosting it if placed backwards. If the entity ++ * slept for a long time, don't even try to compare its vruntime with ++ * the base as it may be too far off and the comparison may get ++ * inversed due to s64 overflow. ++ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; ++ if ((s64)sleep_time < 60LL * NSEC_PER_SEC) + */ +- if (sched_feat(GENTLE_FAIR_SLEEPERS)) +- thresh >>= 1; +- +- vruntime -= thresh; ++ vruntime = max_vruntime(se->vruntime, vruntime); + } + +- /* +- * Pull vruntime of the entity being placed to the base level of +- * cfs_rq, to prevent boosting it if placed backwards. If the entity +- * slept for a long time, don't even try to compare its vruntime with +- * the base as it may be too far off and the comparison may get +- * inversed due to s64 overflow. 
+- */ +- sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; +- if ((s64)sleep_time > 60LL * NSEC_PER_SEC) +- se->vruntime = vruntime; +- else +- se->vruntime = max_vruntime(se->vruntime, vruntime); ++ se->vruntime = vruntime; ++ set_slice(cfs_rq, se); + } + + static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +@@ -4879,6 +5163,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + clear_buddies(cfs_rq, se); + ++ if (sched_feat(PRESERVE_LAG) && (flags & DEQUEUE_SLEEP)) ++ se->lag = avg_vruntime(cfs_rq) - se->vruntime; ++ + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; +@@ -4917,19 +5204,20 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + static void + check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + { +- unsigned long ideal_runtime, delta_exec; ++ unsigned long delta_exec; + struct sched_entity *se; + s64 delta; + +- /* +- * When many tasks blow up the sched_period; it is possible that +- * sched_slice() reports unusually large results (when many tasks are +- * very light for example). Therefore impose a maximum. +- */ +- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); ++ if (sched_feat(EEVDF)) { ++ if (pick_eevdf(cfs_rq) != curr) ++ goto preempt; ++ ++ return; ++ } + + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; +- if (delta_exec > ideal_runtime) { ++ if (delta_exec > curr->slice) { ++preempt: + resched_curr(rq_of(cfs_rq)); + /* + * The current task ran long enough, ensure it doesn't get +@@ -4953,7 +5241,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + if (delta < 0) + return; + +- if (delta > ideal_runtime) ++ if (delta > curr->slice) + resched_curr(rq_of(cfs_rq)); + } + +@@ -5008,17 +5296,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + static struct sched_entity * + pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + { +- struct sched_entity *left = __pick_first_entity(cfs_rq); +- struct sched_entity *se; ++ struct sched_entity *left, *se; + +- /* +- * If curr is set we have to see if its left of the leftmost entity +- * still in the tree, provided there was anything in the tree at all. +- */ +- if (!left || (curr && entity_before(curr, left))) +- left = curr; ++ if (sched_feat(EEVDF)) { ++ /* ++ * Enabling NEXT_BUDDY will affect latency but not fairness. 
++ */ ++ if (sched_feat(NEXT_BUDDY) && ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) ++ return cfs_rq->next; + +- se = left; /* ideally we run the leftmost entity */ ++ return pick_eevdf(cfs_rq); ++ } ++ ++ se = left = pick_cfs(cfs_rq, curr); + + /* + * Avoid running the skip buddy, if running something else can +@@ -6113,13 +6404,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} + static void hrtick_start_fair(struct rq *rq, struct task_struct *p) + { + struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); + + SCHED_WARN_ON(task_rq(p) != rq); + + if (rq->cfs.h_nr_running > 1) { +- u64 slice = sched_slice(cfs_rq, se); + u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; ++ u64 slice = se->slice; + s64 delta = slice - ran; + + if (delta < 0) { +@@ -7891,7 +8181,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + if (cse_is_idle != pse_is_idle) + return; + +- update_curr(cfs_rq_of(se)); ++ cfs_rq = cfs_rq_of(se); ++ update_curr(cfs_rq); ++ ++ if (sched_feat(EEVDF)) { ++ /* ++ * XXX pick_eevdf(cfs_rq) != se ? ++ */ ++ if (pick_eevdf(cfs_rq) == pse) ++ goto preempt; ++ ++ return; ++ } ++ + if (wakeup_preempt_entity(se, pse) == 1) { + /* + * Bias pick_next to pick the sched entity that is +@@ -8137,7 +8439,7 @@ static void yield_task_fair(struct rq *rq) + + clear_buddies(cfs_rq, se); + +- if (curr->policy != SCHED_BATCH) { ++ if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. +@@ -8150,6 +8452,8 @@ static void yield_task_fair(struct rq *rq) + */ + rq_clock_skip_update(rq); + } ++ if (sched_feat(EEVDF)) ++ se->deadline += calc_delta_fair(se->slice, se); + + set_skip_buddy(se); + } +@@ -11902,8 +12206,8 @@ static void rq_offline_fair(struct rq *rq) + static inline bool + __entity_slice_used(struct sched_entity *se, int min_nr_tasks) + { +- u64 slice = sched_slice(cfs_rq_of(se), se); + u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; ++ u64 slice = se->slice; + + return (rtime * min_nr_tasks > slice); + } +@@ -12330,6 +12634,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) + goto err; + + tg->shares = NICE_0_LOAD; ++ tg->latency_prio = DEFAULT_LATENCY_PRIO; + + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + +@@ -12428,6 +12733,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + } + + se->my_q = cfs_rq; ++ ++ se->latency_offset = calc_latency_offset(tg->latency_prio); ++ + /* guarantee group entities always have weight */ + update_load_set(&se->load, NICE_0_LOAD); + se->parent = parent; +@@ -12558,6 +12866,34 @@ int sched_group_set_idle(struct task_group *tg, long idle) + return 0; + } + ++int sched_group_set_latency(struct task_group *tg, int prio) ++{ ++ long latency_offset; ++ int i; ++ ++ if (tg == &root_task_group) ++ return -EINVAL; ++ ++ mutex_lock(&shares_mutex); ++ ++ if (tg->latency_prio == prio) { ++ mutex_unlock(&shares_mutex); ++ return 0; ++ } ++ ++ tg->latency_prio = prio; ++ latency_offset = calc_latency_offset(prio); ++ ++ for_each_possible_cpu(i) { ++ struct sched_entity *se = tg->se[i]; ++ ++ WRITE_ONCE(se->latency_offset, latency_offset); ++ } ++ ++ mutex_unlock(&shares_mutex); ++ return 0; ++} ++ + #else /* CONFIG_FAIR_GROUP_SCHED */ + + void free_fair_sched_group(struct task_group *tg) { } +@@ -12584,7 +12920,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task + * idle runqueue: + */ + if 
(rq->cfs.load.weight) +- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); ++ rr_interval = NS_TO_JIFFIES(se->slice); + + return rr_interval; + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index efdc29c42161..49c7e6fa4c71 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -1,16 +1,18 @@ + /* SPDX-License-Identifier: GPL-2.0 */ ++ + /* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ ++SCHED_FEAT(FAIR_SLEEPERS, false) + SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) + + /* +- * Place new tasks ahead so that they do not starve already running +- * tasks ++ * Using the avg_vruntime, do the right thing and preserve lag ++ * across sleep+wake cycles. + */ +-SCHED_FEAT(START_DEBIT, true) ++SCHED_FEAT(PRESERVE_LAG, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +@@ -102,3 +104,5 @@ SCHED_FEAT(LATENCY_WARN, false) + + SCHED_FEAT(ALT_PERIOD, true) + SCHED_FEAT(BASE_SLICE, true) ++ ++SCHED_FEAT(EEVDF, true) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 9e8bb6278604..fe5af7aaa931 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -378,6 +378,8 @@ struct task_group { + + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; ++ /* latency priority of the group. */ ++ int latency_prio; + + #ifdef CONFIG_SMP + /* +@@ -488,6 +490,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); + + extern int sched_group_set_idle(struct task_group *tg, long idle); + ++extern int sched_group_set_latency(struct task_group *tg, int prio); ++ + #ifdef CONFIG_SMP + extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +@@ -554,6 +558,9 @@ struct cfs_rq { + unsigned int idle_nr_running; /* SCHED_IDLE */ + unsigned int idle_h_nr_running; /* SCHED_IDLE */ + ++ s64 avg_vruntime; ++ u64 avg_load; ++ + u64 exec_clock; + u64 min_vruntime; + #ifdef CONFIG_SCHED_CORE +@@ -2478,6 +2485,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + #endif + ++extern long calc_latency_offset(int prio); ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +@@ -3251,4 +3260,7 @@ static inline void update_current_exec_runtime(struct task_struct *curr, + cgroup_account_cputime(curr, delta_exec); + } + ++extern u64 avg_vruntime(struct cfs_rq *cfs_rq); ++extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); ++ + #endif /* _KERNEL_SCHED_SCHED_H */ +diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h +index 3bac0a8ceab2..b2e932c25be6 100644 +--- a/tools/include/uapi/linux/sched.h ++++ b/tools/include/uapi/linux/sched.h +@@ -132,6 +132,7 @@ struct clone_args { + #define SCHED_FLAG_KEEP_PARAMS 0x10 + #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 + #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 ++#define SCHED_FLAG_LATENCY_NICE 0x80 + + #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) +@@ -143,6 +144,7 @@ struct clone_args { + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ +- SCHED_FLAG_UTIL_CLAMP) ++ SCHED_FLAG_UTIL_CLAMP | \ ++ SCHED_FLAG_LATENCY_NICE) + + #endif /* _UAPI_LINUX_SCHED_H */ +-- +2.40.0.rc2 diff --git a/patches/0004-hdr.patch b/patches/0005-hdr.patch similarity index 100% rename from patches/0004-hdr.patch rename to patches/0005-hdr.patch diff --git a/scripts/patch.sh 
b/scripts/patch.sh index 0948c18..fb03347 100755 --- a/scripts/patch.sh +++ b/scripts/patch.sh @@ -7,10 +7,12 @@ echo "Pika Kernel - Applying patches" patch -Np1 < "../patches/0001-cachy-all.patch" # orig patch from cachy - 0001-Add-latency-priority-for-CFS-class.patch patch -Np1 < "../patches/0002-cfs-nice.patch" -# orig patch from cachy - 0001-bore-cachy.patch -patch -Np1 < "../patches/0003-bore.patch" +# orig patch from cachy +patch -Np1 < "../patches/0003-eevdf.patch" +# orig patch from cachy - 0001-bore-eevdf.patch +patch -Np1 < "../patches/0004-bore.patch" # HDR patch - from cachy (but they deleted it) -patch -Np1 < "../patches/0004-hdr.patch" +patch -Np1 < "../patches/0005-hdr.patch" # Nobara patches are here: https://github.com/sammilucia/nobara-kernel-fork # Extra Leigon laptop goodies patch -Np1 < "../patches/0001-Add-legion-laptop-v0.1.patch" diff --git a/scripts/source.sh b/scripts/source.sh index d25ac4f..493c981 100755 --- a/scripts/source.sh +++ b/scripts/source.sh @@ -2,7 +2,7 @@ echo "Pika Kernel - Getting source" -wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.2.1.tar.gz -tar -zxf ./linux-6.2.1.tar.gz +wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.2.5.tar.gz +tar -zxf ./linux-6.2.5.tar.gz -cd linux-6.2.1 \ No newline at end of file +cd linux-6.2.5 \ No newline at end of file